ModelZoo / ResNet50_tensorflow / Commit 2e9bb539

Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE
authored Feb 25, 2021 by stephenwu
parents 7bae5317 8fba84f8

Changes: 121. Showing 20 changed files with 2128 additions and 110 deletions (+2128 -110).
official/nlp/modeling/ops/__init__.py  +2  -0
official/nlp/tools/export_tfhub.py  +219  -0
official/nlp/tools/export_tfhub_lib.py  +473  -0
official/nlp/tools/export_tfhub_lib_test.py  +850  -0
official/nlp/transformer/transformer.py  +2  -2
official/nlp/transformer/translate.py  +2  -8
official/nlp/xlnet/preprocess_pretrain_data.py  +63  -56
official/utils/docs/build_docs.py  +12  -14
official/vision/beta/MODEL_GARDEN.md  +27  -17
official/vision/beta/configs/backbones.py  +2  -0
official/vision/beta/configs/common.py  +8  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml  +17  -8
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml  +17  -5
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml  +62  -0
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml  +62  -0
official/nlp/modeling/ops/__init__.py

@@ -14,3 +14,5 @@
# ==============================================================================
"""Ops package definition."""
from official.nlp.modeling.ops.beam_search import sequence_beam_search
from official.nlp.modeling.ops.segment_extractor import get_next_sentence_labels
from official.nlp.modeling.ops.segment_extractor import get_sentence_order_labels
official/nlp/tools/export_tfhub.py  0 → 100644
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Exports a BERT-like encoder and its preprocessing as SavedModels for TF Hub.
This tool creates preprocessor and encoder SavedModels suitable for uploading
to https://tfhub.dev that implement the preprocessor and encoder APIs defined
at https://www.tensorflow.org/hub/common_saved_model_apis/text.
For a full usage guide, see
https://github.com/tensorflow/models/blob/master/official/nlp/docs/tfhub.md
Minimal usage examples:
1) Exporting an Encoder from checkpoint and config.
```
export_tfhub \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
An --encoder_config_file can specify encoder types other than BERT.
For BERT, a --bert_config_file in the legacy JSON format can be passed instead.
Flag --vocab_file (and flag --do_lower_case, whose default value is guessed
from the vocab_file path) capture how BertTokenizer was used in pre-training.
Use flag --sp_model_file instead if SentencepieceTokenizer was used.
Changing --export_type to model_with_mlm additionally creates an `.mlm`
subobject on the exported SavedModel that can be called to produce
the logits of the Masked Language Model task from pretraining.
The help string for flag --model_checkpoint_path explains the checkpoint
formats required for each --export_type.
2) Exporting a preprocessor SavedModel
```
export_tfhub \
--vocab_file ${BERT_DIR:?}/vocab.txt \
--export_type preprocessing --export_path /tmp/bert_preprocessing
```
Be sure to use flag values that match the encoder and how it has been
pre-trained (see above for --vocab_file vs --sp_model_file).
If your encoder has been trained with text preprocessing for which tfhub.dev
already has SavedModel, you could guide your users to reuse that one instead
of exporting and publishing your own.
TODO(b/175369555): When exporting to users of TensorFlow 2.4, add flag
`--experimental_disable_assert_in_preprocessing`.
"""
from absl import app
from absl import flags
import gin

from official.modeling import hyperparams
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.tools import export_tfhub_lib
FLAGS = flags.FLAGS

flags.DEFINE_enum(
    "export_type", "model",
    ["model", "model_with_mlm", "preprocessing"],
    "The overall type of SavedModel to export. Flags "
    "--bert_config_file/--encoder_config_file and --vocab_file/--sp_model_file "
    "control which particular encoder model and preprocessing are exported.")

flags.DEFINE_string(
    "export_path", None,
    "Directory to which the SavedModel is written.")

flags.DEFINE_string(
    "encoder_config_file", None,
    "A yaml file representing `encoders.EncoderConfig` to define the encoder "
    "(BERT or other). "
    "Exactly one of --bert_config_file and --encoder_config_file can be set. "
    "Needed for --export_type model and model_with_mlm.")

flags.DEFINE_string(
    "bert_config_file", None,
    "A JSON file with a legacy BERT configuration to define the BERT encoder. "
    "Exactly one of --bert_config_file and --encoder_config_file can be set. "
    "Needed for --export_type model and model_with_mlm.")

flags.DEFINE_bool(
    "copy_pooler_dense_to_encoder", False,
    "When the model is trained using `BertPretrainerV2`, the pool layer "
    "of next sentence prediction task exists in `ClassificationHead` passed "
    "to `BertPretrainerV2`. If True, we will copy this pooler's dense layer "
    "to the encoder that is exported by this tool (as in classic BERT). "
    "Using `BertPretrainerV2` and leaving this False exports an untrained "
    "(randomly initialized) pooling layer, which some authors recommend for "
    "subsequent fine-tuning.")

flags.DEFINE_string(
    "model_checkpoint_path", None,
    "File path to a pre-trained model checkpoint. "
    "For --export_type model, this has to be an object-based (TF2) checkpoint "
    "that can be restored to `tf.train.Checkpoint(encoder=encoder)` "
    "for the `encoder` defined by the config file. "
    "(Legacy checkpoints with `model=` instead of `encoder=` are also "
    "supported for now.) "
    "For --export_type model_with_mlm, it must be restorable to "
    "`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)`. "
    "(For now, `tf.train.Checkpoint(pretrainer=BertPretrainerV2(...))` is also "
    "accepted.)")

flags.DEFINE_string(
    "vocab_file", None,
    "For encoders trained on BertTokenizer input: "
    "the vocabulary file that the encoder model was trained with. "
    "Exactly one of --vocab_file and --sp_model_file can be set. "
    "Needed for --export_type model, model_with_mlm and preprocessing.")

flags.DEFINE_string(
    "sp_model_file", None,
    "For encoders trained on SentencepieceTokenizer input: "
    "the SentencePiece .model file that the encoder model was trained with. "
    "Exactly one of --vocab_file and --sp_model_file can be set. "
    "Needed for --export_type model, model_with_mlm and preprocessing.")

flags.DEFINE_bool(
    "do_lower_case", None,
    "Whether to lowercase before tokenization. "
    "If left as None, and --vocab_file is set, do_lower_case will be enabled "
    "if 'uncased' appears in the name of --vocab_file. "
    "If left as None, and --sp_model_file set, do_lower_case defaults to true. "
    "Needed for --export_type model, model_with_mlm and preprocessing.")

flags.DEFINE_integer(
    "default_seq_length", 128,
    "The sequence length of preprocessing results from "
    "top-level preprocess method. This is also the default "
    "sequence length for the bert_pack_inputs subobject. "
    "Needed for --export_type preprocessing.")

flags.DEFINE_bool(
    "tokenize_with_offsets", False,  # Broken by b/149576200.
    "Whether to export a .tokenize_with_offsets subobject for "
    "--export_type preprocessing.")

flags.DEFINE_multi_string(
    "gin_file", default=None,
    help="List of paths to the config files.")

flags.DEFINE_multi_string(
    "gin_params", default=None,
    help="List of Gin bindings.")

flags.DEFINE_bool(  # TODO(b/175369555): Remove this flag and its use.
    "experimental_disable_assert_in_preprocessing", False,
    "Export a preprocessing model without tf.Assert ops. "
    "Usually, that would be a bad idea, except TF2.4 has an issue with "
    "Assert ops in tf.functions used in Dataset.map() on a TPU worker, "
    "and omitting the Assert ops lets SavedModels avoid the issue.")
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)

  if bool(FLAGS.vocab_file) == bool(FLAGS.sp_model_file):
    raise ValueError("Exactly one of `vocab_file` and `sp_model_file` "
                     "can be specified, but got %s and %s." %
                     (FLAGS.vocab_file, FLAGS.sp_model_file))
  do_lower_case = export_tfhub_lib.get_do_lower_case(
      FLAGS.do_lower_case, FLAGS.vocab_file, FLAGS.sp_model_file)

  if FLAGS.export_type in ("model", "model_with_mlm"):
    if bool(FLAGS.bert_config_file) == bool(FLAGS.encoder_config_file):
      raise ValueError("Exactly one of `bert_config_file` and "
                       "`encoder_config_file` can be specified, but got "
                       "%s and %s." %
                       (FLAGS.bert_config_file, FLAGS.encoder_config_file))
    if FLAGS.bert_config_file:
      bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
      encoder_config = None
    else:
      bert_config = None
      encoder_config = encoders.EncoderConfig()
      encoder_config = hyperparams.override_params_dict(
          encoder_config, FLAGS.encoder_config_file, is_strict=True)
    export_tfhub_lib.export_model(
        FLAGS.export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=FLAGS.model_checkpoint_path,
        vocab_file=FLAGS.vocab_file,
        sp_model_file=FLAGS.sp_model_file,
        do_lower_case=do_lower_case,
        with_mlm=FLAGS.export_type == "model_with_mlm",
        copy_pooler_dense_to_encoder=FLAGS.copy_pooler_dense_to_encoder)

  elif FLAGS.export_type == "preprocessing":
    export_tfhub_lib.export_preprocessing(
        FLAGS.export_path,
        vocab_file=FLAGS.vocab_file,
        sp_model_file=FLAGS.sp_model_file,
        do_lower_case=do_lower_case,
        default_seq_length=FLAGS.default_seq_length,
        tokenize_with_offsets=FLAGS.tokenize_with_offsets,
        experimental_disable_assert=(
            FLAGS.experimental_disable_assert_in_preprocessing))

  else:
    raise app.UsageError("Unknown value '%s' for flag --export_type" %
                         FLAGS.export_type)


if __name__ == "__main__":
  app.run(main)
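A note for review: every mutually-exclusive flag pair in `main` uses the same `bool(a) == bool(b)` idiom, which is true exactly when both or neither value is set. The pattern generalizes to any number of options; the sketch below is ours (the `require_exactly_one` helper does not exist in this tool):

```python
def require_exactly_one(**named_values):
    """Raises ValueError unless exactly one keyword argument is truthy.

    Generalizes the tool's two-flag check: for two values,
    bool(a) == bool(b) holds exactly when zero or two are set,
    i.e. exactly when the combination is invalid.
    """
    truthy = {name: value for name, value in named_values.items() if value}
    if len(truthy) != 1:
        raise ValueError("Exactly one of %s can be specified, but got %r" %
                         (", ".join(named_values), named_values))
    # Return the single (name, value) pair that was set.
    return next(iter(truthy.items()))

# Mirrors the --vocab_file / --sp_model_file validation:
name, value = require_exactly_one(vocab_file="vocab.txt", sp_model_file=None)
```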
official/nlp/tools/export_tfhub_lib.py  0 → 100644
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library of components of export_tfhub.py. See docstring there for more."""
import contextlib
import hashlib
import os
import tempfile
from typing import Optional, Text, Tuple

# Import libraries
from absl import logging
import tensorflow as tf
# pylint: disable=g-direct-tensorflow-import  TODO(b/175369555): Remove these.
from tensorflow.core.protobuf import saved_model_pb2
from tensorflow.python.ops import control_flow_ops
# pylint: enable=g-direct-tensorflow-import
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.modeling import networks
def get_bert_encoder(bert_config):
  """Returns a BertEncoder with dict outputs."""
  bert_encoder = networks.BertEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      embedding_width=bert_config.embedding_size,
      dict_outputs=True)
  return bert_encoder
def get_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
  """Returns do_lower_case, replacing None by a guess from vocab file name."""
  if do_lower_case is not None:
    return do_lower_case
  elif vocab_file:
    do_lower_case = "uncased" in vocab_file
    logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
                 do_lower_case, vocab_file)
    return do_lower_case
  elif sp_model_file:
    do_lower_case = True  # All public ALBERTs (as of Oct 2020) do it.
    logging.info("Defaulting to do_lower_case=%s for Sentencepiece tokenizer",
                 do_lower_case)
    return do_lower_case
  else:
    raise ValueError("Must set vocab_file or sp_model_file.")
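The precedence in `get_do_lower_case` is worth spelling out: an explicit flag value always wins; otherwise a wordpiece vocab is guessed from the substring "uncased" in its path, and a sentencepiece model defaults to lowercasing. A dependency-free restatement of that rule (our sketch, without the logging calls):

```python
def guess_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
    # An explicit True/False takes precedence over any guessing.
    if do_lower_case is not None:
        return do_lower_case
    if vocab_file:
        # BERT convention: lowercasing checkpoints ship vocabs whose path
        # contains "uncased" (e.g. uncased_L-12_H-768_A-12/vocab.txt).
        return "uncased" in vocab_file
    if sp_model_file:
        # All public ALBERTs (as of Oct 2020) lowercase.
        return True
    raise ValueError("Must set vocab_file or sp_model_file.")
```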
def _create_model(
    *,
    bert_config: Optional[configs.BertConfig] = None,
    encoder_config: Optional[encoders.EncoderConfig] = None,
    with_mlm: bool,
) -> Tuple[tf.keras.Model, tf.keras.Model]:
  """Creates the model to export and the model to restore the checkpoint.

  Args:
    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
      Exactly one of encoder_config and bert_config must be set.
    encoder_config: An `EncoderConfig` to create an encoder of the configured
      type (`BertEncoder` or other).
    with_mlm: A bool to control the second component of the result.
      If True, will create a `BertPretrainerV2` object; otherwise, will
      create a `BertEncoder` object.

  Returns:
    A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
    object or `BertEncoder` object depending on the value of `with_mlm`
    argument, which contains the first model and will be used for restoring
    weights from the checkpoint.
  """
  if (bert_config is not None) == (encoder_config is not None):
    raise ValueError("Exactly one of `bert_config` and `encoder_config` "
                     "can be specified, but got %s and %s" %
                     (bert_config, encoder_config))

  if bert_config is not None:
    encoder = get_bert_encoder(bert_config)
  else:
    encoder = encoders.build_encoder(encoder_config)

  # Convert from list of named inputs to dict of inputs keyed by name.
  # Only the latter accepts a dict of inputs after restoring from SavedModel.
  encoder_inputs_dict = {x.name: x for x in encoder.inputs}
  encoder_output_dict = encoder(encoder_inputs_dict)
  # For interchangeability with other text representations,
  # add "default" as an alias for BERT's whole-input representations.
  encoder_output_dict["default"] = encoder_output_dict["pooled_output"]
  core_model = tf.keras.Model(
      inputs=encoder_inputs_dict, outputs=encoder_output_dict)

  if with_mlm:
    if bert_config is not None:
      hidden_act = bert_config.hidden_act
    else:
      assert encoder_config is not None
      hidden_act = encoder_config.get().hidden_activation

    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        mlm_activation=tf_utils.get_activation(hidden_act))

    pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs}
    pretrainer_output_dict = pretrainer(pretrainer_inputs_dict)
    mlm_model = tf.keras.Model(
        inputs=pretrainer_inputs_dict, outputs=pretrainer_output_dict)
    # Set `_auto_track_sub_layers` to False, so that the additional weights
    # from `mlm` sub-object will not be included in the core model.
    # TODO(b/169210253): Use a public API when available.
    core_model._auto_track_sub_layers = False  # pylint: disable=protected-access
    core_model.mlm = mlm_model
    return core_model, pretrainer
  else:
    return core_model, encoder
def export_model(export_path: Text,
                 *,
                 bert_config: Optional[configs.BertConfig] = None,
                 encoder_config: Optional[encoders.EncoderConfig] = None,
                 model_checkpoint_path: Text,
                 with_mlm: bool,
                 copy_pooler_dense_to_encoder: bool = False,
                 vocab_file: Optional[Text] = None,
                 sp_model_file: Optional[Text] = None,
                 do_lower_case: Optional[bool] = None) -> None:
"""Exports an Encoder as SavedModel after restoring pre-trained weights.
The exported SavedModel implements a superset of the Encoder API for
Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
In particular, the exported SavedModel can be used in the following way:
```
# Calls default interface (encoder only).
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models).
"sequence_output" # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
}
```
If `with_mlm` is True, the exported SavedModel can also be called in the
following way:
```
# Calls expanded interface that includes logits of the Masked Language Model.
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
Args:
export_path: The SavedModel output directory.
bert_config: An optional `configs.BertConfig` object. Note: exactly one of
`bert_config` and following `encoder_config` must be specified.
encoder_config: An optional `encoders.EncoderConfig` object.
model_checkpoint_path: The path to the checkpoint.
with_mlm: Whether to export the additional mlm sub-object.
copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer
used in the next sentence prediction task to the encoder.
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
do_lower_case: Whether to lower-case text before tokenization.
"""
  if with_mlm:
    core_model, pretrainer = _create_model(
        bert_config=bert_config,
        encoder_config=encoder_config,
        with_mlm=with_mlm)
    encoder = pretrainer.encoder_network
    # It supports both the new pretrainer checkpoint produced by TF-NLP and
    # the checkpoint converted from TF1 (original BERT, SmallBERTs).
    checkpoint_items = pretrainer.checkpoint_items
    checkpoint = tf.train.Checkpoint(**checkpoint_items)
  else:
    core_model, encoder = _create_model(
        bert_config=bert_config,
        encoder_config=encoder_config,
        with_mlm=with_mlm)
    checkpoint = tf.train.Checkpoint(
        model=encoder,  # Legacy checkpoints.
        encoder=encoder)
  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()

  if copy_pooler_dense_to_encoder:
    logging.info("Copy pooler's dense layer to the encoder.")
    pooler_checkpoint = tf.train.Checkpoint(
        **{"next_sentence.pooler_dense": encoder.pooler_layer})
    pooler_checkpoint.restore(
        model_checkpoint_path).assert_existing_objects_matched()

  # Before SavedModels for preprocessing appeared in Oct 2020, the encoders
  # provided this information to let users do preprocessing themselves.
  # We keep doing that for now. It helps users to upgrade incrementally.
  # Moreover, it offers an escape hatch for advanced users who want the
  # full vocab, not the high-level operations from the preprocessing model.
  if vocab_file:
    core_model.vocab_file = tf.saved_model.Asset(vocab_file)
    if do_lower_case is None:
      raise ValueError("Must pass do_lower_case if passing vocab_file.")
    core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
  elif sp_model_file:
    # This was used by ALBERT, with implied values of do_lower_case=True
    # and strip_diacritics=True.
    core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
  else:
    raise ValueError("Must set vocab_file or sp_model_file")
  core_model.save(export_path, include_optimizer=False, save_format="tf")
class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
"""Wraps a BertPackInputs layer for export to SavedModel.
The wrapper object is suitable for use with `tf.saved_model.save()` and
`.load()`. The wrapper object is callable with inputs and outputs like the
BertPackInputs layer, but differs from saving an unwrapped Keras object:
- The inputs can be a list of 1 or 2 RaggedTensors of dtype int32 and
ragged rank 1 or 2. (In Keras, saving to a tf.function in a SavedModel
would fix the number of RaggedTensors and their ragged rank.)
- The call accepts an optional keyword argument `seq_length=` to override
the layer's .seq_length hyperparameter. (In Keras, a hyperparameter
could not be changed after saving to a tf.function in a SavedModel.)
"""
  def __init__(self, bert_pack_inputs: layers.BertPackInputs):
    super().__init__()

    # Preserve the layer's configured seq_length as a default but make it
    # overridable. Having this dynamically determined default argument
    # requires self.__call__ to be defined in this indirect way.
    default_seq_length = bert_pack_inputs.seq_length

    @tf.function(autograph=False)
    def call(inputs, seq_length=default_seq_length):
      return layers.BertPackInputs.bert_pack_inputs(
          inputs,
          seq_length=seq_length,
          start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
          end_of_segment_id=bert_pack_inputs.end_of_segment_id,
          padding_id=bert_pack_inputs.padding_id)

    self.__call__ = call

    for ragged_rank in range(1, 3):
      for num_segments in range(1, 3):
        _ = self.__call__.get_concrete_function(
            [tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
             for _ in range(num_segments)],
            seq_length=tf.TensorSpec([], tf.int32))
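The key move in `__init__` above is binding the layer's configured `seq_length` as the default argument of a freshly defined function, so that the exported signature keeps it overridable per call while freezing the value chosen at export time. The same idea in plain Python (a toy sketch; `PackLayer` is a stand-in, not the real `BertPackInputs`):

```python
class PackLayer:
    """Stand-in for a layer with a fixed seq_length hyperparameter."""

    def __init__(self, seq_length):
        self.seq_length = seq_length


class PackWrapper:
    def __init__(self, layer):
        # Freeze the layer's current seq_length as a *default argument*.
        # Unlike an attribute lookup at call time, the default survives
        # later mutation of the wrapped layer, yet callers may override it.
        default_seq_length = layer.seq_length

        def call(inputs, seq_length=default_seq_length):
            return [x[:seq_length] for x in inputs]  # Toy "packing".

        # Instance attribute, as in the wrapper above; callers go through
        # the .__call__ attribute (as SavedModel loading does).
        self.__call__ = call
```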
def create_preprocessing(*,
                         vocab_file: Optional[str] = None,
                         sp_model_file: Optional[str] = None,
                         do_lower_case: bool,
                         tokenize_with_offsets: bool,
                         default_seq_length: int) -> tf.keras.Model:
"""Returns a preprocessing Model for given tokenization parameters.
This function builds a Keras Model with attached subobjects suitable for
saving to a SavedModel. The resulting SavedModel implements the Preprocessor
API for Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
Args:
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
This determines the type of tokenizer that is used.
do_lower_case: Whether to do lower case.
tokenize_with_offsets: Whether to include the .tokenize_with_offsets
subobject.
default_seq_length: The sequence length of preprocessing results from
root callable. This is also the default sequence length for the
bert_pack_inputs subobject.
Returns:
A tf.keras.Model object with several attached subobjects, suitable for
saving as a preprocessing SavedModel.
"""
  # Select tokenizer.
  if bool(vocab_file) == bool(sp_model_file):
    raise ValueError("Must set exactly one of vocab_file, sp_model_file")
  if vocab_file:
    tokenize = layers.BertTokenizer(
        vocab_file=vocab_file,
        lower_case=do_lower_case,
        tokenize_with_offsets=tokenize_with_offsets)
  else:
    tokenize = layers.SentencepieceTokenizer(
        model_file_path=sp_model_file,
        lower_case=do_lower_case,
        strip_diacritics=True,  # Strip diacritics to follow ALBERT model.
        tokenize_with_offsets=tokenize_with_offsets)

  # The root object of the preprocessing model can be called to do
  # one-shot preprocessing for users with single-sentence inputs.
  sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
  if tokenize_with_offsets:
    tokens, start_offsets, limit_offsets = tokenize(sentences)
  else:
    tokens = tokenize(sentences)
  pack = layers.BertPackInputs(
      seq_length=default_seq_length,
      special_tokens_dict=tokenize.get_special_tokens_dict())
  model_inputs = pack(tokens)
  preprocessing = tf.keras.Model(sentences, model_inputs)

  # Individual steps of preprocessing are made available as named subobjects
  # to enable more general preprocessing. For saving, they need to be Models
  # in their own right.
  preprocessing.tokenize = tf.keras.Model(sentences, tokens)
  # Provide an equivalent to tokenize.get_special_tokens_dict().
  preprocessing.tokenize.get_special_tokens_dict = tf.train.Checkpoint()
  preprocessing.tokenize.get_special_tokens_dict.__call__ = tf.function(
      lambda: tokenize.get_special_tokens_dict(),  # pylint: disable=unnecessary-lambda
      input_signature=[])
  if tokenize_with_offsets:
    preprocessing.tokenize_with_offsets = tf.keras.Model(
        sentences, [tokens, start_offsets, limit_offsets])
    preprocessing.tokenize_with_offsets.get_special_tokens_dict = (
        preprocessing.tokenize.get_special_tokens_dict)
  # Conceptually, this should be
  # preprocessing.bert_pack_inputs = tf.keras.Model(tokens, model_inputs)
  # but technicalities require us to use a wrapper (see comments there).
  # In particular, seq_length can be overridden when calling this.
  preprocessing.bert_pack_inputs = BertPackInputsSavedModelWrapper(pack)

  return preprocessing
def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
  """Returns new path with same basename and hash of original path."""
  if file_path is None:
    return None
  olddir, filename = os.path.split(file_path)
  hasher = hashlib.sha1()
  hasher.update(olddir.encode("utf-8"))
  target_dir = os.path.join(tmpdir, hasher.hexdigest())
  target_file = os.path.join(target_dir, filename)
  tf.io.gfile.mkdir(target_dir)
  tf.io.gfile.copy(file_path, target_file)
  return target_file
def export_preprocessing(export_path: Text,
                         *,
                         vocab_file: Optional[Text] = None,
                         sp_model_file: Optional[Text] = None,
                         do_lower_case: bool,
                         tokenize_with_offsets: bool,
                         default_seq_length: int,
                         experimental_disable_assert: bool = False) -> None:
  """Exports preprocessing to a SavedModel for TF Hub."""
  with tempfile.TemporaryDirectory() as tmpdir:
    # TODO(b/175369555): Remove experimental_disable_assert and its use.
    with _maybe_disable_assert(experimental_disable_assert):
      preprocessing = create_preprocessing(
          vocab_file=_move_to_tmpdir(vocab_file, tmpdir),
          sp_model_file=_move_to_tmpdir(sp_model_file, tmpdir),
          do_lower_case=do_lower_case,
          tokenize_with_offsets=tokenize_with_offsets,
          default_seq_length=default_seq_length)
      preprocessing.save(export_path, include_optimizer=False,
                         save_format="tf")
    if experimental_disable_assert:
      _check_no_assert(export_path)
  # It helps the unit test to prevent stray copies of the vocab file.
  if tf.io.gfile.exists(tmpdir):
    raise IOError("Failed to clean up TemporaryDirectory")
# TODO(b/175369555): Remove all workarounds for this bug of TensorFlow 2.4
# when this bug is no longer a concern for publishing new models.
# TensorFlow 2.4 has a placement issue with Assert ops in tf.functions called
# from Dataset.map() on a TPU worker. They end up on the TPU coordinator,
# and invoking them from the TPU worker is either inefficient (when possible)
# or impossible (notably when using "headless" TPU workers on Cloud that do not
# have a channel to the coordinator). The bug has been fixed in time for TF 2.5.
# To work around this, the following code avoids Assert ops in the exported
# SavedModels. It monkey-patches calls to tf.Assert from inside TensorFlow and
# replaces them by a no-op while building the exported model. This is fragile,
# so _check_no_assert() validates the result. The resulting model should be fine
# to read on future versions of TF, even if this workaround at export time
# may break eventually. (Failing unit tests will tell.)
def _dont_assert(condition, data, summarize=None, name="Assert"):
  """The no-op version of tf.Assert installed by _maybe_disable_assert."""
  del condition, data, summarize  # Unused.
  if tf.executing_eagerly():
    return
  with tf.name_scope(name):
    return tf.no_op(name="dont_assert")
@contextlib.contextmanager
def _maybe_disable_assert(disable_assert):
  """Scoped monkey patch of control_flow_ops.Assert to a no-op."""
  if not disable_assert:
    yield
    return
  original_assert = control_flow_ops.Assert
  control_flow_ops.Assert = _dont_assert
  yield
  control_flow_ops.Assert = original_assert
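For comparison, a generic scoped monkey patch usually restores the original attribute in a `finally` clause, so an exception inside the `with` body cannot leave the patch installed; `_maybe_disable_assert` above skips that safeguard. A self-contained sketch of the pattern (names are ours, not part of the tool):

```python
import contextlib


@contextlib.contextmanager
def patched_attr(obj, name, replacement):
    """Temporarily replaces obj.<name> with `replacement`."""
    original = getattr(obj, name)
    setattr(obj, name, replacement)
    try:
        yield
    finally:
        # Restore even if the body raised, unlike the export-time patch.
        setattr(obj, name, original)


class Mod:
    greeting = "hello"


with patched_attr(Mod, "greeting", "hi"):
    assert Mod.greeting == "hi"
assert Mod.greeting == "hello"  # Restored after the block.
```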
def _check_no_assert(saved_model_path):
  """Raises AssertionError if SavedModel contains Assert ops."""
  saved_model_filename = os.path.join(saved_model_path, "saved_model.pb")
  with tf.io.gfile.GFile(saved_model_filename, "rb") as f:
    saved_model = saved_model_pb2.SavedModel.FromString(f.read())

  assert_nodes = []
  graph_def = saved_model.meta_graphs[0].graph_def
  assert_nodes += ["node '{}' in global graph".format(n.name)
                   for n in graph_def.node if n.op == "Assert"]
  for fdef in graph_def.library.function:
    assert_nodes += [
        "node '{}' in function '{}'".format(n.name, fdef.signature.name)
        for n in fdef.node_def if n.op == "Assert"]
  if assert_nodes:
    raise AssertionError(
        "Internal tool error: "
        "failed to suppress {} Assert ops in SavedModel:\n{}".format(
            len(assert_nodes), "\n".join(assert_nodes[:10])))
official/nlp/tools/export_tfhub_lib_test.py
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests export_tfhub_lib."""
import os
import tempfile

from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sentencepiece import SentencePieceTrainer
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
                                       num_hidden_layers):
  """Returns config args for export_tfhub_lib._create_model()."""
  if use_bert_config:
    bert_config = configs.BertConfig(
        vocab_size=100,
        hidden_size=hidden_size,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=num_hidden_layers)
    encoder_config = None
  else:
    bert_config = None
    encoder_config = encoders.EncoderConfig(
        type="albert",
        albert=encoders.AlbertEncoderConfig(
            vocab_size=100,
            embedding_width=16,
            hidden_size=hidden_size,
            intermediate_size=32,
            max_position_embeddings=128,
            num_attention_heads=2,
            num_layers=num_hidden_layers,
            dropout_rate=0.1))
  return bert_config, encoder_config
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
  """Returns tokenizer asset args for export_tfhub_lib.export_model()."""
  dummy_file = os.path.join(temp_dir, "dummy_file.txt")
  with tf.io.gfile.GFile(dummy_file, "w") as f:
    f.write("dummy content")
  if use_sp_model:
    vocab_file, sp_model_file = None, dummy_file
  else:
    vocab_file, sp_model_file = dummy_file, None
  return vocab_file, sp_model_file
def _read_asset(asset: tf.saved_model.Asset):
  return tf.io.gfile.GFile(asset.asset_path.numpy()).read()
def _find_lambda_layers(layer):
  """Returns list of all Lambda layers in a Keras model."""
  if isinstance(layer, tf.keras.layers.Lambda):
    return [layer]
  elif hasattr(layer, "layers"):  # It's nested, like a Model.
    result = []
    for l in layer.layers:
      result += _find_lambda_layers(l)
    return result
  else:
    return []
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from an Encoder checkpoint to a SavedModel without
  the .mlm subobject. This is no longer preferred, but still useful
  for models like Electra that are trained without the MLM task.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
    - BERT built from a legacy BertConfig, for use with BertTokenizer.
    - ALBERT built from an EncoderConfig (as a representative of all other
      choices beyond BERT), for use with SentencepieceTokenizer (the one
      alternative to BertTokenizer).
  """
  @parameterized.named_parameters(("Bert", True), ("Albert", False))
  def test_export_model(self, use_bert):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 1
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers)
    bert_model, encoder = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(encoder=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=False,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight.numpy(), hub_weight.numpy())

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_output = hub_layer(input_dict)
    source_output = bert_model(input_dict)
    encoder_output = encoder(input_dict)
    self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_output["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)

    for key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_output[key], hub_output[key])
      self.assertAllClose(source_output[key], encoder_output[key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_output["pooled_output"], hub_output["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    output_dict = hub_layer(input_dict)
    pooled_output = output_dict["pooled_output"]
    sequence_output = output_dict["sequence_output"]
    encoder_outputs = output_dict["encoder_outputs"]

    self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
    self.assertEqual(sequence_output.shape.as_list(),
                     [None, seq_length, hidden_size])
    self.assertLen(encoder_outputs, num_hidden_layers)
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from a Pretrainer checkpoint to a SavedModel including
  the .mlm subobject, which is the preferred way since 2020.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
    - BERT built from a legacy BertConfig, for use with BertTokenizer.
    - ALBERT built from an EncoderConfig (as a representative of all other
      choices beyond BERT), for use with SentencepieceTokenizer (the one
      alternative to BertTokenizer).
  """
  def test_copy_pooler_dense_to_encoder(self):
    encoder_config = encoders.EncoderConfig(
        type="bert",
        bert=encoders.BertEncoderConfig(
            hidden_size=24, intermediate_size=48, num_layers=2))
    cls_heads = [
        layers.ClassificationHead(
            inner_dim=24, num_classes=2, name="next_sentence")
    ]
    encoder = encoders.build_encoder(encoder_config)
    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        mlm_activation=tf_utils.get_activation(
            encoder_config.get().hidden_activation))
    # Makes sure the pretrainer variables are created.
    _ = pretrainer(pretrainer.inputs)
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=True)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        encoder_config=encoder_config,
        model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
        with_mlm=True,
        copy_pooler_dense_to_encoder=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    # Restores a hub KerasLayer.
    hub_layer = hub.KerasLayer(export_path, trainable=True)
    dummy_ids = np.zeros((2, 10), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_pooled_output = hub_layer(input_dict)["pooled_output"]
    encoder_outputs = encoder(input_dict)
    # Verify that hub_layer's pooled_output is the same as the output of next
    # sentence prediction's dense layer.
    pretrained_pooled_output = cls_heads[0].dense(
        encoder_outputs["sequence_output"][:, 0, :])
    self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
    # But the pooled_output of the encoder and of hub_layer are not the same.
    encoder_pooled_output = encoder_outputs["pooled_output"]
    self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
  @parameterized.named_parameters(
      ("Bert", True),
      ("Albert", False),
  )
  def test_export_model_with_mlm(self, use_bert):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers)
    bert_model, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    bert_model_with_mlm = bert_model.mlm
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    # Note that we set `_auto_track_sub_layers` to False when exporting the
    # SavedModel, so hub_layer has the same number of weights as bert_model;
    # otherwise, hub_layer will have extra weights from its `mlm` subobject.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_outputs_dict = hub_layer(input_dict)
    source_outputs_dict = bert_model(input_dict)
    encoder_outputs_dict = pretrainer.encoder_network(
        [dummy_ids, dummy_ids, dummy_ids])
    self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_outputs_dict["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_outputs_dict[output_key],
                          hub_outputs_dict[output_key])
      self.assertAllClose(source_outputs_dict[output_key],
                          encoder_outputs_dict[output_key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_outputs_dict["pooled_output"],
                        hub_outputs_dict["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Checks sub-object `mlm`.
    self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))

    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(bert_model_with_mlm.trainable_weights))
    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(pretrainer.trainable_weights))
    for source_weight, hub_weight, pretrainer_weight in zip(
        bert_model_with_mlm.trainable_weights,
        hub_layer.resolved_object.mlm.trainable_variables,
        pretrainer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)
      self.assertAllClose(source_weight, pretrainer_weight)

    max_predictions_per_seq = 4
    mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids,
        masked_lm_positions=mlm_positions)
    hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
    source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
    for output_key in ("pooled_output", "sequence_output", "mlm_logits",
                       "encoder_outputs"):
      self.assertAllClose(hub_mlm_outputs_dict[output_key],
                          source_mlm_outputs_dict[output_key])

    pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
    self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
                        pretrainer_mlm_logits_output)

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev_mlm(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids),
          masked_lm_positions=mlm_position_ids)
      outputs = np.concatenate([
          hub_layer.resolved_object.mlm(input_dict,
                                        training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    hub_outputs_dict = hub_layer(input_dict)
    self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
                     [None, hidden_size])
    self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
                     [None, seq_length, hidden_size])
_STRING_NOT_TO_LEAK = "private_path_component_"


class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
  def _make_vocab_file(self, vocab, filename="vocab.txt"):
    """Creates wordpiece vocab file with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        [PAD], [UNK], [CLS], [SEP], ...vocab...

    This function also accepts wordpieces that start with the ## continuation
    marker, but avoiding those makes this function interchangeable with
    _make_sp_model_file(), up to the extra dimension returned by BertTokenizer.

    Args:
      vocab: a list of strings with the words or wordpieces to put into the
        model's vocabulary. Do not include special tokens here.
      filename: Optionally, a filename (relative to the temporary directory
        created by this function).

    Returns:
      The absolute filename of the created vocab file.
    """
    full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab
    path = os.path.join(
        tempfile.mkdtemp(
            dir=self.get_temp_dir(),  # New subdir each time.
            prefix=_STRING_NOT_TO_LEAK),
        filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(full_vocab + [""]))
    return path
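The vocab layout above fixes the id of every token: the four special tokens take ids 0 through 3, and the caller's words follow. A small sketch makes this mapping explicit (`wordpiece_ids` is a hypothetical helper for illustration, not part of the test):

```python
def wordpiece_ids(vocab):
  """Maps each token to the id implied by the [PAD], [UNK], [CLS], [SEP] prefix."""
  full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + list(vocab)
  return {token: i for i, token in enumerate(full_vocab)}

ids = wordpiece_ids(["d", "ef", "abc", "xy"])
# ids["[CLS]"] == 2, ids["[SEP]"] == 3, ids["d"] == 4, ids["abc"] == 6
```

This is why the tests below expect "abc d ef" to tokenize to `[6, 4, 5]` and packed inputs to start with `2` ([CLS]) and end each segment with `3` ([SEP]).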
  def _make_sp_model_file(self, vocab, prefix="spm"):
    """Creates Sentencepiece word model with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        <pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>

    The words in the input vocab are plain text, without the whitespace marker.
    That makes this function interchangeable with _make_vocab_file().

    Args:
      vocab: a list of strings with the words to put into the model's
        vocabulary. Do not include special tokens here.
      prefix: an optional string, to change the filename prefix for the model
        (relative to the temporary directory created by this function).

    Returns:
      The absolute filename of the created Sentencepiece model file.
    """
    model_prefix = os.path.join(
        tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
        prefix)
    input_file = model_prefix + "_train_input.txt"
    # Create input text for training the sp model from the tokens provided.
    # Repeat tokens, the earlier the more, because they are sorted by frequency.
    input_text = []
    for i, token in enumerate(vocab):
      input_text.append(" ".join([token] * (len(vocab) - i)))
    with tf.io.gfile.GFile(input_file, "w") as f:
      f.write("\n".join(input_text + [""]))
    full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
    flags = dict(
        model_prefix=model_prefix,
        model_type="word",
        input=input_file,
        pad_id=0,
        unk_id=1,
        control_symbols="[CLS],[SEP]",
        vocab_size=full_vocab_size,
        bos_id=full_vocab_size - 2,
        eos_id=full_vocab_size - 1)
    SentencePieceTrainer.Train(" ".join(
        ["--{}={}".format(k, v) for k, v in flags.items()]))
    return model_prefix + ".model"
  def _do_export(self,
                 vocab,
                 do_lower_case,
                 default_seq_length=128,
                 tokenize_with_offsets=True,
                 use_sp_model=False,
                 experimental_disable_assert=False):
    """Runs SavedModel export and returns the export_path."""
    export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
    vocab_file = sp_model_file = None
    if use_sp_model:
      sp_model_file = self._make_sp_model_file(vocab)
    else:
      vocab_file = self._make_vocab_file(vocab)
    export_tfhub_lib.export_preprocessing(
        export_path,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=do_lower_case,
        tokenize_with_offsets=tokenize_with_offsets,
        default_seq_length=default_seq_length,
        experimental_disable_assert=experimental_disable_assert)
    # Invalidate the original filename to verify loading from the SavedModel.
    tf.io.gfile.remove(sp_model_file or vocab_file)
    return export_path
  def test_no_leaks(self):
    """Tests not leaking the path to the original vocab file."""
    path = self._do_export(
        ["d", "ef", "abc", "xy"], do_lower_case=True, use_sp_model=False)
    with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
      self.assertFalse(  # pylint: disable=g-generic-assert
          _STRING_NOT_TO_LEAK.encode("ascii") in f.read())
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_exported_callables(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["d", "ef", "abc", "xy"],
            do_lower_case=True,
            tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
            experimental_disable_assert=True,  # TODO(b/175369555): drop this.
            use_sp_model=use_sp_model))

    def fold_dim(rt):
      """Removes the word/subword distinction of BertTokenizer."""
      return rt if use_sp_model else rt.merge_dims(1, 2)

    # .tokenize()
    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(
        fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))

    special_tokens_dict = {
        k: v.numpy().item()  # Expecting eager Tensor, converting to Python.
        for k, v in preprocess.tokenize.get_special_tokens_dict().items()
    }
    self.assertDictEqual(
        special_tokens_dict,
        dict(
            padding_id=0,
            start_of_sequence_id=2,
            end_of_segment_id=3,
            vocab_size=4 + 6 if use_sp_model else 4 + 4))

    # .tokenize_with_offsets()
    if use_sp_model:
      # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
      self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
    else:
      token_ids, start_offsets, limit_offsets = (
          preprocess.tokenize_with_offsets(inputs))
      self.assertAllEqual(
          fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
      self.assertAllEqual(
          fold_dim(start_offsets),
          tf.ragged.constant([[0, 4, 6], [0, 4, 6, 9]]))
      self.assertAllEqual(
          fold_dim(limit_offsets),
          tf.ragged.constant([[3, 5, 8], [3, 5, 8, 10]]))
      self.assertIs(preprocess.tokenize.get_special_tokens_dict,
                    preprocess.tokenize_with_offsets.get_special_tokens_dict)

    # Root callable.
    bert_inputs = preprocess(inputs)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    # .bert_pack_inputs()
    inputs_2 = tf.constant(["d xy", "xy abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2],
                                              seq_length=256)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))
  # For BertTokenizer only: repeat relevant parts for do_lower_case=False,
  # default_seq_length=10, experimental_disable_assert=False,
  # tokenize_with_offsets=False, and without folding the word/subword dimension.
  def test_cased_length10(self):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["d", "##ef", "abc", "ABC"],
            do_lower_case=False,
            default_seq_length=10,
            tokenize_with_offsets=False,
            use_sp_model=False,
            experimental_disable_assert=False))
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(token_ids,
                        tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))

    self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))

    bert_inputs = preprocess(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    inputs_2 = tf.constant(["d ABC", "ABC abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
    # Test default seq_length=10.
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
  # XLA requires fixed shapes for tensors found in graph mode.
  # Statically known shapes in Python are a particularly firm way to
  # guarantee that, and they are generally more convenient to work with.
  # We test that the exported SavedModel plays well with TF's shape
  # inference when applied to fully or partially known input shapes.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_shapes(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["abc", "def"],
            do_lower_case=True,
            tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
            experimental_disable_assert=True,  # TODO(b/175369555): drop this.
            use_sp_model=use_sp_model))

    def expected_bert_input_shapes(batch_size, seq_length):
      return dict(
          input_word_ids=[batch_size, seq_length],
          input_mask=[batch_size, seq_length],
          input_type_ids=[batch_size, seq_length])

    for batch_size in [7, None]:
      if use_sp_model:
        token_out_shape = [batch_size, None]  # No word/subword distinction.
      else:
        token_out_shape = [batch_size, None, None]
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess.tokenize,
                                        tf.TensorSpec([batch_size], tf.string)),
          token_out_shape, "with batch_size=%s" % batch_size)
      # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
      if use_sp_model:
        self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
      else:
        self.assertEqual(
            _result_shapes_in_tf_function(
                preprocess.tokenize_with_offsets,
                tf.TensorSpec([batch_size], tf.string)),
            [token_out_shape] * 3, "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(
              preprocess.bert_pack_inputs,
              [tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
              seq_length=256),
          expected_bert_input_shapes(batch_size, 256),
          "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess,
                                        tf.TensorSpec([batch_size], tf.string)),
          expected_bert_input_shapes(batch_size, 128),
          "with batch_size=%s" % batch_size)
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_reexport(self, use_sp_model):
    """Test that preprocess keeps working after another save/load cycle."""
    path1 = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        default_seq_length=10,
        tokenize_with_offsets=False,
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        use_sp_model=use_sp_model)
    path2 = path1.rstrip("/") + ".2"
    model1 = tf.saved_model.load(path1)
    tf.saved_model.save(model1, path2)
    # Delete the first SavedModel to test that the second one loads by itself.
    # https://github.com/tensorflow/tensorflow/issues/46456 reports such a
    # failure case for BertTokenizer.
    tf.io.gfile.rmtree(path1)
    model2 = tf.saved_model.load(path2)

    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    bert_inputs = model2(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_special_tokens_in_estimator(self, use_sp_model):
    """Tests getting special tokens without an Eager init context."""
    preprocess_export_path = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        use_sp_model=use_sp_model,
        tokenize_with_offsets=False)

    def _get_special_tokens_dict(obj):
      """Returns special tokens of restored tokenizer as Python values."""
      if tf.executing_eagerly():
        special_tokens_numpy = {
            k: v.numpy() for k, v in obj.get_special_tokens_dict().items()}
      else:
        with tf.Graph().as_default():
          # This code expects `get_special_tokens_dict()` to be a tf.function
          # with no dependencies (bound args) from the context it was loaded
          # in, and boldly assumes that it can just be called in a different
          # context.
          special_tokens_tensors = obj.get_special_tokens_dict()
          with tf.compat.v1.Session() as sess:
            special_tokens_numpy = sess.run(special_tokens_tensors)
      return {k: v.item()  # Numpy to Python.
              for k, v in special_tokens_numpy.items()}

    def input_fn():
      self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
      preprocess = tf.saved_model.load(preprocess_export_path)
      tokenize = hub.KerasLayer(preprocess.tokenize)
      special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = tokenize(sentences)
      packed_inputs = layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds

    def model_fn(features, labels, mode):
      del labels  # Unused.
      return tf.estimator.EstimatorSpec(
          mode=mode, predictions=features["input_word_ids"])

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]]))
  # TODO(b/175369555): Remove that code and its test.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_check_no_assert(self, use_sp_model):
    """Tests the self-check during export without assertions."""
    preprocess_export_path = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        use_sp_model=use_sp_model,
        tokenize_with_offsets=False,
        experimental_disable_assert=False)
    with self.assertRaisesRegex(AssertionError,
                                r"failed to suppress \d+ Assert ops"):
      export_tfhub_lib._check_no_assert(preprocess_export_path)
def _result_shapes_in_tf_function(fn, *args, **kwargs):
  """Returns shapes (as lists) observed on the result of `fn`.

  Args:
    fn: A callable.
    *args: TensorSpecs for Tensor-valued arguments and actual values
      for Python-valued arguments to fn.
    **kwargs: Same for keyword arguments.

  Returns:
    The nest of partial tensor shapes (as lists) that is statically known
    inside tf.function(fn)(*args, **kwargs) for the nest of its results.
  """
  # Use a captured mutable container for a side output from the wrapper.
  uninitialized = "uninitialized!"
  result_shapes_container = [uninitialized]
  assert result_shapes_container[0] is uninitialized

  @tf.function
  def shape_reporting_wrapper(*args, **kwargs):
    result = fn(*args, **kwargs)
    result_shapes_container[0] = tf.nest.map_structure(
        lambda x: x.shape.as_list(), result)
    return result

  shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
  assert result_shapes_container[0] is not uninitialized
  return result_shapes_container[0]


if __name__ == "__main__":
  tf.test.main()
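The `_result_shapes_in_tf_function` helper above smuggles a side output out of a traced function through a captured mutable container. The pattern itself does not depend on TensorFlow; a minimal pure-Python sketch of the same trick (names hypothetical) looks like this:

```python
def observe_result(fn, *args):
    """Runs fn and records a fact about its result via a captured container."""
    container = [None]  # Mutable container captured by the wrapper closure.

    def wrapper(*a):
        result = fn(*a)
        container[0] = type(result).__name__  # The side output.
        return result

    wrapper(*args)
    return container[0]


print(observe_result(lambda x: x + 1, 41))  # -> int
```

A list (rather than a plain variable) is used because closures can mutate a captured object but cannot rebind a captured name without `nonlocal`; the same constraint applies inside a `tf.function` body.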
official/nlp/transformer/transformer.py
View file @
2e9bb539
...
...
@@ -109,8 +109,8 @@ class Transformer(tf.keras.Model):
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
-         outputs: [batch_size, decoded length]
-         scores: [batch_size, float]}
+         outputs: int tensor with shape [batch_size, decoded_length]
+         scores: float tensor with shape [batch_size]}
      Even when float16 is used, the output tensor(s) are always float32.
    Raises:
...
...
official/nlp/transformer/translate.py
...
...
@@ -151,14 +151,8 @@ def translate_file(model,
       text = distribution_strategy.run(text_as_per_replica)
       outputs = distribution_strategy.experimental_local_results(
           predict_step(text))
-      tags, unordered_val_outputs = outputs[0]
-      tags = [tag.numpy() for tag in tags._values]
-      unordered_val_outputs = [
-          val_output.numpy() for val_output in unordered_val_outputs._values]
-      # pylint: enable=protected-access
-      val_outputs = [None] * len(tags)
-      for k in range(len(tags)):
-        val_outputs[tags[k]] = unordered_val_outputs[k]
+      val_outputs = [output for _, output in outputs]
       val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
     else:
       val_outputs, _ = model.predict(text)
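The deleted branch above reordered per-replica results by a tag recording each example's original position in the batch. A standalone sketch of that reordering, with made-up tags and outputs, shows what the removed loop did:

```python
# Hypothetical per-replica results: tag k says where output k belongs
# in the original batch order.
tags = [2, 0, 1]
unordered_val_outputs = ["out-c", "out-a", "out-b"]

val_outputs = [None] * len(tags)
for k in range(len(tags)):
    val_outputs[tags[k]] = unordered_val_outputs[k]

print(val_outputs)  # -> ['out-a', 'out-b', 'out-c']
```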
...
...
official/nlp/xlnet/preprocess_pretrain_data.py
...
...
@@ -22,14 +22,15 @@ import random

 # Import libraries
 from absl import app
 from absl import flags
-import absl.logging as _logging  # pylint: disable=unused-import
+from absl import logging
 import numpy as np
-import tensorflow.google as tf
-from official.nlp.xlnet import preprocess_utils
+import tensorflow.compat.v1 as tf
 import sentencepiece as spm
 from official.nlp.xlnet import preprocess_utils

 FLAGS = flags.FLAGS

 special_symbols = {
...
@@ -89,6 +90,7 @@ def format_filename(prefix, bsz_per_host, seq_len, bi_data, suffix,

 def _create_data(idx, input_paths):
   """Creates data."""
   # Load sentence-piece model
   sp = spm.SentencePieceProcessor()
   sp.Load(FLAGS.sp_path)
...
@@ -98,10 +100,10 @@ def _create_data(idx, input_paths):
   for input_path in input_paths:
     input_data, sent_ids = [], []
     sent_id, line_cnt = True, 0
-    tf.logging.info("Processing %s", input_path)
+    logging.info("Processing %s", input_path)
     for line in tf.gfile.Open(input_path):
       if line_cnt % 100000 == 0:
-        tf.logging.info("Loading line %d", line_cnt)
+        logging.info("Loading line %d", line_cnt)
       line_cnt += 1

       if not line.strip():
...
@@ -122,7 +124,7 @@ def _create_data(idx, input_paths):
         sent_ids.extend([sent_id] * len(cur_sent))
         sent_id = not sent_id

-    tf.logging.info("Finish with line %d", line_cnt)
+    logging.info("Finish with line %d", line_cnt)

     if line_cnt == 0:
       continue
...
@@ -132,7 +134,7 @@ def _create_data(idx, input_paths):
     total_line_cnt += line_cnt
     input_shards.append((input_data, sent_ids))

-  tf.logging.info("[Task %d] Total number line: %d", idx, total_line_cnt)
+  logging.info("[Task %d] Total number line: %d", idx, total_line_cnt)

   tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")
...
@@ -142,8 +144,8 @@ def _create_data(idx, input_paths):
     np.random.seed(100 * FLAGS.task + FLAGS.pass_id)
     perm_indices = np.random.permutation(len(input_shards))
-    tf.logging.info("Using perm indices %s for pass %d",
-                    perm_indices.tolist(), FLAGS.pass_id)
+    logging.info("Using perm indices %s for pass %d",
+                 perm_indices.tolist(), FLAGS.pass_id)

     input_data_list, sent_ids_list = [], []
     prev_sent_id = None
...
@@ -185,6 +187,7 @@ def _create_data(idx, input_paths):

 def create_data(_):
   """Creates pretrain data."""
   # Validate FLAGS
   assert FLAGS.bsz_per_host % FLAGS.num_core_per_host == 0
   if not FLAGS.use_tpu:
...
@@ -221,16 +224,16 @@ def create_data(_):
   # Interleave the work into FLAGS.num_task splits
   file_paths = sorted(tf.gfile.Glob(FLAGS.input_glob))
-  tf.logging.info("Use glob: %s", FLAGS.input_glob)
-  tf.logging.info("Find %d files: %s", len(file_paths), file_paths)
+  logging.info("Use glob: %s", FLAGS.input_glob)
+  logging.info("Find %d files: %s", len(file_paths), file_paths)

   task_file_paths = file_paths[FLAGS.task::FLAGS.num_task]
   if not task_file_paths:
-    tf.logging.info("Exit: task %d has no file to process.", FLAGS.task)
+    logging.info("Exit: task %d has no file to process.", FLAGS.task)
     return

-  tf.logging.info("Task %d process %d files: %s",
-                  FLAGS.task, len(task_file_paths), task_file_paths)
+  logging.info("Task %d process %d files: %s",
+               FLAGS.task, len(task_file_paths), task_file_paths)
   record_info = _create_data(FLAGS.task, task_file_paths)

   record_prefix = "record_info-{}-{}-{}".format(
...
@@ -253,6 +256,7 @@ def create_data(_):

 def batchify(data, bsz_per_host, sent_ids=None):
   """Creates batches."""
   num_step = len(data) // bsz_per_host
   data = data[:bsz_per_host * num_step]
   data = data.reshape(bsz_per_host, num_step)
...
@@ -270,9 +274,9 @@ def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False):
   data_len = data.shape[0]
   if begin_idx + tot_len >= data_len:
-    tf.logging.info("[_split_a_and_b] returns None: "
-                    "begin_idx %d + tot_len %d >= data_len %d",
-                    begin_idx, tot_len, data_len)
+    logging.info("[_split_a_and_b] returns None: "
+                 "begin_idx %d + tot_len %d >= data_len %d",
+                 begin_idx, tot_len, data_len)
     return None

   end_idx = begin_idx + 1
...
@@ -284,9 +288,9 @@ def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False):
     end_idx += 1

   a_begin = begin_idx
-  if len(cut_points) == 0 or random.random() < 0.5:
+  if len(cut_points) == 0 or random.random() < 0.5:  # pylint:disable=g-explicit-length-test
     label = 0
-    if len(cut_points) == 0:
+    if len(cut_points) == 0:  # pylint:disable=g-explicit-length-test
       a_end = end_idx
     else:
       a_end = random.choice(cut_points)
...
@@ -321,9 +325,9 @@ def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False):
   if extend_target:
     if a_end >= data_len or b_end >= data_len:
-      tf.logging.info("[_split_a_and_b] returns None: "
-                      "a_end %d or b_end %d >= data_len %d",
-                      a_end, b_end, data_len)
+      logging.info("[_split_a_and_b] returns None: "
+                   "a_end %d or b_end %d >= data_len %d",
+                   a_end, b_end, data_len)
       return None
     a_target = data[a_begin + 1: a_end + 1]
     b_target = data[b_begin: b_end + 1]
...
@@ -342,9 +346,7 @@ def _is_start_piece(piece):

 def _sample_mask(sp, seg, reverse=False, max_gram=5, goal_num_predict=None):
-  """Sample `goal_num_predict` tokens for partial prediction.
-  About `mask_beta` tokens are chosen in a context of `mask_alpha` tokens."""
+  """Samples `goal_num_predict` tokens for partial prediction."""

   seg_len = len(seg)
   mask = np.array([False] * seg_len, dtype=np.bool)
...
@@ -406,8 +408,7 @@ def _sample_mask(sp, seg, reverse=False, max_gram=5, goal_num_predict=None):

 def _sample_mask_ngram(sp, seg, reverse=False, max_gram=5,
                        goal_num_predict=None):
-  """Sample `goal_num_predict` tokens for partial prediction.
-  About `mask_beta` tokens are chosen in a context of `mask_alpha` tokens."""
+  """Sample `goal_num_predict` tokens for partial prediction."""

   seg_len = len(seg)
   mask = np.array([False] * seg_len, dtype=np.bool)
...
@@ -474,6 +475,7 @@ def _sample_mask_ngram(sp, seg, reverse=False, max_gram=5,

 def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
                      bi_data, sp):
   """Creates TFRecords."""
   data, sent_ids = data[0], data[1]

   num_core = FLAGS.num_core_per_host
...
@@ -496,7 +498,7 @@ def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
   else:
     data, sent_ids = batchify(data, bsz_per_host, sent_ids)

-  tf.logging.info("Raw data shape %s.", data.shape)
+  logging.info("Raw data shape %s.", data.shape)

   file_name = format_filename(
       prefix=basename,
...
@@ -512,7 +514,7 @@ def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
   )
   save_path = os.path.join(save_dir, file_name)
   record_writer = tf.python_io.TFRecordWriter(save_path)
-  tf.logging.info("Start writing %s.", save_path)
+  logging.info("Start writing %s.", save_path)

   num_batch = 0
   reuse_len = FLAGS.reuse_len
...
@@ -527,7 +529,7 @@ def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
   i = 0
   while i + seq_len <= data_len:
     if num_batch % 500 == 0:
-      tf.logging.info("Processing batch %d", num_batch)
+      logging.info("Processing batch %d", num_batch)

     all_ok = True
     features = []
...
@@ -542,7 +544,7 @@ def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
           tot_len=seq_len - reuse_len - 3,
           extend_target=True)
       if results is None:
-        tf.logging.info("Break out with seq idx %d", i)
+        logging.info("Break out with seq idx %d", i)
         all_ok = False
         break
...
@@ -600,7 +602,7 @@ def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
     i += reuse_len

   record_writer.close()
-  tf.logging.info("Done writing %s. Num of batches: %d", save_path, num_batch)
+  logging.info("Done writing %s. Num of batches: %d", save_path, num_batch)

   return save_path, num_batch
...
@@ -624,6 +626,7 @@ def _convert_example(example, use_bfloat16):

 def parse_files_to_dataset(parser, file_names, split, num_batch, num_hosts,
                            host_id, num_core_per_host, bsz_per_core):
   """Parses files to a dataset."""
   # list of file paths
   num_files = len(file_names)
   num_files_per_host = num_files // num_hosts
...
@@ -632,7 +635,7 @@ def parse_files_to_dataset(parser, file_names, split, num_batch, num_hosts,
   if host_id == num_hosts - 1:
     my_end_file_id = num_files
   file_paths = file_names[my_start_file_id: my_end_file_id]
-  tf.logging.info("Host %d handles %d files", host_id, len(file_paths))
+  logging.info("Host %d handles %d files", host_id, len(file_paths))

   assert split == "train"
   dataset = tf.data.Dataset.from_tensor_slices(file_paths)
...
@@ -657,9 +660,7 @@ def parse_files_to_dataset(parser, file_names, split, num_batch, num_hosts,

 def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
-  """
-  Sample a permutation of the factorization order, and create an
-  attention mask accordingly.
+  """Samples a permutation of the factorization order, and create a mask.

   Args:
     inputs: int64 Tensor in shape [seq_len], input ids.
...
@@ -669,6 +670,10 @@ def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
     perm_size: the length of longest permutation. Could be set to be reuse_len.
       Should not be larger than reuse_len or there will be data leaks.
     seq_len: int, sequence length.
+
+  Returns:
+    The permutation mask, new targets, target mask, and new inputs.
   """

   # Generate permutation indices
...
@@ -726,6 +731,7 @@ def _local_perm(inputs, targets, is_masked, perm_size, seq_len):

 def get_dataset(params, num_hosts, num_core_per_host, split, file_names,
                 num_batch, seq_len, reuse_len, perm_size, mask_alpha,
                 mask_beta, use_bfloat16=False, num_predict=None):
   """Gets the dataset."""
   bsz_per_core = params["batch_size"]
   if num_hosts > 1:
...
@@ -821,7 +827,7 @@ def get_dataset(params, num_hosts, num_core_per_host, split, file_names,
     _convert_example(example, use_bfloat16)

     for k, v in example.items():
-      tf.logging.info("%s: %s", k, v)
+      logging.info("%s: %s", k, v)

     return example
...
@@ -855,6 +861,7 @@ def get_input_fn(
     num_passes=None,
     use_bfloat16=False,
     num_predict=None):
   """Gets the input function."""
   # Merge all record infos into a single one
   record_glob_base = format_filename(
...
@@ -872,15 +879,14 @@ def get_input_fn(
   record_info = {"num_batch": 0, "filenames": []}

   tfrecord_dirs = tfrecord_dir.split(",")
-  tf.logging.info("Use the following tfrecord dirs: %s", tfrecord_dirs)
+  logging.info("Use the following tfrecord dirs: %s", tfrecord_dirs)

   for idx, record_dir in enumerate(tfrecord_dirs):
     record_glob = os.path.join(record_dir, record_glob_base)
-    tf.logging.info("[%d] Record glob: %s", idx, record_glob)
+    logging.info("[%d] Record glob: %s", idx, record_glob)

     record_paths = sorted(tf.gfile.Glob(record_glob))
-    tf.logging.info("[%d] Num of record info path: %d",
-                    idx, len(record_paths))
+    logging.info("[%d] Num of record info path: %d", idx, len(record_paths))

     cur_record_info = {"num_batch": 0, "filenames": []}
...
@@ -890,7 +896,7 @@ def get_input_fn(
       fields = record_info_name.split(".")[0].split("-")
       pass_id = int(fields[-1])
       if len(fields) == 5 and pass_id >= num_passes:
-        tf.logging.info("Skip pass %d: %s", pass_id, record_info_name)
+        logging.info("Skip pass %d: %s", pass_id, record_info_name)
         continue

       with tf.gfile.Open(record_info_path, "r") as fp:
...
@@ -912,21 +918,19 @@ def get_input_fn(
         new_filenames.append(new_filename)
       cur_record_info["filenames"] = new_filenames

-    tf.logging.info("[Dir %d] Number of chosen batches: %s",
-                    idx, cur_record_info["num_batch"])
-    tf.logging.info("[Dir %d] Number of chosen files: %s",
-                    idx, len(cur_record_info["filenames"]))
-    tf.logging.info(cur_record_info["filenames"])
+    logging.info("[Dir %d] Number of chosen batches: %s",
+                 idx, cur_record_info["num_batch"])
+    logging.info("[Dir %d] Number of chosen files: %s",
+                 idx, len(cur_record_info["filenames"]))
+    logging.info(cur_record_info["filenames"])

     # add `cur_record_info` to global `record_info`
     record_info["num_batch"] += cur_record_info["num_batch"]
     record_info["filenames"] += cur_record_info["filenames"]

-  tf.logging.info("Total number of batches: %d", record_info["num_batch"])
-  tf.logging.info("Total number of files: %d", len(record_info["filenames"]))
-  tf.logging.info(record_info["filenames"])
+  logging.info("Total number of batches: %d", record_info["num_batch"])
+  logging.info("Total number of files: %d", len(record_info["filenames"]))
+  logging.info(record_info["filenames"])

   def input_fn(params):
     """docs."""
...
@@ -952,8 +956,8 @@ def get_input_fn(
   return input_fn, record_info


-if __name__ == "__main__":
-  FLAGS = flags.FLAGS
+def define_flags():
+  """Defines relevant flags."""
   flags.DEFINE_bool("use_tpu", True, help="whether to use TPUs")
   flags.DEFINE_integer("bsz_per_host", 32, help="batch size per host.")
   flags.DEFINE_integer("num_core_per_host", 8, help="num TPU cores per host.")
...
@@ -991,5 +995,8 @@ if __name__ == "__main__":
   flags.DEFINE_integer("task", 0, help="The Task ID. This value is used when "
                        "using multiple workers to identify each worker.")

-  tf.logging.set_verbosity(tf.logging.INFO)
+if __name__ == "__main__":
+  define_flags()
+  logging.set_verbosity(logging.INFO)
   app.run(create_data)
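The `file_paths[FLAGS.task::FLAGS.num_task]` slice above assigns input files to workers in an interleaved round-robin fashion; a quick sketch with hypothetical shard names shows how the files land:

```python
file_paths = sorted("shard-%02d" % i for i in range(7))
num_task = 3

# Each task takes every num_task-th file, starting at its own index.
for task in range(num_task):
    print(task, file_paths[task::num_task])
# 0 ['shard-00', 'shard-03', 'shard-06']
# 1 ['shard-01', 'shard-04']
# 2 ['shard-02', 'shard-05']
```

Interleaving (rather than contiguous chunking) keeps per-task workloads balanced when file sizes correlate with their position in the sorted order.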
official/utils/docs/build_docs.py
...
...
@@ -17,9 +17,10 @@ r"""Tool to generate api_docs for tensorflow_models/official library.

 Example:
-python build_docs \
+$> pip install -U git+https://github.com/tensorflow/docs
+$> python build_docs \
   --output_dir=/tmp/api_docs \
-  --project_short_name=tf_nlp.modeling \
+  --project_short_name=tfnlp \
   --project_full_name="TensorFlow Official Models - NLP Modeling Library"
 """
...
@@ -34,7 +35,7 @@ from tensorflow_docs.api_generator import doc_controls
 from tensorflow_docs.api_generator import generate_lib
 from tensorflow_docs.api_generator import public_api

-from official.nlp import modeling as tf_nlp_modeling
+from official.nlp import modeling as tfnlp

 FLAGS = flags.FLAGS
...
@@ -47,18 +48,15 @@ flags.DEFINE_string(
 flags.DEFINE_bool('search_hints', True,
                   'Include metadata search hints in the generated files')

-flags.DEFINE_string('site_path', 'tf_nlp_modeling/api_docs/python',
+flags.DEFINE_string('site_path', '/api_docs/python',
                     'Path prefix in the _toc.yaml')

 flags.DEFINE_bool('gen_report', False,
                   'Generate an API report containing the health of the '
                   'docstrings of the public API.')

-flags.DEFINE_string('project_short_name', 'tf_nlp.modeling',
-                    'The project short name referring to the python module to document.')
-flags.DEFINE_string('project_full_name',
-                    'TensorFlow Official Models - NLP Modeling Library',
-                    'The main title for the project.')
+PROJECT_SHORT_NAME = 'tfnlp'
+PROJECT_FULL_NAME = 'TensorFlow Official Models - NLP Modeling Library'


 def _hide_module_model_and_layer_methods():
...
@@ -104,8 +102,8 @@ def gen_api_docs(code_url_prefix, site_path, output_dir, gen_report,
   doc_generator = generate_lib.DocGenerator(
       root_title=project_full_name,
-      py_modules=[(project_short_name, tf_nlp_modeling)],
-      base_dir=os.path.dirname(tf_nlp_modeling.__file__),
+      py_modules=[(project_short_name, tfnlp)],
+      base_dir=os.path.dirname(tfnlp.__file__),
       code_url_prefix=code_url_prefix,
       search_hints=search_hints,
       site_path=site_path,
...
@@ -126,8 +124,8 @@ def main(argv):
       site_path=FLAGS.site_path,
       output_dir=FLAGS.output_dir,
       gen_report=FLAGS.gen_report,
-      project_short_name=FLAGS.project_short_name,
-      project_full_name=FLAGS.project_full_name,
+      project_short_name=PROJECT_SHORT_NAME,
+      project_full_name=PROJECT_FULL_NAME,
       search_hints=FLAGS.search_hints)
...
...
official/vision/beta/MODEL_GARDEN.md
...
...
@@ -6,29 +6,39 @@ TF Vision model garden provides a large collection of baselines and checkpoints

 ## Image Classification

 ### ImageNet Baselines

-#### Models trained with vanilla settings:
+#### ResNet models trained with vanilla settings:
 * Models are trained from scratch with batch size 4096 and 1.6 initial learning rate.
 * Linear warmup is applied for the first 5 epochs.
 * Models trained with l2 weight regularization and ReLU activation.

 | model | resolution | epochs | Top-1 | Top-5 | download |
 | ------------ |:-------------:|--------:|--------:|---------:|---------:|
-| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | config |
-| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | config |
-| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | config |
-| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | config |
-
-#### Models trained with training features including:
-* Label smoothing 0.1.
-* Swish activation.
-
-| model | resolution | epochs | Top-1 | Top-5 | download |
-| ------------ |:-------------:| ---------:|--------:|---------:|---------:|
-| ResNet-50 | 224x224 | 200 | 78.1 | 93.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
-| ResNet-101 | 224x224 | 200 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) |
-| ResNet-152 | 224x224 | 200 | 79.4 | 94.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) |
-| ResNet-200 | 224x224 | 200 | 79.9 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet200_tpu.yaml) |
+| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
+| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
+| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) |
+| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) |
+
+#### ResNet-RS models trained with settings including:
+* ResNet-RS architectural changes and Swish activation.
+* Regularization methods including Random Augment, 4e-5 weight decay, stochastic depth, label smoothing and dropout.
+* New training methods including a 350-epoch schedule, cosine learning rate and EMA.
+* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).
+
+model | resolution | params (M) | Top-1 | Top-5 | download
+--------- | :--------: | -----: | ----: | ----: | -------:
+ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml)
+ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml)
+ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml)
+ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml)
+ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml)
+ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml)
+ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml)
+ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml)
+ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml)
+ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml)

 ## Object Detection and Instance Segmentation
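The step counts in the linked ResNet-RS configs follow directly from the recipe above (batch size 4096, 350-epoch schedule) and the ImageNet-1k train-set size of 1,281,167 images; the 5-epoch warmup is an assumption carried over from the vanilla recipe. A quick arithmetic check:

```python
num_train_images = 1_281_167   # ImageNet-1k training set.
global_batch_size = 4096
epochs = 350                   # 350-epoch schedule from the ResNet-RS recipe.
warmup_epochs = 5              # Assumed, matching the vanilla recipe above.

steps_per_epoch = num_train_images // global_batch_size
train_steps = steps_per_epoch * epochs
warmup_steps = steps_per_epoch * warmup_epochs

print(steps_per_epoch, train_steps, warmup_steps)  # -> 312 109200 1560
```

These values match the `steps_per_loop: 312`, `train_steps: 109200`, and `warmup_steps: 1560` entries in the YAML configs added by this commit.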
...
...
official/vision/beta/configs/backbones.py
...
...
@@ -30,6 +30,8 @@ class ResNet(hyperparams.Config):
   stem_type: str = 'v0'
   se_ratio: float = 0.0
   stochastic_depth_drop_rate: float = 0.0
+  resnetd_shortcut: bool = False
+  replace_stem_max_pool: bool = False


 @dataclasses.dataclass
...
...
official/vision/beta/configs/common.py
...
...
@@ -27,3 +27,11 @@ class NormActivation(hyperparams.Config):
   use_sync_bn: bool = True
   norm_momentum: float = 0.99
   norm_epsilon: float = 0.001
+
+
+@dataclasses.dataclass
+class PseudoLabelDataConfig(hyperparams.Config):
+  """Pseudo label input config for training."""
+  input_path: str = ''
+  # Per-batch ratio of pseudo-labeled to labeled data.
+  data_ratio: float = 1.0
+  file_type: str = 'tfrecord'
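The new `PseudoLabelDataConfig` above is a small config class with three fields. A self-contained sketch of the same shape, using a plain `dataclass` in place of `hyperparams.Config` (an assumption made purely for illustration):

```python
import dataclasses


@dataclasses.dataclass
class PseudoLabelDataConfig:
    """Sketch of the pseudo-label input config for training."""
    input_path: str = ''
    # Per-batch ratio of pseudo-labeled to labeled data.
    data_ratio: float = 1.0
    file_type: str = 'tfrecord'


# Hypothetical usage: mix one pseudo-labeled example per two labeled ones.
cfg = PseudoLabelDataConfig(data_ratio=0.5)
print(cfg.data_ratio, cfg.file_type)  # -> 0.5 tfrecord
```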
official/vision/beta/configs/experiments/image_classification/imagenet_resnet300_tpu.yaml → official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml
-# ResNet-300 ImageNet classification. 82.6% top-1 and 96.3% top-5 accuracy.
+# ResNet-RS-101 ImageNet classification. 80.2% top-1 accuracy.
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
 task:
   model:
     num_classes: 1001
-    input_size: [380, 380, 3]
+    input_size: [160, 160, 3]
     backbone:
       type: 'resnet'
       resnet:
-        model_id: 300
-        stem_type: 'v1'
-        se_ratio: 0.25
-        stochastic_depth_drop_rate: 0.2
+        model_id: 101
+        replace_stem_max_pool: true
+        resnetd_shortcut: true
+        se_ratio: 0.25
+        stem_type: 'v1'
+        stochastic_depth_drop_rate: 0.0
     norm_activation:
       activation: 'swish'
       norm_momentum: 0.0
       use_sync_bn: false
     dropout_rate: 0.25
   losses:
-    l2_weight_decay: 0.0001
+    l2_weight_decay: 0.00004
     one_hot: true
     label_smoothing: 0.1
   train_data:
...
@@ -24,6 +29,8 @@ task:
     is_training: true
     global_batch_size: 4096
     dtype: 'bfloat16'
+    aug_policy: 'randaug'
+    randaug_magnitude: 15
   validation_data:
     input_path: 'imagenet-2012-tfrecord/valid*'
     is_training: false
...
@@ -31,13 +38,15 @@ task:
     dtype: 'bfloat16'
     drop_remainder: false
 trainer:
-  train_steps: 62400
+  train_steps: 109200
   validation_steps: 13
   validation_interval: 312
   steps_per_loop: 312
   summary_interval: 312
   checkpoint_interval: 312
   optimizer_config:
+    ema:
+      average_decay: 0.9999
     optimizer:
       type: 'sgd'
       sgd:
...
@@ -46,7 +55,7 @@ trainer:
       type: 'cosine'
       cosine:
         initial_learning_rate: 1.6
-        decay_steps: 62400
+        decay_steps: 109200
     warmup:
       type: 'linear'
       linear:
...
...
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml
0 → 100644
# ResNet-RS-101 ImageNet classification. 81.3% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [192, 192, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 101
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.0
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml
0 → 100644
# ResNet-RS-152 ImageNet classification. 81.9% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [192, 192, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 152
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.0
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
official/vision/beta/configs/experiments/image_classification/imagenet_resnet200_tpu.yaml → official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml

@@ -1,4 +1,4 @@
-# ResNet-200 ImageNet classification. 79.9% top-1 and 94.8% top-5 accuracy.
+# ResNet-RS-152 ImageNet classification. 82.5% top-1 accuracy.
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -9,11 +9,19 @@ task:
     backbone:
       type: 'resnet'
       resnet:
-        model_id: 200
+        model_id: 152
+        replace_stem_max_pool: true
+        resnetd_shortcut: true
+        se_ratio: 0.25
+        stem_type: 'v1'
+        stochastic_depth_drop_rate: 0.0
     norm_activation:
       activation: 'swish'
+      norm_momentum: 0.0
+      use_sync_bn: false
+    dropout_rate: 0.25
   losses:
-    l2_weight_decay: 0.0001
+    l2_weight_decay: 0.00004
     one_hot: true
     label_smoothing: 0.1
   train_data:
@@ -21,6 +29,8 @@ task:
     is_training: true
     global_batch_size: 4096
     dtype: 'bfloat16'
+    aug_policy: 'randaug'
+    randaug_magnitude: 15
   validation_data:
     input_path: 'imagenet-2012-tfrecord/valid*'
     is_training: false
@@ -28,13 +38,15 @@ task:
     dtype: 'bfloat16'
     drop_remainder: false
 trainer:
-  train_steps: 62400
+  train_steps: 109200
   validation_steps: 13
   validation_interval: 312
   steps_per_loop: 312
   summary_interval: 312
   checkpoint_interval: 312
   optimizer_config:
+    ema:
+      average_decay: 0.9999
     optimizer:
       type: 'sgd'
       sgd:
@@ -43,7 +55,7 @@ trainer:
       type: 'cosine'
       cosine:
         initial_learning_rate: 1.6
-        decay_steps: 62400
+        decay_steps: 109200
     warmup:
       type: 'linear'
       linear:
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml
0 → 100644

# ResNet-RS-152 ImageNet classification. 83.1% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [256, 256, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 152
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.0
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
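Every one of these configs pairs a linear warmup with a cosine decay whose `decay_steps` equals the full `train_steps`, so the learning rate ramps to 1.6 over the first 1560 steps and then anneals to zero at step 109200. A minimal sketch of that schedule (a plain function for illustration, not the Model Garden optimization API):

```python
import math

def lr_at(step, peak_lr=1.6, warmup_steps=1560, decay_steps=109200):
    """Linear warmup into cosine decay, matching the config values (illustrative)."""
    if step < warmup_steps:
        return peak_lr * step / warmup_steps          # linear ramp from 0 to peak
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * step / decay_steps))

# 0.0 at step 0, 0.8 halfway through warmup, ~0.0 at the final step.
print(lr_at(0), lr_at(780), lr_at(109200))
```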
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml
0 → 100644

# ResNet-RS-200 ImageNet classification. 83.5% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [256, 256, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 200
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.1
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml
0 → 100644

# ResNet-RS-270 ImageNet classification. 83.6% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [256, 256, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 270
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.1
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml
0 → 100644

# ResNet-RS-350 ImageNet classification. 83.7% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [256, 256, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 350
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.1
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.25
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml
0 → 100644

# ResNet-RS-350 ImageNet classification. 84.2% top-1 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [320, 320, 3]
    backbone:
      type: 'resnet'
      resnet:
        model_id: 350
        replace_stem_max_pool: true
        resnetd_shortcut: true
        se_ratio: 0.25
        stem_type: 'v1'
        stochastic_depth_drop_rate: 0.1
    norm_activation:
      activation: 'swish'
      norm_momentum: 0.0
      use_sync_bn: false
    dropout_rate: 0.4
  losses:
    l2_weight_decay: 0.00004
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
    aug_policy: 'randaug'
    randaug_magnitude: 15
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 109200
  validation_steps: 13
  validation_interval: 312
  steps_per_loop: 312
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    ema:
      average_decay: 0.9999
    optimizer:
      type: 'sgd'
      sgd:
        momentum: 0.9
    learning_rate:
      type: 'cosine'
      cosine:
        initial_learning_rate: 1.6
        decay_steps: 109200
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
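The shared peak rate of 1.6 is consistent with the common linear-scaling heuristic for large-batch SGD (a base rate of 0.1 per 256 images, scaled by the global batch size); that convention is an assumption here, not something the configs state:

```python
# Linear learning-rate scaling heuristic (assumed, not stated in the configs).
base_lr, base_batch_size = 0.1, 256
global_batch_size = 4096  # train_data.global_batch_size
peak_lr = base_lr * global_batch_size / base_batch_size
print(peak_lr)  # ~1.6, matching cosine.initial_learning_rate
```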