Commit 5b6171ce authored by A. Unique TensorFlower

Release TF-NLP's generic export_tfhub tool with preprocessing support as open source.

A user guide will follow soon.

PiperOrigin-RevId: 359244402
parent 834ca16d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Exports a BERT-like encoder and its preprocessing as SavedModels for TF Hub.
This tool creates preprocessor and encoder SavedModels suitable for uploading
to https://tfhub.dev that implement the preprocessor and encoder APIs defined
at https://www.tensorflow.org/hub/common_saved_model_apis/text.
Minimal usage examples:
1) Exporting an Encoder from checkpoint and config.
```
export_tfhub \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
An --encoder_config_file can specify encoder types other than BERT.
For BERT, a --bert_config_file in the legacy JSON format can be passed instead.
Flags --vocab_file and --do_lower_case (whose default value is guessed from
the vocab_file path) capture how BertTokenizer was used in pre-training.
Use flag --sp_model_file instead if SentencepieceTokenizer was used.
Changing --export_type to model_with_mlm additionally creates an `.mlm`
subobject on the exported SavedModel that can be called to produce
the logits of the Masked Language Model task from pretraining.
The help string for flag --model_checkpoint_path explains the checkpoint
formats required for each --export_type.
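For illustration, an encoder exported with --export_type=model_with_mlm could be
loaded and called roughly as follows (a sketch; the path and tensor values are
placeholders):
```
import tensorflow as tf
import tensorflow_hub as hub

encoder = hub.load("/tmp/bert_model")  # Wherever --export_path pointed.
encoder_inputs = dict(
    input_word_ids=tf.zeros([2, 128], tf.int32),  # Padded token ids.
    input_mask=tf.zeros([2, 128], tf.int32),      # 1 for real tokens, 0 for padding.
    input_type_ids=tf.zeros([2, 128], tf.int32))  # Segment ids.
pooled_output = encoder(encoder_inputs)["pooled_output"]
# Only with --export_type=model_with_mlm: logits of the masked LM task.
mlm_inputs = dict(encoder_inputs,
                  masked_lm_positions=tf.zeros([2, 4], tf.int32))
mlm_logits = encoder.mlm(mlm_inputs)["mlm_logits"]
```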
2) Exporting a preprocessor SavedModel
```
export_tfhub \
--vocab_file ${BERT_DIR:?}/vocab.txt \
--export_type preprocessing --export_path /tmp/bert_preprocessing
```
Be sure to use flag values that match the encoder and how it has been
pre-trained (see above for --vocab_file vs --sp_model_file).
If your encoder has been trained with text preprocessing for which tfhub.dev
already has a SavedModel, you could guide your users to reuse that one instead
of exporting and publishing your own.
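As a sketch of how the exported preprocessor and encoder are meant to be used
together (paths are placeholders for wherever the SavedModels were written):
```
import tensorflow as tf
import tensorflow_hub as hub

preprocess = hub.load("/tmp/bert_preprocessing")
encoder = hub.load("/tmp/bert_model")
sentences = tf.constant(["Hello TF Hub.", "A second sentence."])
encoder_inputs = preprocess(sentences)  # Dict of padded int32 ids and masks.
embeddings = encoder(encoder_inputs)["pooled_output"]
```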
TODO(b/175369555): When exporting to users of TensorFlow 2.4, add flag
`--experimental_disable_assert_in_preprocessing`.
"""
from absl import app
from absl import flags
import gin
from official.modeling import hyperparams
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.tools import export_tfhub_lib
FLAGS = flags.FLAGS
flags.DEFINE_enum(
"export_type", "model",
["model", "model_with_mlm", "preprocessing"],
"The overall type of SavedModel to export. Flags "
"--bert_config_file/--encoder_config_file and --vocab_file/--sp_model_file "
"control which particular encoder model and preprocessing are exported.")
flags.DEFINE_string(
"export_path", None,
"Directory to which the SavedModel is written.")
flags.DEFINE_string(
"encoder_config_file", None,
"A yaml file representing `encoders.EncoderConfig` to define the encoder "
"(BERT or other). "
"Exactly one of --bert_config_file and --encoder_config_file can be set. "
"Needed for --export_type model and model_with_mlm.")
flags.DEFINE_string(
"bert_config_file", None,
"A JSON file with a legacy BERT configuration to define the BERT encoder. "
"Exactly one of --bert_config_file and --encoder_config_file can be set. "
"Needed for --export_type model and model_with_mlm.")
flags.DEFINE_bool(
"copy_pooler_dense_to_encoder", False,
"When the model is trained using `BertPretrainerV2`, the pooling layer "
"of the next sentence prediction task exists in the `ClassificationHead` passed "
"to `BertPretrainerV2`. If True, we will copy this pooler's dense layer "
"to the encoder that is exported by this tool (as in classic BERT). "
"Using `BertPretrainerV2` and leaving this False exports an untrained "
"(randomly initialized) pooling layer, which some authors recommend for "
"subsequent fine-tuning.")
flags.DEFINE_string(
"model_checkpoint_path", None,
"File path to a pre-trained model checkpoint. "
"For --export_type model, this has to be an object-based (TF2) checkpoint "
"that can be restored to `tf.train.Checkpoint(encoder=encoder)` "
"for the `encoder` defined by the config file. "
"(Legacy checkpoints with `model=` instead of `encoder=` are also "
"supported for now.) "
"For --export_type model_with_mlm, it must be restorable to "
"`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)`. "
"(For now, `tf.train.Checkpoint(pretrainer=BertPretrainerV2(...))` is also "
"accepted.)")
flags.DEFINE_string(
"vocab_file", None,
"For encoders trained on BertTokenizer input: "
"the vocabulary file that the encoder model was trained with. "
"Exactly one of --vocab_file and --sp_model_file can be set. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_string(
"sp_model_file", None,
"For encoders trained on SentencepieceTokenizer input: "
"the SentencePiece .model file that the encoder model was trained with. "
"Exactly one of --vocab_file and --sp_model_file can be set. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_bool(
"do_lower_case", None,
"Whether to lowercase before tokenization. "
"If left as None, and --vocab_file is set, do_lower_case will be enabled "
"if 'uncased' appears in the name of --vocab_file. "
"If left as None, and --sp_model_file is set, do_lower_case defaults to true. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_integer(
"default_seq_length", 128,
"The sequence length of preprocessing results from "
"the top-level preprocess method. This is also the default "
"sequence length for the bert_pack_inputs subobject. "
"Needed for --export_type preprocessing.")
flags.DEFINE_bool(
"tokenize_with_offsets", False, # Broken by b/149576200.
"Whether to export a .tokenize_with_offsets subobject for "
"--export_type preprocessing.")
flags.DEFINE_multi_string(
"gin_file", default=None,
help="List of paths to the config files.")
flags.DEFINE_multi_string(
"gin_params", default=None,
help="List of Gin bindings.")
flags.DEFINE_bool( # TODO(b/175369555): Remove this flag and its use.
"experimental_disable_assert_in_preprocessing", False,
"Export a preprocessing model without tf.Assert ops. "
"Usually, that would be a bad idea, except TF2.4 has an issue with "
"Assert ops in tf.functions used in Dataset.map() on a TPU worker, "
"and omitting the Assert ops lets SavedModels avoid the issue.")
def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
if bool(FLAGS.vocab_file) == bool(FLAGS.sp_model_file):
raise ValueError("Exactly one of `vocab_file` and `sp_model_file` "
"can be specified, but got %s and %s." %
(FLAGS.vocab_file, FLAGS.sp_model_file))
do_lower_case = export_tfhub_lib.get_do_lower_case(
FLAGS.do_lower_case, FLAGS.vocab_file, FLAGS.sp_model_file)
if FLAGS.export_type in ("model", "model_with_mlm"):
if bool(FLAGS.bert_config_file) == bool(FLAGS.encoder_config_file):
raise ValueError("Exactly one of `bert_config_file` and "
"`encoder_config_file` can be specified, but got "
"%s and %s." %
(FLAGS.bert_config_file, FLAGS.encoder_config_file))
if FLAGS.bert_config_file:
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
encoder_config = None
else:
bert_config = None
encoder_config = encoders.EncoderConfig()
encoder_config = hyperparams.override_params_dict(
encoder_config, FLAGS.encoder_config_file, is_strict=True)
export_tfhub_lib.export_model(
FLAGS.export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=FLAGS.model_checkpoint_path,
vocab_file=FLAGS.vocab_file,
sp_model_file=FLAGS.sp_model_file,
do_lower_case=do_lower_case,
with_mlm=FLAGS.export_type == "model_with_mlm",
copy_pooler_dense_to_encoder=FLAGS.copy_pooler_dense_to_encoder)
elif FLAGS.export_type == "preprocessing":
export_tfhub_lib.export_preprocessing(
FLAGS.export_path,
vocab_file=FLAGS.vocab_file,
sp_model_file=FLAGS.sp_model_file,
do_lower_case=do_lower_case,
default_seq_length=FLAGS.default_seq_length,
tokenize_with_offsets=FLAGS.tokenize_with_offsets,
experimental_disable_assert=
FLAGS.experimental_disable_assert_in_preprocessing)
else:
raise app.UsageError(
"Unknown value '%s' for flag --export_type" % FLAGS.export_type)
if __name__ == "__main__":
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library of components of export_tfhub.py. See docstring there for more."""
import contextlib
import hashlib
import os
import tempfile
from typing import Optional, Text, Tuple
# Import libraries
from absl import logging
import tensorflow as tf
# pylint: disable=g-direct-tensorflow-import TODO(b/175369555): Remove these.
from tensorflow.core.protobuf import saved_model_pb2
from tensorflow.python.ops import control_flow_ops
# pylint: enable=g-direct-tensorflow-import
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.modeling import networks
def get_bert_encoder(bert_config):
"""Returns a BertEncoder with dict outputs."""
bert_encoder = networks.BertEncoder(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
num_layers=bert_config.num_hidden_layers,
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
embedding_width=bert_config.embedding_size,
dict_outputs=True)
return bert_encoder
def get_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
"""Returns do_lower_case, replacing None by a guess from vocab file name."""
if do_lower_case is not None:
return do_lower_case
elif vocab_file:
do_lower_case = "uncased" in vocab_file
logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
do_lower_case, vocab_file)
return do_lower_case
elif sp_model_file:
do_lower_case = True # All public ALBERTs (as of Oct 2020) do it.
logging.info("Defaulting to do_lower_case=%s for Sentencepiece tokenizer",
do_lower_case)
return do_lower_case
else:
raise ValueError("Must set vocab_file or sp_model_file.")
def _create_model(
*,
bert_config: Optional[configs.BertConfig] = None,
encoder_config: Optional[encoders.EncoderConfig] = None,
with_mlm: bool,
) -> Tuple[tf.keras.Model, tf.keras.Model]:
"""Creates the model to export and the model to restore the checkpoint.
Args:
bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
Exactly one of encoder_config and bert_config must be set.
encoder_config: An `EncoderConfig` to create an encoder of the configured
type (`BertEncoder` or other).
with_mlm: A bool to control the second component of the result.
If True, will create a `BertPretrainerV2` object; otherwise, will
create a `BertEncoder` object.
Returns:
A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
object or `BertEncoder` object depending on the value of `with_mlm`
argument, which contains the first model and will be used for restoring
weights from the checkpoint.
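A minimal usage sketch (the config values here are arbitrary placeholders):
```
core_model, pretrainer = _create_model(
    bert_config=configs.BertConfig(vocab_size=30522), with_mlm=True)
```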
"""
if (bert_config is not None) == (encoder_config is not None):
raise ValueError("Exactly one of `bert_config` and `encoder_config` "
"can be specified, but got %s and %s" %
(bert_config, encoder_config))
if bert_config is not None:
encoder = get_bert_encoder(bert_config)
else:
encoder = encoders.build_encoder(encoder_config)
# Convert from list of named inputs to dict of inputs keyed by name.
# Only the latter accepts a dict of inputs after restoring from SavedModel.
encoder_inputs_dict = {x.name: x for x in encoder.inputs}
encoder_output_dict = encoder(encoder_inputs_dict)
# For interchangeability with other text representations,
# add "default" as an alias for BERT's whole-input representations.
encoder_output_dict["default"] = encoder_output_dict["pooled_output"]
core_model = tf.keras.Model(
inputs=encoder_inputs_dict, outputs=encoder_output_dict)
if with_mlm:
if bert_config is not None:
hidden_act = bert_config.hidden_act
else:
assert encoder_config is not None
hidden_act = encoder_config.get().hidden_activation
pretrainer = models.BertPretrainerV2(
encoder_network=encoder,
mlm_activation=tf_utils.get_activation(hidden_act))
pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs}
pretrainer_output_dict = pretrainer(pretrainer_inputs_dict)
mlm_model = tf.keras.Model(
inputs=pretrainer_inputs_dict, outputs=pretrainer_output_dict)
# Set `_auto_track_sub_layers` to False, so that the additional weights
# from `mlm` sub-object will not be included in the core model.
# TODO(b/169210253): Use a public API when available.
core_model._auto_track_sub_layers = False # pylint: disable=protected-access
core_model.mlm = mlm_model
return core_model, pretrainer
else:
return core_model, encoder
def export_model(export_path: Text,
*,
bert_config: Optional[configs.BertConfig] = None,
encoder_config: Optional[encoders.EncoderConfig] = None,
model_checkpoint_path: Text,
with_mlm: bool,
copy_pooler_dense_to_encoder: bool = False,
vocab_file: Optional[Text] = None,
sp_model_file: Optional[Text] = None,
do_lower_case: Optional[bool] = None) -> None:
"""Exports an Encoder as SavedModel after restoring pre-trained weights.
The exported SavedModel implements a superset of the Encoder API for
Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
In particular, the exported SavedModel can be used in the following way:
```
# Calls default interface (encoder only).
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models).
"sequence_output", # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
}
```
If `with_mlm` is True, the exported SavedModel can also be called in the
following way:
```
# Calls expanded interface that includes logits of the Masked Language Model.
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
Args:
export_path: The SavedModel output directory.
bert_config: An optional `configs.BertConfig` object. Note: exactly one of
`bert_config` and `encoder_config` must be specified.
encoder_config: An optional `encoders.EncoderConfig` object.
model_checkpoint_path: The path to the checkpoint.
with_mlm: Whether to export the additional mlm sub-object.
copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer
used in the next sentence prediction task to the encoder.
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
do_lower_case: Whether to lower-case text before tokenization.
"""
if with_mlm:
core_model, pretrainer = _create_model(bert_config=bert_config,
encoder_config=encoder_config,
with_mlm=with_mlm)
encoder = pretrainer.encoder_network
# Support the official way to checkpoint a pretrainer.
checkpoint_items = pretrainer.checkpoint_items
# Keep supporting the ad-hoc way from Oct 2020 that is used
# in several important converted checkpoints (original BERT, SmallBERTs).
checkpoint_items["pretrainer"] = pretrainer
checkpoint = tf.train.Checkpoint(**checkpoint_items)
else:
core_model, encoder = _create_model(bert_config=bert_config,
encoder_config=encoder_config,
with_mlm=with_mlm)
checkpoint = tf.train.Checkpoint(
model=encoder, # Legacy checkpoints.
encoder=encoder)
checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
if copy_pooler_dense_to_encoder:
logging.info("Copy pooler's dense layer to the encoder.")
pooler_checkpoint = tf.train.Checkpoint(
**{"next_sentence.pooler_dense": encoder.pooler_layer})
pooler_checkpoint.restore(
model_checkpoint_path).assert_existing_objects_matched()
# Before SavedModels for preprocessing appeared in Oct 2020, the encoders
# provided this information to let users do preprocessing themselves.
# We keep doing that for now. It helps users to upgrade incrementally.
# Moreover, it offers an escape hatch for advanced users who want the
# full vocab, not the high-level operations from the preprocessing model.
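# For illustration, a user of the exported SavedModel could read these back
# roughly like so (a sketch; `export_path` as passed to this function):
#   obj = tf.saved_model.load(export_path)
#   vocab_path = obj.vocab_file.asset_path      # Only if a vocab_file was set.
#   lower = bool(obj.do_lower_case.numpy())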
if vocab_file:
core_model.vocab_file = tf.saved_model.Asset(vocab_file)
if do_lower_case is None:
raise ValueError("Must pass do_lower_case if passing vocab_file.")
core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
elif sp_model_file:
# This was used by ALBERT, with implied values of do_lower_case=True
# and strip_diacritics=True.
core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
else:
raise ValueError("Must set vocab_file or sp_model_file")
core_model.save(export_path, include_optimizer=False, save_format="tf")
class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
"""Wraps a BertPackInputs layer for export to SavedModel.
The wrapper object is suitable for use with `tf.saved_model.save()` and
`.load()`. The wrapper object is callable with inputs and outputs like the
BertPackInputs layer, but differs from saving an unwrapped Keras object:
- The inputs can be a list of 1 or 2 RaggedTensors of dtype int32 and
ragged rank 1 or 2. (In Keras, saving to a tf.function in a SavedModel
would fix the number of RaggedTensors and their ragged rank.)
- The call accepts an optional keyword argument `seq_length=` to override
the layer's .seq_length hyperparameter. (In Keras, a hyperparameter
could not be changed after saving to a tf.function in a SavedModel.)
"""
def __init__(self, bert_pack_inputs: layers.BertPackInputs):
super().__init__()
# Preserve the layer's configured seq_length as a default but make it
# overridable. Having this dynamically determined default argument
# requires self.__call__ to be defined in this indirect way.
default_seq_length = bert_pack_inputs.seq_length
@tf.function(autograph=False)
def call(inputs, seq_length=default_seq_length):
return layers.BertPackInputs.bert_pack_inputs(
inputs, seq_length=seq_length,
start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
end_of_segment_id=bert_pack_inputs.end_of_segment_id,
padding_id=bert_pack_inputs.padding_id)
self.__call__ = call
for ragged_rank in range(1, 3):
for num_segments in range(1, 3):
_ = self.__call__.get_concrete_function(
[tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
for _ in range(num_segments)],
seq_length=tf.TensorSpec([], tf.int32))
def create_preprocessing(*,
vocab_file: Optional[str] = None,
sp_model_file: Optional[str] = None,
do_lower_case: bool,
tokenize_with_offsets: bool,
default_seq_length: int) -> tf.keras.Model:
"""Returns a preprocessing Model for given tokenization parameters.
This function builds a Keras Model with attached subobjects suitable for
saving to a SavedModel. The resulting SavedModel implements the Preprocessor
API for Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
Args:
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
This determines the type of tokenizer that is used.
do_lower_case: Whether to lowercase the text before tokenization.
tokenize_with_offsets: Whether to include the .tokenize_with_offsets
subobject.
default_seq_length: The sequence length of preprocessing results from the
root callable. This is also the default sequence length for the
bert_pack_inputs subobject.
Returns:
A tf.keras.Model object with several attached subobjects, suitable for
saving as a preprocessing SavedModel.
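A rough usage sketch, assuming the returned model has been saved and reloaded
(the path is a placeholder; token ids depend on the actual vocabulary):
```
import tensorflow as tf

preprocess = tf.saved_model.load("/tmp/bert_preprocessing")
sentences = tf.constant(["abc def", "ghi"])
encoder_inputs = preprocess(sentences)   # Padded to default_seq_length.
tokens = preprocess.tokenize(sentences)  # RaggedTensor of token ids.
pair_inputs = preprocess.bert_pack_inputs(
    [tokens, preprocess.tokenize(tf.constant(["jkl", "mno"]))], seq_length=64)
```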
"""
# Select tokenizer.
if bool(vocab_file) == bool(sp_model_file):
raise ValueError("Must set exactly one of vocab_file, sp_model_file")
if vocab_file:
tokenize = layers.BertTokenizer(
vocab_file=vocab_file,
lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets)
else:
tokenize = layers.SentencepieceTokenizer(
model_file_path=sp_model_file,
lower_case=do_lower_case,
strip_diacritics=True, # Strip diacritics to follow ALBERT model.
tokenize_with_offsets=tokenize_with_offsets)
# The root object of the preprocessing model can be called to do
# one-shot preprocessing for users with single-sentence inputs.
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
if tokenize_with_offsets:
tokens, start_offsets, limit_offsets = tokenize(sentences)
else:
tokens = tokenize(sentences)
pack = layers.BertPackInputs(
seq_length=default_seq_length,
special_tokens_dict=tokenize.get_special_tokens_dict())
model_inputs = pack(tokens)
preprocessing = tf.keras.Model(sentences, model_inputs)
# Individual steps of preprocessing are made available as named subobjects
# to enable more general preprocessing. For saving, they need to be Models
# in their own right.
preprocessing.tokenize = tf.keras.Model(sentences, tokens)
# Provide an equivalent to tokenize.get_special_tokens_dict().
preprocessing.tokenize.get_special_tokens_dict = tf.train.Checkpoint()
preprocessing.tokenize.get_special_tokens_dict.__call__ = tf.function(
lambda: tokenize.get_special_tokens_dict(), # pylint: disable=unnecessary-lambda
input_signature=[])
if tokenize_with_offsets:
preprocessing.tokenize_with_offsets = tf.keras.Model(
sentences, [tokens, start_offsets, limit_offsets])
preprocessing.tokenize_with_offsets.get_special_tokens_dict = (
preprocessing.tokenize.get_special_tokens_dict)
# Conceptually, this should be
# preprocessing.bert_pack_inputs = tf.keras.Model(tokens, model_inputs)
# but technicalities require us to use a wrapper (see comments there).
# In particular, seq_length can be overridden when calling this.
preprocessing.bert_pack_inputs = BertPackInputsSavedModelWrapper(pack)
return preprocessing
def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
"""Returns a copy of file_path under tmpdir, in a subdir hashed from its dir."""
if file_path is None: return None
olddir, filename = os.path.split(file_path)
hasher = hashlib.sha1()
hasher.update(olddir.encode("utf-8"))
target_dir = os.path.join(tmpdir, hasher.hexdigest())
target_file = os.path.join(target_dir, filename)
tf.io.gfile.mkdir(target_dir)
tf.io.gfile.copy(file_path, target_file)
return target_file
def export_preprocessing(export_path: Text,
*,
vocab_file: Optional[Text] = None,
sp_model_file: Optional[Text] = None,
do_lower_case: bool,
tokenize_with_offsets: bool,
default_seq_length: int,
experimental_disable_assert: bool = False) -> None:
"""Exports preprocessing to a SavedModel for TF Hub."""
with tempfile.TemporaryDirectory() as tmpdir:
# TODO(b/175369555): Remove experimental_disable_assert and its use.
with _maybe_disable_assert(experimental_disable_assert):
preprocessing = create_preprocessing(
vocab_file=_move_to_tmpdir(vocab_file, tmpdir),
sp_model_file=_move_to_tmpdir(sp_model_file, tmpdir),
do_lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets,
default_seq_length=default_seq_length)
preprocessing.save(export_path, include_optimizer=False, save_format="tf")
if experimental_disable_assert:
_check_no_assert(export_path)
# This check helps the unit test guard against stray copies of the vocab file.
if tf.io.gfile.exists(tmpdir):
raise IOError("Failed to clean up TemporaryDirectory")
# TODO(b/175369555): Remove all workarounds for this bug of TensorFlow 2.4
# when this bug is no longer a concern for publishing new models.
# TensorFlow 2.4 has a placement issue with Assert ops in tf.functions called
# from Dataset.map() on a TPU worker. They end up on the TPU coordinator,
# and invoking them from the TPU worker is either inefficient (when possible)
# or impossible (notably when using "headless" TPU workers on Cloud that do not
# have a channel to the coordinator). The bug has been fixed in time for TF 2.5.
# To work around this, the following code avoids Assert ops in the exported
# SavedModels. It monkey-patches calls to tf.Assert from inside TensorFlow and
# replaces them by a no-op while building the exported model. This is fragile,
# so _check_no_assert() validates the result. The resulting model should be fine
# to read on future versions of TF, even if this workaround at export time
# may break eventually. (Failing unit tests will tell.)
def _dont_assert(condition, data, summarize=None, name="Assert"):
"""The no-op version of tf.Assert installed by _maybe_disable_assert."""
del condition, data, summarize # Unused.
if tf.executing_eagerly():
return
with tf.name_scope(name):
return tf.no_op(name="dont_assert")
@contextlib.contextmanager
def _maybe_disable_assert(disable_assert):
"""Scoped monkey patch of control_flow_ops.Assert to a no-op."""
if not disable_assert:
yield
return
original_assert = control_flow_ops.Assert
control_flow_ops.Assert = _dont_assert
yield
control_flow_ops.Assert = original_assert
def _check_no_assert(saved_model_path):
"""Raises AssertionError if SavedModel contains Assert ops."""
saved_model_filename = os.path.join(saved_model_path, "saved_model.pb")
with tf.io.gfile.GFile(saved_model_filename, "rb") as f:
saved_model = saved_model_pb2.SavedModel.FromString(f.read())
assert_nodes = []
graph_def = saved_model.meta_graphs[0].graph_def
assert_nodes += ["node '{}' in global graph".format(n.name)
for n in graph_def.node if n.op == "Assert"]
for fdef in graph_def.library.function:
assert_nodes += [
"node '{}' in function '{}'".format(n.name, fdef.signature.name)
for n in fdef.node_def if n.op == "Assert"]
if assert_nodes:
raise AssertionError(
"Internal tool error: "
"failed to suppress {} Assert ops in SavedModel:\n{}".format(
len(assert_nodes), "\n".join(assert_nodes[:10])))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests export_tfhub_lib."""
import os
import tempfile
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sentencepiece import SentencePieceTrainer
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
num_hidden_layers):
"""Returns config args for export_tfhub_lib._create_model()."""
if use_bert_config:
bert_config = configs.BertConfig(
vocab_size=100,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=num_hidden_layers)
encoder_config = None
else:
bert_config = None
encoder_config = encoders.EncoderConfig(
type="albert",
albert=encoders.AlbertEncoderConfig(
vocab_size=100,
embedding_width=16,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_layers=num_hidden_layers,
dropout_rate=0.1))
return bert_config, encoder_config
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
"""Returns tokenizer asset args for export_tfhub_lib.export_model()."""
dummy_file = os.path.join(temp_dir, "dummy_file.txt")
with tf.io.gfile.GFile(dummy_file, "w") as f:
f.write("dummy content")
if use_sp_model:
vocab_file, sp_model_file = None, dummy_file
else:
vocab_file, sp_model_file = dummy_file, None
return vocab_file, sp_model_file
def _read_asset(asset: tf.saved_model.Asset):
return tf.io.gfile.GFile(asset.asset_path.numpy()).read()
def _find_lambda_layers(layer):
"""Returns list of all Lambda layers in a Keras model."""
if isinstance(layer, tf.keras.layers.Lambda):
return [layer]
elif hasattr(layer, "layers"): # It's nested, like a Model.
result = []
for l in layer.layers:
result += _find_lambda_layers(l)
return result
else:
return []
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
"""Tests exporting a Transformer Encoder model as a SavedModel.
This covers export from an Encoder checkpoint to a SavedModel without
the .mlm subobject. This is no longer preferred, but still useful
for models like Electra that are trained without the MLM task.
The export code is generic. This test focuses on two main cases
(the most important ones in practice when this was written in 2020):
- BERT built from a legacy BertConfig, for use with BertTokenizer.
- ALBERT built from an EncoderConfig (as a representative of all other
choices beyond BERT), for use with SentencepieceTokenizer (the one
alternative to BertTokenizer).
"""
@parameterized.named_parameters(("Bert", True), ("Albert", False))
def test_export_model(self, use_bert):
# Create the encoder and export it.
hidden_size = 16
num_hidden_layers = 1
bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers)
bert_model, encoder = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
self.assertEmpty(
_find_lambda_layers(bert_model),
"Lambda layers are non-portable since they serialize Python bytecode.")
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(encoder=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=not use_bert)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=model_checkpoint_path,
with_mlm=False,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restore the exported model.
hub_layer = hub.KerasLayer(export_path, trainable=True)
# Check legacy tokenization data.
if use_bert:
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.vocab_file))
self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
else:
self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.sp_model_file))
# Check restored weights.
self.assertEqual(len(bert_model.trainable_weights),
len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
# Check computation.
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_output = hub_layer(input_dict)
source_output = bert_model(input_dict)
encoder_output = encoder(input_dict)
self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
self.assertEqual(hub_output["sequence_output"].shape,
(2, seq_length, hidden_size))
self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)
for key in ("pooled_output", "sequence_output", "encoder_outputs"):
self.assertAllClose(source_output[key], hub_output[key])
self.assertAllClose(source_output[key], encoder_output[key])
# The "default" output of BERT as a text representation is pooled_output.
self.assertAllClose(hub_output["pooled_output"], hub_output["default"])
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids))
outputs = np.concatenate([
hub_layer(input_dict, training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_dict = dict(
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids)
output_dict = hub_layer(input_dict)
pooled_output = output_dict["pooled_output"]
sequence_output = output_dict["sequence_output"]
encoder_outputs = output_dict["encoder_outputs"]
self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
self.assertEqual(sequence_output.shape.as_list(),
[None, seq_length, hidden_size])
self.assertLen(encoder_outputs, num_hidden_layers)
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
"""Tests exporting a Transformer Encoder model as a SavedModel.
This covers export from a Pretrainer checkpoint to a SavedModel including
the .mlm subobject, which is the preferred way since 2020.
The export code is generic. This test focuses on two main cases
(the most important ones in practice when this was written in 2020):
- BERT built from a legacy BertConfig, for use with BertTokenizer.
- ALBERT built from an EncoderConfig (as a representative of all other
choices beyond BERT), for use with SentencepieceTokenizer (the one
alternative to BertTokenizer).
"""
def test_copy_pooler_dense_to_encoder(self):
encoder_config = encoders.EncoderConfig(
type="bert",
bert=encoders.BertEncoderConfig(
hidden_size=24, intermediate_size=48, num_layers=2))
cls_heads = [
layers.ClassificationHead(
inner_dim=24, num_classes=2, name="next_sentence")
]
encoder = encoders.build_encoder(encoder_config)
pretrainer = models.BertPretrainerV2(
encoder_network=encoder,
classification_heads=cls_heads,
mlm_activation=tf_utils.get_activation(
encoder_config.get().hidden_activation))
# Makes sure the pretrainer variables are created.
_ = pretrainer(pretrainer.inputs)
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=True)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
encoder_config=encoder_config,
model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
with_mlm=True,
copy_pooler_dense_to_encoder=True,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(export_path, trainable=True)
dummy_ids = np.zeros((2, 10), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_pooled_output = hub_layer(input_dict)["pooled_output"]
encoder_outputs = encoder(input_dict)
# Verify that hub_layer's pooled_output is the same as the output of next
# sentence prediction's dense layer.
pretrained_pooled_output = cls_heads[0].dense(
(encoder_outputs["sequence_output"][:, 0, :]))
self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
# But the encoder's own pooled_output differs from the hub_layer's pooled_output.
encoder_pooled_output = encoder_outputs["pooled_output"]
self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
@parameterized.named_parameters(
("Bert", True, False),
("BertLegacyCheckpoint", True, True),
("Albert", False, False),
("AlbertLegacyCheckpoint", False, True),
)
def test_export_model_with_mlm(self, use_bert, legacy_checkpoint):
# Create the encoder and export it.
hidden_size = 16
num_hidden_layers = 2
bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers)
bert_model, pretrainer = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
self.assertEmpty(
_find_lambda_layers(bert_model),
"Lambda layers are non-portable since they serialize Python bytecode.")
bert_model_with_mlm = bert_model.mlm
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
if legacy_checkpoint:
checkpoint = tf.train.Checkpoint(pretrainer=pretrainer)
else:
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=not use_bert)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=model_checkpoint_path,
with_mlm=True,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restore the exported model.
hub_layer = hub.KerasLayer(export_path, trainable=True)
# Check legacy tokenization data.
if use_bert:
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.vocab_file))
self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
else:
self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.sp_model_file))
# Check restored weights.
# Note that we set `_auto_track_sub_layers` to False when exporting the
# SavedModel, so hub_layer has the same number of weights as bert_model;
# otherwise, hub_layer will have extra weights from its `mlm` subobject.
self.assertEqual(len(bert_model.trainable_weights),
len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight, hub_weight)
# Check computation.
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_outputs_dict = hub_layer(input_dict)
source_outputs_dict = bert_model(input_dict)
encoder_outputs_dict = pretrainer.encoder_network(
[dummy_ids, dummy_ids, dummy_ids])
self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
self.assertEqual(hub_outputs_dict["sequence_output"].shape,
(2, seq_length, hidden_size))
for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
self.assertAllClose(source_outputs_dict[output_key],
hub_outputs_dict[output_key])
self.assertAllClose(source_outputs_dict[output_key],
encoder_outputs_dict[output_key])
# The "default" output of BERT as a text representation is pooled_output.
self.assertAllClose(hub_outputs_dict["pooled_output"],
hub_outputs_dict["default"])
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids))
outputs = np.concatenate([
hub_layer(input_dict, training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Checks sub-object `mlm`.
self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))
self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
len(bert_model_with_mlm.trainable_weights))
self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
len(pretrainer.trainable_weights))
for source_weight, hub_weight, pretrainer_weight in zip(
bert_model_with_mlm.trainable_weights,
hub_layer.resolved_object.mlm.trainable_variables,
pretrainer.trainable_weights):
self.assertAllClose(source_weight, hub_weight)
self.assertAllClose(source_weight, pretrainer_weight)
max_predictions_per_seq = 4
mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids,
masked_lm_positions=mlm_positions)
hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
for output_key in ("pooled_output", "sequence_output", "mlm_logits",
"encoder_outputs"):
self.assertAllClose(hub_mlm_outputs_dict[output_key],
source_mlm_outputs_dict[output_key])
pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
pretrainer_mlm_logits_output)
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev_mlm(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids),
masked_lm_positions=mlm_position_ids)
outputs = np.concatenate([
hub_layer.resolved_object.mlm(input_dict,
training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_dict = dict(
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids)
hub_outputs_dict = hub_layer(input_dict)
self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
[None, hidden_size])
self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
[None, seq_length, hidden_size])
_STRING_NOT_TO_LEAK = "private_path_component_"
class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _make_vocab_file(self, vocab, filename="vocab.txt"):
"""Creates wordpiece vocab file with given words plus special tokens.
The tokens of the resulting model are, in this order:
[PAD], [UNK], [CLS], [SEP], ...vocab...
This function also accepts wordpieces that start with the ## continuation
marker, but avoiding those makes this function interchangeable with
_make_sp_model_file(), up to the extra dimension returned by BertTokenizer.
Args:
vocab: a list of strings with the words or wordpieces to put into the
model's vocabulary. Do not include special tokens here.
filename: Optionally, a filename (relative to the temporary directory
created by this function).
Returns:
The absolute filename of the created vocab file.
"""
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab
path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time.
prefix=_STRING_NOT_TO_LEAK),
filename)
with tf.io.gfile.GFile(path, "w") as f:
f.write("\n".join(full_vocab + [""]))
return path
def _make_sp_model_file(self, vocab, prefix="spm"):
"""Creates Sentencepiece word model with given words plus special tokens.
The tokens of the resulting model are, in this order:
<pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>
The words in the input vocab are plain text, without the whitespace marker.
That makes this function interchangeable with _make_vocab_file().
Args:
vocab: a list of strings with the words to put into the model's
vocabulary. Do not include special tokens here.
prefix: an optional string, to change the filename prefix for the model
(relative to the temporary directory created by this function).
Returns:
The absolute filename of the created Sentencepiece model file.
"""
model_prefix = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir()), # New subdir each time.
prefix)
input_file = model_prefix + "_train_input.txt"
# Create input text for training the sp model from the tokens provided.
# Repeat earlier tokens more often, because SentencePiece sorts them by frequency.
input_text = []
for i, token in enumerate(vocab):
input_text.append(" ".join([token] * (len(vocab) - i)))
with tf.io.gfile.GFile(input_file, "w") as f:
f.write("\n".join(input_text + [""]))
full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
flags = dict(
model_prefix=model_prefix,
model_type="word",
input=input_file,
pad_id=0, unk_id=1, control_symbols="[CLS],[SEP]",
vocab_size=full_vocab_size,
bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
SentencePieceTrainer.Train(
" ".join(["--{}={}".format(k, v) for k, v in flags.items()]))
return model_prefix + ".model"
def _do_export(self, vocab, do_lower_case, default_seq_length=128,
tokenize_with_offsets=True, use_sp_model=False,
experimental_disable_assert=False):
"""Runs SavedModel export and returns the export_path."""
export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
vocab_file = sp_model_file = None
if use_sp_model:
sp_model_file = self._make_sp_model_file(vocab)
else:
vocab_file = self._make_vocab_file(vocab)
export_tfhub_lib.export_preprocessing(
export_path,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets,
default_seq_length=default_seq_length,
experimental_disable_assert=experimental_disable_assert)
# Invalidate the original filename to verify loading from the SavedModel.
tf.io.gfile.remove(sp_model_file or vocab_file)
return export_path
def test_no_leaks(self):
"""Tests not leaking the path to the original vocab file."""
path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True, use_sp_model=False)
with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
self.assertFalse( # pylint: disable=g-generic-assert
_STRING_NOT_TO_LEAK.encode("ascii") in f.read())
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_exported_callables(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
tokenize_with_offsets=not use_sp_model, # TODO(b/149576200): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model))
def fold_dim(rt):
"""Removes the word/subword distinction of BertTokenizer."""
return rt if use_sp_model else rt.merge_dims(1, 2)
# .tokenize()
inputs = tf.constant(["abc d ef", "ABC D EF d"])
token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(fold_dim(token_ids),
tf.ragged.constant([[6, 4, 5],
[6, 4, 5, 4]]))
special_tokens_dict = {
k: v.numpy().item() # Expecting eager Tensor, converting to Python.
for k, v in preprocess.tokenize.get_special_tokens_dict().items()}
self.assertDictEqual(special_tokens_dict,
dict(padding_id=0,
start_of_sequence_id=2,
end_of_segment_id=3,
vocab_size=4+6 if use_sp_model else 4+4))
# .tokenize_with_offsets()
if use_sp_model:
# TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
else:
token_ids, start_offsets, limit_offsets = (
preprocess.tokenize_with_offsets(inputs))
self.assertAllEqual(fold_dim(token_ids),
tf.ragged.constant([[6, 4, 5],
[6, 4, 5, 4]]))
self.assertAllEqual(fold_dim(start_offsets),
tf.ragged.constant([[0, 4, 6],
[0, 4, 6, 9]]))
self.assertAllEqual(fold_dim(limit_offsets),
tf.ragged.constant([[3, 5, 8],
[3, 5, 8, 10]]))
self.assertIs(preprocess.tokenize.get_special_tokens_dict,
preprocess.tokenize_with_offsets.get_special_tokens_dict)
# Root callable.
bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_mask"][:, :10],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
# .bert_pack_inputs()
inputs_2 = tf.constant(["d xy", "xy abc"])
token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs(
[token_ids, token_ids_2], seq_length=256)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10],
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
[2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_mask"][:, :10],
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))
# For BertTokenizer only: repeat relevant parts for do_lower_case=False,
# default_seq_length=10, experimental_disable_assert=False,
# tokenize_with_offsets=False, and without folding the word/subword dimension.
def test_cased_length10(self):
preprocess = tf.saved_model.load(self._do_export(
["d", "##ef", "abc", "ABC"],
do_lower_case=False, default_seq_length=10,
tokenize_with_offsets=False,
use_sp_model=False,
experimental_disable_assert=False))
inputs = tf.constant(["abc def", "ABC DEF"])
token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
[[7], [1]]]))
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
inputs_2 = tf.constant(["d ABC", "ABC abc"])
token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
# Test default seq_length=10.
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
[2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
# XLA requires fixed shapes for tensors found in graph mode.
# Statically known shapes in Python are a particularly firm way to
# guarantee that, and they are generally more convenient to work with.
# We test that the exported SavedModel plays well with TF's shape
# inference when applied to fully or partially known input shapes.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_shapes(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export(
["abc", "def"], do_lower_case=True,
tokenize_with_offsets=not use_sp_model, # TODO(b/149576200): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model))
def expected_bert_input_shapes(batch_size, seq_length):
return dict(input_word_ids=[batch_size, seq_length],
input_mask=[batch_size, seq_length],
input_type_ids=[batch_size, seq_length])
for batch_size in [7, None]:
if use_sp_model:
token_out_shape = [batch_size, None] # No word/subword distinction.
else:
token_out_shape = [batch_size, None, None]
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.tokenize,
tf.TensorSpec([batch_size], tf.string)),
token_out_shape,
"with batch_size=%s" % batch_size)
# TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
if use_sp_model:
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
else:
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.tokenize_with_offsets,
tf.TensorSpec([batch_size], tf.string)),
[token_out_shape] * 3,
"with batch_size=%s" % batch_size)
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.bert_pack_inputs,
[tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
seq_length=256), expected_bert_input_shapes(batch_size, 256),
"with batch_size=%s" % batch_size)
self.assertEqual(
_result_shapes_in_tf_function(preprocess,
tf.TensorSpec([batch_size], tf.string)),
expected_bert_input_shapes(batch_size, 128),
"with batch_size=%s" % batch_size)
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_reexport(self, use_sp_model):
"""Test that preprocess keeps working after another save/load cycle."""
path1 = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True, default_seq_length=10,
tokenize_with_offsets=False,
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model)
path2 = path1.rstrip("/") + ".2"
model1 = tf.saved_model.load(path1)
tf.saved_model.save(model1, path2)
# Delete the first SavedModel to test that the second one loads by itself.
# https://github.com/tensorflow/tensorflow/issues/46456 reports such a
# failure case for BertTokenizer.
tf.io.gfile.rmtree(path1)
model2 = tf.saved_model.load(path2)
inputs = tf.constant(["abc d ef", "ABC D EF d"])
bert_inputs = model2(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_special_tokens_in_estimator(self, use_sp_model):
"""Tests getting special tokens without an Eager init context."""
preprocess_export_path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
use_sp_model=use_sp_model, tokenize_with_offsets=False)
def _get_special_tokens_dict(obj):
"""Returns special tokens of restored tokenizer as Python values."""
if tf.executing_eagerly():
special_tokens_numpy = {k: v.numpy()
for k, v in obj.get_special_tokens_dict().items()}
else:
with tf.Graph().as_default():
# This code expects `get_special_tokens_dict()` to be a tf.function
# with no dependencies (bound args) from the context it was loaded in,
# and boldly assumes that it can just be called in a different context.
special_tokens_tensors = obj.get_special_tokens_dict()
with tf.compat.v1.Session() as sess:
special_tokens_numpy = sess.run(special_tokens_tensors)
return {k: v.item() # Numpy to Python.
for k, v in special_tokens_numpy.items()}
def input_fn():
self.assertFalse(tf.executing_eagerly())
# Build a preprocessing Model.
sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
preprocess = tf.saved_model.load(preprocess_export_path)
tokenize = hub.KerasLayer(preprocess.tokenize)
special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
for k, v in special_tokens_dict.items():
self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
tokens = tokenize(sentences)
packed_inputs = layers.BertPackInputs(
4, special_tokens_dict=special_tokens_dict)(tokens)
preprocessing = tf.keras.Model(sentences, packed_inputs)
# Map the dataset.
ds = tf.data.Dataset.from_tensors(
(tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
ds = ds.map(lambda features, labels: (preprocessing(features), labels))
return ds
def model_fn(features, labels, mode):
del labels # Unused.
return tf.estimator.EstimatorSpec(mode=mode,
predictions=features["input_word_ids"])
estimator = tf.estimator.Estimator(model_fn=model_fn)
outputs = list(estimator.predict(input_fn))
self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
[2, 4, 5, 3]]))
# TODO(b/175369555): Remove that code and its test.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_check_no_assert(self, use_sp_model):
"""Tests the self-check during export without assertions."""
preprocess_export_path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
use_sp_model=use_sp_model, tokenize_with_offsets=False,
experimental_disable_assert=False)
with self.assertRaisesRegex(AssertionError,
r"failed to suppress \d+ Assert ops"):
export_tfhub_lib._check_no_assert(preprocess_export_path)
def _result_shapes_in_tf_function(fn, *args, **kwargs):
"""Returns shapes (as lists) observed on the result of `fn`.
Args:
fn: A callable.
*args: TensorSpecs for Tensor-valued arguments and actual values
for Python-valued arguments to fn.
**kwargs: Same for keyword arguments.
Returns:
The nest of partial tensor shapes (as lists) that is statically known inside
tf.function(fn)(*args, **kwargs) for the nest of its results.
"""
# Use a captured mutable container for a side output from the wrapper.
uninitialized = "uninitialized!"
result_shapes_container = [uninitialized]
assert result_shapes_container[0] is uninitialized
@tf.function
def shape_reporting_wrapper(*args, **kwargs):
result = fn(*args, **kwargs)
result_shapes_container[0] = tf.nest.map_structure(
lambda x: x.shape.as_list(), result)
return result
shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
assert result_shapes_container[0] is not uninitialized
return result_shapes_container[0]
if __name__ == "__main__":
tf.test.main()