Commit 5b6171ce authored by A. Unique TensorFlower

Release TF-NLP's generic export_tfhub tool with preprocessing support as open source.

A user guide will follow soon.

PiperOrigin-RevId: 359244402
parent 834ca16d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Exports a BERT-like encoder and its preprocessing as SavedModels for TF Hub.
This tool creates preprocessor and encoder SavedModels suitable for uploading
to https://tfhub.dev that implement the preprocessor and encoder APIs defined
at https://www.tensorflow.org/hub/common_saved_model_apis/text.
Minimal usage examples:
1) Exporting an Encoder from checkpoint and config.
```
export_tfhub \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
An --encoder_config_file can specify encoder types other than BERT.
For BERT, a --bert_config_file in the legacy JSON format can be passed instead.
Flags --vocab_file and --do_lower_case (whose default value is guessed from
the vocab_file path) capture how BertTokenizer was used in pre-training.
Use flag --sp_model_file instead if SentencepieceTokenizer was used.
Changing --export_type to model_with_mlm additionally creates an `.mlm`
subobject on the exported SavedModel that can be called to produce
the logits of the Masked Language Model task from pretraining.
The help string for flag --model_checkpoint_path explains the checkpoint
formats required for each --export_type.
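For illustration, an encoder exported with --export_type=model_with_mlm could be
loaded and called roughly as follows (a sketch; the path and tensor values are
placeholders):
```
import tensorflow as tf
import tensorflow_hub as hub

encoder = hub.load("/tmp/bert_model")  # Wherever --export_path pointed.
encoder_inputs = dict(
    input_word_ids=tf.zeros([2, 128], tf.int32),  # Padded token ids.
    input_mask=tf.zeros([2, 128], tf.int32),      # 1 for real tokens, 0 for padding.
    input_type_ids=tf.zeros([2, 128], tf.int32))  # Segment ids.
pooled_output = encoder(encoder_inputs)["pooled_output"]
# Only with --export_type=model_with_mlm: logits of the masked LM task.
mlm_inputs = dict(encoder_inputs,
                  masked_lm_positions=tf.zeros([2, 4], tf.int32))
mlm_logits = encoder.mlm(mlm_inputs)["mlm_logits"]
```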
2) Exporting a preprocessor SavedModel
```
export_tfhub \
--vocab_file ${BERT_DIR:?}/vocab.txt \
--export_type preprocessing --export_path /tmp/bert_preprocessing
```
Be sure to use flag values that match the encoder and how it has been
pre-trained (see above for --vocab_file vs --sp_model_file).
If your encoder has been trained with text preprocessing for which tfhub.dev
already has a SavedModel, you could guide your users to reuse that one instead
of exporting and publishing your own.
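As a sketch of how the exported preprocessor and encoder are meant to be used
together (paths are placeholders for wherever the SavedModels were written):
```
import tensorflow as tf
import tensorflow_hub as hub

preprocess = hub.load("/tmp/bert_preprocessing")
encoder = hub.load("/tmp/bert_model")
sentences = tf.constant(["Hello TF Hub.", "A second sentence."])
encoder_inputs = preprocess(sentences)  # Dict of padded int32 ids and masks.
embeddings = encoder(encoder_inputs)["pooled_output"]
```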
TODO(b/175369555): When exporting to users of TensorFlow 2.4, add flag
`--experimental_disable_assert_in_preprocessing`.
"""
from absl import app
from absl import flags
import gin
from official.modeling import hyperparams
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.tools import export_tfhub_lib
FLAGS = flags.FLAGS
flags.DEFINE_enum(
"export_type", "model",
["model", "model_with_mlm", "preprocessing"],
"The overall type of SavedModel to export. Flags "
"--bert_config_file/--encoder_config_file and --vocab_file/--sp_model_file "
"control which particular encoder model and preprocessing are exported.")
flags.DEFINE_string(
"export_path", None,
"Directory to which the SavedModel is written.")
flags.DEFINE_string(
"encoder_config_file", None,
"A yaml file representing `encoders.EncoderConfig` to define the encoder "
"(BERT or other). "
"Exactly one of --bert_config_file and --encoder_config_file can be set. "
"Needed for --export_type model and model_with_mlm.")
flags.DEFINE_string(
"bert_config_file", None,
"A JSON file with a legacy BERT configuration to define the BERT encoder. "
"Exactly one of --bert_config_file and --encoder_config_file can be set. "
"Needed for --export_type model and model_with_mlm.")
flags.DEFINE_bool(
"copy_pooler_dense_to_encoder", False,
"When the model is trained using `BertPretrainerV2`, the pooling layer "
"of the next sentence prediction task exists in the `ClassificationHead` passed "
"to `BertPretrainerV2`. If True, we will copy this pooler's dense layer "
"to the encoder that is exported by this tool (as in classic BERT). "
"Using `BertPretrainerV2` and leaving this False exports an untrained "
"(randomly initialized) pooling layer, which some authors recommend for "
"subsequent fine-tuning.")
flags.DEFINE_string(
"model_checkpoint_path", None,
"File path to a pre-trained model checkpoint. "
"For --export_type model, this has to be an object-based (TF2) checkpoint "
"that can be restored to `tf.train.Checkpoint(encoder=encoder)` "
"for the `encoder` defined by the config file. "
"(Legacy checkpoints with `model=` instead of `encoder=` are also "
"supported for now.) "
"For --export_type model_with_mlm, it must be restorable to "
"`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)`. "
"(For now, `tf.train.Checkpoint(pretrainer=BertPretrainerV2(...))` is also "
"accepted.)")
flags.DEFINE_string(
"vocab_file", None,
"For encoders trained on BertTokenizer input: "
"the vocabulary file that the encoder model was trained with. "
"Exactly one of --vocab_file and --sp_model_file can be set. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_string(
"sp_model_file", None,
"For encoders trained on SentencepieceTokenizer input: "
"the SentencePiece .model file that the encoder model was trained with. "
"Exactly one of --vocab_file and --sp_model_file can be set. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_bool(
"do_lower_case", None,
"Whether to lowercase before tokenization. "
"If left as None, and --vocab_file is set, do_lower_case will be enabled "
"if 'uncased' appears in the name of --vocab_file. "
"If left as None, and --sp_model_file is set, do_lower_case defaults to true. "
"Needed for --export_type model, model_with_mlm and preprocessing.")
flags.DEFINE_integer(
"default_seq_length", 128,
"The sequence length of preprocessing results from "
"the top-level preprocess method. This is also the default "
"sequence length for the bert_pack_inputs subobject. "
"Needed for --export_type preprocessing.")
flags.DEFINE_bool(
"tokenize_with_offsets", False, # Broken by b/149576200.
"Whether to export a .tokenize_with_offsets subobject for "
"--export_type preprocessing.")
flags.DEFINE_multi_string(
"gin_file", default=None,
help="List of paths to the config files.")
flags.DEFINE_multi_string(
"gin_params", default=None,
help="List of Gin bindings.")
flags.DEFINE_bool( # TODO(b/175369555): Remove this flag and its use.
"experimental_disable_assert_in_preprocessing", False,
"Export a preprocessing model without tf.Assert ops. "
"Usually, that would be a bad idea, except TF2.4 has an issue with "
"Assert ops in tf.functions used in Dataset.map() on a TPU worker, "
"and omitting the Assert ops lets SavedModels avoid the issue.")
def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
if bool(FLAGS.vocab_file) == bool(FLAGS.sp_model_file):
raise ValueError("Exactly one of `vocab_file` and `sp_model_file` "
"can be specified, but got %s and %s." %
(FLAGS.vocab_file, FLAGS.sp_model_file))
do_lower_case = export_tfhub_lib.get_do_lower_case(
FLAGS.do_lower_case, FLAGS.vocab_file, FLAGS.sp_model_file)
if FLAGS.export_type in ("model", "model_with_mlm"):
if bool(FLAGS.bert_config_file) == bool(FLAGS.encoder_config_file):
raise ValueError("Exactly one of `bert_config_file` and "
"`encoder_config_file` can be specified, but got "
"%s and %s." %
(FLAGS.bert_config_file, FLAGS.encoder_config_file))
if FLAGS.bert_config_file:
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
encoder_config = None
else:
bert_config = None
encoder_config = encoders.EncoderConfig()
encoder_config = hyperparams.override_params_dict(
encoder_config, FLAGS.encoder_config_file, is_strict=True)
export_tfhub_lib.export_model(
FLAGS.export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=FLAGS.model_checkpoint_path,
vocab_file=FLAGS.vocab_file,
sp_model_file=FLAGS.sp_model_file,
do_lower_case=do_lower_case,
with_mlm=FLAGS.export_type == "model_with_mlm",
copy_pooler_dense_to_encoder=FLAGS.copy_pooler_dense_to_encoder)
elif FLAGS.export_type == "preprocessing":
export_tfhub_lib.export_preprocessing(
FLAGS.export_path,
vocab_file=FLAGS.vocab_file,
sp_model_file=FLAGS.sp_model_file,
do_lower_case=do_lower_case,
default_seq_length=FLAGS.default_seq_length,
tokenize_with_offsets=FLAGS.tokenize_with_offsets,
experimental_disable_assert=
FLAGS.experimental_disable_assert_in_preprocessing)
else:
raise app.UsageError(
"Unknown value '%s' for flag --export_type" % FLAGS.export_type)
if __name__ == "__main__":
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library of components of export_tfhub.py. See docstring there for more."""
import contextlib
import hashlib
import os
import tempfile
from typing import Optional, Text, Tuple
# Import libraries
from absl import logging
import tensorflow as tf
# pylint: disable=g-direct-tensorflow-import TODO(b/175369555): Remove these.
from tensorflow.core.protobuf import saved_model_pb2
from tensorflow.python.ops import control_flow_ops
# pylint: enable=g-direct-tensorflow-import
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.modeling import networks
def get_bert_encoder(bert_config):
"""Returns a BertEncoder with dict outputs."""
bert_encoder = networks.BertEncoder(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
num_layers=bert_config.num_hidden_layers,
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
embedding_width=bert_config.embedding_size,
dict_outputs=True)
return bert_encoder
def get_do_lower_case(do_lower_case, vocab_file=None, sp_model_file=None):
"""Returns do_lower_case, replacing None by a guess from vocab file name."""
if do_lower_case is not None:
return do_lower_case
elif vocab_file:
do_lower_case = "uncased" in vocab_file
logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
do_lower_case, vocab_file)
return do_lower_case
elif sp_model_file:
do_lower_case = True # All public ALBERTs (as of Oct 2020) do it.
logging.info("Defaulting to do_lower_case=%s for Sentencepiece tokenizer",
do_lower_case)
return do_lower_case
else:
raise ValueError("Must set vocab_file or sp_model_file.")
def _create_model(
*,
bert_config: Optional[configs.BertConfig] = None,
encoder_config: Optional[encoders.EncoderConfig] = None,
with_mlm: bool,
) -> Tuple[tf.keras.Model, tf.keras.Model]:
"""Creates the model to export and the model to restore the checkpoint.
Args:
bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
Exactly one of encoder_config and bert_config must be set.
encoder_config: An `EncoderConfig` to create an encoder of the configured
type (`BertEncoder` or other).
with_mlm: A bool to control the second component of the result.
If True, will create a `BertPretrainerV2` object; otherwise, will
create a `BertEncoder` object.
Returns:
A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
object or `BertEncoder` object depending on the value of `with_mlm`
argument, which contains the first model and will be used for restoring
weights from the checkpoint.
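A minimal usage sketch (the config values here are arbitrary placeholders):
```
core_model, pretrainer = _create_model(
    bert_config=configs.BertConfig(vocab_size=30522), with_mlm=True)
```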
"""
if (bert_config is not None) == (encoder_config is not None):
raise ValueError("Exactly one of `bert_config` and `encoder_config` "
"can be specified, but got %s and %s" %
(bert_config, encoder_config))
if bert_config is not None:
encoder = get_bert_encoder(bert_config)
else:
encoder = encoders.build_encoder(encoder_config)
# Convert from list of named inputs to dict of inputs keyed by name.
# Only the latter accepts a dict of inputs after restoring from SavedModel.
encoder_inputs_dict = {x.name: x for x in encoder.inputs}
encoder_output_dict = encoder(encoder_inputs_dict)
# For interchangeability with other text representations,
# add "default" as an alias for BERT's whole-input representations.
encoder_output_dict["default"] = encoder_output_dict["pooled_output"]
core_model = tf.keras.Model(
inputs=encoder_inputs_dict, outputs=encoder_output_dict)
if with_mlm:
if bert_config is not None:
hidden_act = bert_config.hidden_act
else:
assert encoder_config is not None
hidden_act = encoder_config.get().hidden_activation
pretrainer = models.BertPretrainerV2(
encoder_network=encoder,
mlm_activation=tf_utils.get_activation(hidden_act))
pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs}
pretrainer_output_dict = pretrainer(pretrainer_inputs_dict)
mlm_model = tf.keras.Model(
inputs=pretrainer_inputs_dict, outputs=pretrainer_output_dict)
# Set `_auto_track_sub_layers` to False, so that the additional weights
# from `mlm` sub-object will not be included in the core model.
# TODO(b/169210253): Use a public API when available.
core_model._auto_track_sub_layers = False # pylint: disable=protected-access
core_model.mlm = mlm_model
return core_model, pretrainer
else:
return core_model, encoder
def export_model(export_path: Text,
*,
bert_config: Optional[configs.BertConfig] = None,
encoder_config: Optional[encoders.EncoderConfig] = None,
model_checkpoint_path: Text,
with_mlm: bool,
copy_pooler_dense_to_encoder: bool = False,
vocab_file: Optional[Text] = None,
sp_model_file: Optional[Text] = None,
do_lower_case: Optional[bool] = None) -> None:
"""Exports an Encoder as SavedModel after restoring pre-trained weights.
The exported SavedModel implements a superset of the Encoder API for
Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
In particular, the exported SavedModel can be used in the following way:
```
# Calls default interface (encoder only).
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models).
"sequence_output", # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
}
```
If `with_mlm` is True, the exported SavedModel can also be called in the
following way:
```
# Calls expanded interface that includes logits of the Masked Language Model.
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers.
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
Args:
export_path: The SavedModel output directory.
bert_config: An optional `configs.BertConfig` object. Note: exactly one of
`bert_config` and `encoder_config` must be specified.
encoder_config: An optional `encoders.EncoderConfig` object.
model_checkpoint_path: The path to the checkpoint.
with_mlm: Whether to export the additional mlm sub-object.
copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer
used in the next sentence prediction task to the encoder.
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
do_lower_case: Whether to lower-case text before tokenization.
"""
if with_mlm:
core_model, pretrainer = _create_model(bert_config=bert_config,
encoder_config=encoder_config,
with_mlm=with_mlm)
encoder = pretrainer.encoder_network
# Support the official way to checkpoint a pretrainer.
checkpoint_items = pretrainer.checkpoint_items
# Keep supporting the ad-hoc way from Oct 2020 that is used
# in several important converted checkpoints (original BERT, SmallBERTs).
checkpoint_items["pretrainer"] = pretrainer
checkpoint = tf.train.Checkpoint(**checkpoint_items)
else:
core_model, encoder = _create_model(bert_config=bert_config,
encoder_config=encoder_config,
with_mlm=with_mlm)
checkpoint = tf.train.Checkpoint(
model=encoder, # Legacy checkpoints.
encoder=encoder)
checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
if copy_pooler_dense_to_encoder:
logging.info("Copy pooler's dense layer to the encoder.")
pooler_checkpoint = tf.train.Checkpoint(
**{"next_sentence.pooler_dense": encoder.pooler_layer})
pooler_checkpoint.restore(
model_checkpoint_path).assert_existing_objects_matched()
# Before SavedModels for preprocessing appeared in Oct 2020, the encoders
# provided this information to let users do preprocessing themselves.
# We keep doing that for now. It helps users to upgrade incrementally.
# Moreover, it offers an escape hatch for advanced users who want the
# full vocab, not the high-level operations from the preprocessing model.
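# For illustration, a user of the exported SavedModel could read these back
# roughly like so (a sketch; `export_path` as passed to this function):
#   obj = tf.saved_model.load(export_path)
#   vocab_path = obj.vocab_file.asset_path      # Only if a vocab_file was set.
#   lower = bool(obj.do_lower_case.numpy())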
if vocab_file:
core_model.vocab_file = tf.saved_model.Asset(vocab_file)
if do_lower_case is None:
raise ValueError("Must pass do_lower_case if passing vocab_file.")
core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
elif sp_model_file:
# This was used by ALBERT, with implied values of do_lower_case=True
# and strip_diacritics=True.
core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
else:
raise ValueError("Must set vocab_file or sp_model_file")
core_model.save(export_path, include_optimizer=False, save_format="tf")
class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
"""Wraps a BertPackInputs layer for export to SavedModel.
The wrapper object is suitable for use with `tf.saved_model.save()` and
`.load()`. The wrapper object is callable with inputs and outputs like the
BertPackInputs layer, but differs from saving an unwrapped Keras object:
- The inputs can be a list of 1 or 2 RaggedTensors of dtype int32 and
ragged rank 1 or 2. (In Keras, saving to a tf.function in a SavedModel
would fix the number of RaggedTensors and their ragged rank.)
- The call accepts an optional keyword argument `seq_length=` to override
the layer's .seq_length hyperparameter. (In Keras, a hyperparameter
could not be changed after saving to a tf.function in a SavedModel.)
"""
def __init__(self, bert_pack_inputs: layers.BertPackInputs):
super().__init__()
# Preserve the layer's configured seq_length as a default but make it
# overridable. Having this dynamically determined default argument
# requires self.__call__ to be defined in this indirect way.
default_seq_length = bert_pack_inputs.seq_length
@tf.function(autograph=False)
def call(inputs, seq_length=default_seq_length):
return layers.BertPackInputs.bert_pack_inputs(
inputs, seq_length=seq_length,
start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
end_of_segment_id=bert_pack_inputs.end_of_segment_id,
padding_id=bert_pack_inputs.padding_id)
self.__call__ = call
for ragged_rank in range(1, 3):
for num_segments in range(1, 3):
_ = self.__call__.get_concrete_function(
[tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
for _ in range(num_segments)],
seq_length=tf.TensorSpec([], tf.int32))
def create_preprocessing(*,
vocab_file: Optional[str] = None,
sp_model_file: Optional[str] = None,
do_lower_case: bool,
tokenize_with_offsets: bool,
default_seq_length: int) -> tf.keras.Model:
"""Returns a preprocessing Model for given tokenization parameters.
This function builds a Keras Model with attached subobjects suitable for
saving to a SavedModel. The resulting SavedModel implements the Preprocessor
API for Text embeddings with Transformer Encoders described at
https://www.tensorflow.org/hub/common_saved_model_apis/text.
Args:
vocab_file: The path to the wordpiece vocab file, or None.
sp_model_file: The path to the sentencepiece model file, or None.
Exactly one of vocab_file and sp_model_file must be set.
This determines the type of tokenizer that is used.
do_lower_case: Whether to lowercase the text before tokenization.
tokenize_with_offsets: Whether to include the .tokenize_with_offsets
subobject.
default_seq_length: The sequence length of preprocessing results from the
root callable. This is also the default sequence length for the
bert_pack_inputs subobject.
Returns:
A tf.keras.Model object with several attached subobjects, suitable for
saving as a preprocessing SavedModel.
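A rough usage sketch, assuming the returned model has been saved and reloaded
(the path is a placeholder; token ids depend on the actual vocabulary):
```
import tensorflow as tf

preprocess = tf.saved_model.load("/tmp/bert_preprocessing")
sentences = tf.constant(["abc def", "ghi"])
encoder_inputs = preprocess(sentences)   # Padded to default_seq_length.
tokens = preprocess.tokenize(sentences)  # RaggedTensor of token ids.
pair_inputs = preprocess.bert_pack_inputs(
    [tokens, preprocess.tokenize(tf.constant(["jkl", "mno"]))], seq_length=64)
```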
"""
# Select tokenizer.
if bool(vocab_file) == bool(sp_model_file):
raise ValueError("Must set exactly one of vocab_file, sp_model_file")
if vocab_file:
tokenize = layers.BertTokenizer(
vocab_file=vocab_file,
lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets)
else:
tokenize = layers.SentencepieceTokenizer(
model_file_path=sp_model_file,
lower_case=do_lower_case,
strip_diacritics=True, # Strip diacritics to follow ALBERT model.
tokenize_with_offsets=tokenize_with_offsets)
# The root object of the preprocessing model can be called to do
# one-shot preprocessing for users with single-sentence inputs.
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
if tokenize_with_offsets:
tokens, start_offsets, limit_offsets = tokenize(sentences)
else:
tokens = tokenize(sentences)
pack = layers.BertPackInputs(
seq_length=default_seq_length,
special_tokens_dict=tokenize.get_special_tokens_dict())
model_inputs = pack(tokens)
preprocessing = tf.keras.Model(sentences, model_inputs)
# Individual steps of preprocessing are made available as named subobjects
# to enable more general preprocessing. For saving, they need to be Models
# in their own right.
preprocessing.tokenize = tf.keras.Model(sentences, tokens)
# Provide an equivalent to tokenize.get_special_tokens_dict().
preprocessing.tokenize.get_special_tokens_dict = tf.train.Checkpoint()
preprocessing.tokenize.get_special_tokens_dict.__call__ = tf.function(
lambda: tokenize.get_special_tokens_dict(), # pylint: disable=unnecessary-lambda
input_signature=[])
if tokenize_with_offsets:
preprocessing.tokenize_with_offsets = tf.keras.Model(
sentences, [tokens, start_offsets, limit_offsets])
preprocessing.tokenize_with_offsets.get_special_tokens_dict = (
preprocessing.tokenize.get_special_tokens_dict)
# Conceptually, this should be
# preprocessing.bert_pack_inputs = tf.keras.Model(tokens, model_inputs)
# but technicalities require us to use a wrapper (see comments there).
# In particular, seq_length can be overridden when calling this.
preprocessing.bert_pack_inputs = BertPackInputsSavedModelWrapper(pack)
return preprocessing
def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
"""Returns a copy of file_path under tmpdir, in a subdir hashed from its dir."""
if file_path is None: return None
olddir, filename = os.path.split(file_path)
hasher = hashlib.sha1()
hasher.update(olddir.encode("utf-8"))
target_dir = os.path.join(tmpdir, hasher.hexdigest())
target_file = os.path.join(target_dir, filename)
tf.io.gfile.mkdir(target_dir)
tf.io.gfile.copy(file_path, target_file)
return target_file
def export_preprocessing(export_path: Text,
*,
vocab_file: Optional[Text] = None,
sp_model_file: Optional[Text] = None,
do_lower_case: bool,
tokenize_with_offsets: bool,
default_seq_length: int,
experimental_disable_assert: bool = False) -> None:
"""Exports preprocessing to a SavedModel for TF Hub."""
with tempfile.TemporaryDirectory() as tmpdir:
# TODO(b/175369555): Remove experimental_disable_assert and its use.
with _maybe_disable_assert(experimental_disable_assert):
preprocessing = create_preprocessing(
vocab_file=_move_to_tmpdir(vocab_file, tmpdir),
sp_model_file=_move_to_tmpdir(sp_model_file, tmpdir),
do_lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets,
default_seq_length=default_seq_length)
preprocessing.save(export_path, include_optimizer=False, save_format="tf")
if experimental_disable_assert:
_check_no_assert(export_path)
# This check helps the unit test guard against stray copies of the vocab file.
if tf.io.gfile.exists(tmpdir):
raise IOError("Failed to clean up TemporaryDirectory")
# TODO(b/175369555): Remove all workarounds for this bug of TensorFlow 2.4
# when this bug is no longer a concern for publishing new models.
# TensorFlow 2.4 has a placement issue with Assert ops in tf.functions called
# from Dataset.map() on a TPU worker. They end up on the TPU coordinator,
# and invoking them from the TPU worker is either inefficient (when possible)
# or impossible (notably when using "headless" TPU workers on Cloud that do not
# have a channel to the coordinator). The bug has been fixed in time for TF 2.5.
# To work around this, the following code avoids Assert ops in the exported
# SavedModels. It monkey-patches calls to tf.Assert from inside TensorFlow and
# replaces them by a no-op while building the exported model. This is fragile,
# so _check_no_assert() validates the result. The resulting model should be fine
# to read on future versions of TF, even if this workaround at export time
# may break eventually. (Failing unit tests will tell.)
def _dont_assert(condition, data, summarize=None, name="Assert"):
"""The no-op version of tf.Assert installed by _maybe_disable_assert."""
del condition, data, summarize # Unused.
if tf.executing_eagerly():
return
with tf.name_scope(name):
return tf.no_op(name="dont_assert")
@contextlib.contextmanager
def _maybe_disable_assert(disable_assert):
"""Scoped monkey patch of control_flow_ops.Assert to a no-op."""
if not disable_assert:
yield
return
original_assert = control_flow_ops.Assert
control_flow_ops.Assert = _dont_assert
yield
control_flow_ops.Assert = original_assert
def _check_no_assert(saved_model_path):
"""Raises AssertionError if SavedModel contains Assert ops."""
saved_model_filename = os.path.join(saved_model_path, "saved_model.pb")
with tf.io.gfile.GFile(saved_model_filename, "rb") as f:
saved_model = saved_model_pb2.SavedModel.FromString(f.read())
assert_nodes = []
graph_def = saved_model.meta_graphs[0].graph_def
assert_nodes += ["node '{}' in global graph".format(n.name)
for n in graph_def.node if n.op == "Assert"]
for fdef in graph_def.library.function:
assert_nodes += [
"node '{}' in function '{}'".format(n.name, fdef.signature.name)
for n in fdef.node_def if n.op == "Assert"]
if assert_nodes:
raise AssertionError(
"Internal tool error: "
"failed to suppress {} Assert ops in SavedModel:\n{}".format(
len(assert_nodes), "\n".join(assert_nodes[:10])))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests export_tfhub_lib."""
import os
import tempfile
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sentencepiece import SentencePieceTrainer
from official.modeling import tf_utils
from official.nlp.bert import configs
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
num_hidden_layers):
"""Returns config args for export_tfhub_lib._create_model()."""
if use_bert_config:
bert_config = configs.BertConfig(
vocab_size=100,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=num_hidden_layers)
encoder_config = None
else:
bert_config = None
encoder_config = encoders.EncoderConfig(
type="albert",
albert=encoders.AlbertEncoderConfig(
vocab_size=100,
embedding_width=16,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_layers=num_hidden_layers,
dropout_rate=0.1))
return bert_config, encoder_config
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
"""Returns tokenizer asset args for export_tfhub_lib.export_model()."""
dummy_file = os.path.join(temp_dir, "dummy_file.txt")
with tf.io.gfile.GFile(dummy_file, "w") as f:
f.write("dummy content")
if use_sp_model:
vocab_file, sp_model_file = None, dummy_file
else:
vocab_file, sp_model_file = dummy_file, None
return vocab_file, sp_model_file
def _read_asset(asset: tf.saved_model.Asset):
return tf.io.gfile.GFile(asset.asset_path.numpy()).read()
def _find_lambda_layers(layer):
"""Returns list of all Lambda layers in a Keras model."""
if isinstance(layer, tf.keras.layers.Lambda):
return [layer]
elif hasattr(layer, "layers"): # It's nested, like a Model.
result = []
for l in layer.layers:
result += _find_lambda_layers(l)
return result
else:
return []
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
"""Tests exporting a Transformer Encoder model as a SavedModel.
This covers export from an Encoder checkpoint to a SavedModel without
the .mlm subobject. This is no longer preferred, but still useful
for models like Electra that are trained without the MLM task.
The export code is generic. This test focuses on two main cases
(the most important ones in practice when this was written in 2020):
- BERT built from a legacy BertConfig, for use with BertTokenizer.
- ALBERT built from an EncoderConfig (as a representative of all other
choices beyond BERT), for use with SentencepieceTokenizer (the one
alternative to BertTokenizer).
"""
@parameterized.named_parameters(("Bert", True), ("Albert", False))
def test_export_model(self, use_bert):
# Create the encoder and export it.
hidden_size = 16
num_hidden_layers = 1
bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers)
bert_model, encoder = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
self.assertEmpty(
_find_lambda_layers(bert_model),
"Lambda layers are non-portable since they serialize Python bytecode.")
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(encoder=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=not use_bert)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=model_checkpoint_path,
with_mlm=False,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restore the exported model.
hub_layer = hub.KerasLayer(export_path, trainable=True)
# Check legacy tokenization data.
if use_bert:
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.vocab_file))
self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
else:
self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.sp_model_file))
# Check restored weights.
self.assertEqual(len(bert_model.trainable_weights),
len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
# Check computation.
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_output = hub_layer(input_dict)
source_output = bert_model(input_dict)
encoder_output = encoder(input_dict)
self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
self.assertEqual(hub_output["sequence_output"].shape,
(2, seq_length, hidden_size))
self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)
for key in ("pooled_output", "sequence_output", "encoder_outputs"):
self.assertAllClose(source_output[key], hub_output[key])
self.assertAllClose(source_output[key], encoder_output[key])
# The "default" output of BERT as a text representation is pooled_output.
self.assertAllClose(hub_output["pooled_output"], hub_output["default"])
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids))
outputs = np.concatenate([
hub_layer(input_dict, training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_dict = dict(
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids)
output_dict = hub_layer(input_dict)
pooled_output = output_dict["pooled_output"]
sequence_output = output_dict["sequence_output"]
encoder_outputs = output_dict["encoder_outputs"]
self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
self.assertEqual(sequence_output.shape.as_list(),
[None, seq_length, hidden_size])
self.assertLen(encoder_outputs, num_hidden_layers)
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
"""Tests exporting a Transformer Encoder model as a SavedModel.
This covers export from a Pretrainer checkpoint to a SavedModel including
the .mlm subobject, which is the preferred way since 2020.
The export code is generic. This test focuses on two main cases
(the most important ones in practice when this was written in 2020):
- BERT built from a legacy BertConfig, for use with BertTokenizer.
- ALBERT built from an EncoderConfig (as a representative of all other
choices beyond BERT), for use with SentencepieceTokenizer (the one
alternative to BertTokenizer).
"""
def test_copy_pooler_dense_to_encoder(self):
encoder_config = encoders.EncoderConfig(
type="bert",
bert=encoders.BertEncoderConfig(
hidden_size=24, intermediate_size=48, num_layers=2))
cls_heads = [
layers.ClassificationHead(
inner_dim=24, num_classes=2, name="next_sentence")
]
encoder = encoders.build_encoder(encoder_config)
pretrainer = models.BertPretrainerV2(
encoder_network=encoder,
classification_heads=cls_heads,
mlm_activation=tf_utils.get_activation(
encoder_config.get().hidden_activation))
# Makes sure the pretrainer variables are created.
_ = pretrainer(pretrainer.inputs)
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=True)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
encoder_config=encoder_config,
model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
with_mlm=True,
copy_pooler_dense_to_encoder=True,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(export_path, trainable=True)
dummy_ids = np.zeros((2, 10), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_pooled_output = hub_layer(input_dict)["pooled_output"]
encoder_outputs = encoder(input_dict)
# Verify that hub_layer's pooled_output is the same as the output of next
# sentence prediction's dense layer.
pretrained_pooled_output = cls_heads[0].dense(
(encoder_outputs["sequence_output"][:, 0, :]))
self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
# But the encoder's own pooled_output differs from the hub_layer's pooled_output.
encoder_pooled_output = encoder_outputs["pooled_output"]
self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
@parameterized.named_parameters(
("Bert", True, False),
("BertLegacyCheckpoint", True, True),
("Albert", False, False),
("AlbertLegacyCheckpoint", False, True),
)
def test_export_model_with_mlm(self, use_bert, legacy_checkpoint):
# Create the encoder and export it.
hidden_size = 16
num_hidden_layers = 2
bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers)
bert_model, pretrainer = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
self.assertEmpty(
_find_lambda_layers(bert_model),
"Lambda layers are non-portable since they serialize Python bytecode.")
bert_model_with_mlm = bert_model.mlm
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
if legacy_checkpoint:
checkpoint = tf.train.Checkpoint(pretrainer=pretrainer)
else:
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
self.get_temp_dir(), use_sp_model=not use_bert)
export_path = os.path.join(self.get_temp_dir(), "hub")
export_tfhub_lib.export_model(
export_path=export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=model_checkpoint_path,
with_mlm=True,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
# Restore the exported model.
hub_layer = hub.KerasLayer(export_path, trainable=True)
# Check legacy tokenization data.
if use_bert:
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.vocab_file))
self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
else:
self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
self.assertEqual("dummy content",
_read_asset(hub_layer.resolved_object.sp_model_file))
# Check restored weights.
# Note that we set `_auto_track_sub_layers` to False when exporting the
# SavedModel, so hub_layer has the same number of weights as bert_model;
# otherwise, hub_layer will have extra weights from its `mlm` subobject.
self.assertEqual(len(bert_model.trainable_weights),
len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight, hub_weight)
# Check computation.
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
hub_outputs_dict = hub_layer(input_dict)
source_outputs_dict = bert_model(input_dict)
encoder_outputs_dict = pretrainer.encoder_network(
[dummy_ids, dummy_ids, dummy_ids])
self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
self.assertEqual(hub_outputs_dict["sequence_output"].shape,
(2, seq_length, hidden_size))
for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
self.assertAllClose(source_outputs_dict[output_key],
hub_outputs_dict[output_key])
self.assertAllClose(source_outputs_dict[output_key],
encoder_outputs_dict[output_key])
# The "default" output of BERT as a text representation is pooled_output.
self.assertAllClose(hub_outputs_dict["pooled_output"],
hub_outputs_dict["default"])
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids))
outputs = np.concatenate([
hub_layer(input_dict, training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Checks sub-object `mlm`.
self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))
self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
len(bert_model_with_mlm.trainable_weights))
self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
len(pretrainer.trainable_weights))
for source_weight, hub_weight, pretrainer_weight in zip(
bert_model_with_mlm.trainable_weights,
hub_layer.resolved_object.mlm.trainable_variables,
pretrainer.trainable_weights):
self.assertAllClose(source_weight, hub_weight)
self.assertAllClose(source_weight, pretrainer_weight)
max_predictions_per_seq = 4
mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
input_dict = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids,
masked_lm_positions=mlm_positions)
hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
for output_key in ("pooled_output", "sequence_output", "mlm_logits",
"encoder_outputs"):
self.assertAllClose(hub_mlm_outputs_dict[output_key],
source_mlm_outputs_dict[output_key])
pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
pretrainer_mlm_logits_output)
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev_mlm(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
input_dict = dict(
input_word_ids=input_ids,
input_mask=np.ones_like(input_ids),
input_type_ids=np.zeros_like(input_ids),
masked_lm_positions=mlm_position_ids)
outputs = np.concatenate([
hub_layer.resolved_object.mlm(input_dict,
training=training)["pooled_output"]
for _ in range(num_runs)
])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_dict = dict(
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids)
hub_outputs_dict = hub_layer(input_dict)
self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
[None, hidden_size])
self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
[None, seq_length, hidden_size])
_STRING_NOT_TO_LEAK = "private_path_component_"
class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _make_vocab_file(self, vocab, filename="vocab.txt"):
"""Creates wordpiece vocab file with given words plus special tokens.
The tokens of the resulting model are, in this order:
[PAD], [UNK], [CLS], [SEP], ...vocab...
This function also accepts wordpieces that start with the ## continuation
marker, but avoiding those makes this function interchangeable with
_make_sp_model_file(), up to the extra dimension returned by BertTokenizer.
Args:
vocab: a list of strings with the words or wordpieces to put into the
model's vocabulary. Do not include special tokens here.
filename: Optionally, a filename (relative to the temporary directory
created by this function).
Returns:
The absolute filename of the created vocab file.
"""
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab
path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time.
prefix=_STRING_NOT_TO_LEAK),
filename)
with tf.io.gfile.GFile(path, "w") as f:
f.write("\n".join(full_vocab + [""]))
return path
def _make_sp_model_file(self, vocab, prefix="spm"):
"""Creates Sentencepiece word model with given words plus special tokens.
The tokens of the resulting model are, in this order:
<pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>
The words in the input vocab are plain text, without the whitespace marker.
That makes this function interchangeable with _make_vocab_file().
Args:
vocab: a list of strings with the words to put into the model's
vocabulary. Do not include special tokens here.
prefix: an optional string, to change the filename prefix for the model
(relative to the temporary directory created by this function).
Returns:
The absolute filename of the created Sentencepiece model file.
"""
model_prefix = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir()), # New subdir each time.
prefix)
input_file = model_prefix + "_train_input.txt"
# Create input text for training the sp model from the tokens provided.
# Repeat earlier tokens more often, because SentencePiece sorts them by frequency.
input_text = []
for i, token in enumerate(vocab):
input_text.append(" ".join([token] * (len(vocab) - i)))
with tf.io.gfile.GFile(input_file, "w") as f:
f.write("\n".join(input_text + [""]))
full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
flags = dict(
model_prefix=model_prefix,
model_type="word",
input=input_file,
pad_id=0, unk_id=1, control_symbols="[CLS],[SEP]",
vocab_size=full_vocab_size,
bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
SentencePieceTrainer.Train(
" ".join(["--{}={}".format(k, v) for k, v in flags.items()]))
return model_prefix + ".model"
def _do_export(self, vocab, do_lower_case, default_seq_length=128,
tokenize_with_offsets=True, use_sp_model=False,
experimental_disable_assert=False):
"""Runs SavedModel export and returns the export_path."""
export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
vocab_file = sp_model_file = None
if use_sp_model:
sp_model_file = self._make_sp_model_file(vocab)
else:
vocab_file = self._make_vocab_file(vocab)
export_tfhub_lib.export_preprocessing(
export_path,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=do_lower_case,
tokenize_with_offsets=tokenize_with_offsets,
default_seq_length=default_seq_length,
experimental_disable_assert=experimental_disable_assert)
# Invalidate the original filename to verify loading from the SavedModel.
tf.io.gfile.remove(sp_model_file or vocab_file)
return export_path
def test_no_leaks(self):
"""Tests not leaking the path to the original vocab file."""
path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True, use_sp_model=False)
with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
self.assertFalse( # pylint: disable=g-generic-assert
_STRING_NOT_TO_LEAK.encode("ascii") in f.read())
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_exported_callables(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
tokenize_with_offsets=not use_sp_model, # TODO(b/149576200): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model))
def fold_dim(rt):
"""Removes the word/subword distinction of BertTokenizer."""
return rt if use_sp_model else rt.merge_dims(1, 2)
# .tokenize()
inputs = tf.constant(["abc d ef", "ABC D EF d"])
token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(fold_dim(token_ids),
tf.ragged.constant([[6, 4, 5],
[6, 4, 5, 4]]))
special_tokens_dict = {
k: v.numpy().item() # Expecting eager Tensor, converting to Python.
for k, v in preprocess.tokenize.get_special_tokens_dict().items()}
self.assertDictEqual(special_tokens_dict,
dict(padding_id=0,
start_of_sequence_id=2,
end_of_segment_id=3,
vocab_size=4+6 if use_sp_model else 4+4))
# .tokenize_with_offsets()
if use_sp_model:
# TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
else:
token_ids, start_offsets, limit_offsets = (
preprocess.tokenize_with_offsets(inputs))
self.assertAllEqual(fold_dim(token_ids),
tf.ragged.constant([[6, 4, 5],
[6, 4, 5, 4]]))
self.assertAllEqual(fold_dim(start_offsets),
tf.ragged.constant([[0, 4, 6],
[0, 4, 6, 9]]))
self.assertAllEqual(fold_dim(limit_offsets),
tf.ragged.constant([[3, 5, 8],
[3, 5, 8, 10]]))
self.assertIs(preprocess.tokenize.get_special_tokens_dict,
preprocess.tokenize_with_offsets.get_special_tokens_dict)
# Root callable.
bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_mask"][:, :10],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
# .bert_pack_inputs()
inputs_2 = tf.constant(["d xy", "xy abc"])
token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs(
[token_ids, token_ids_2], seq_length=256)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10],
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
[2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_mask"][:, :10],
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))
# For BertTokenizer only: repeat relevant parts for do_lower_case=False,
# default_seq_length=10, experimental_disable_assert=False,
# tokenize_with_offsets=False, and without folding the word/subword dimension.
def test_cased_length10(self):
preprocess = tf.saved_model.load(self._do_export(
["d", "##ef", "abc", "ABC"],
do_lower_case=False, default_seq_length=10,
tokenize_with_offsets=False,
use_sp_model=False,
experimental_disable_assert=False))
inputs = tf.constant(["abc def", "ABC DEF"])
token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
[[7], [1]]]))
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
inputs_2 = tf.constant(["d ABC", "ABC abc"])
token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
# Test default seq_length=10.
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
[2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
# XLA requires fixed shapes for tensors found in graph mode.
# Statically known shapes in Python are a particularly firm way to
# guarantee that, and they are generally more convenient to work with.
# We test that the exported SavedModel plays well with TF's shape
# inference when applied to fully or partially known input shapes.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_shapes(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export(
["abc", "def"], do_lower_case=True,
tokenize_with_offsets=not use_sp_model, # TODO(b/149576200): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model))
def expected_bert_input_shapes(batch_size, seq_length):
return dict(input_word_ids=[batch_size, seq_length],
input_mask=[batch_size, seq_length],
input_type_ids=[batch_size, seq_length])
for batch_size in [7, None]:
if use_sp_model:
token_out_shape = [batch_size, None] # No word/subword distinction.
else:
token_out_shape = [batch_size, None, None]
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.tokenize,
tf.TensorSpec([batch_size], tf.string)),
token_out_shape,
"with batch_size=%s" % batch_size)
# TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
if use_sp_model:
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
else:
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.tokenize_with_offsets,
tf.TensorSpec([batch_size], tf.string)),
[token_out_shape] * 3,
"with batch_size=%s" % batch_size)
self.assertEqual(
_result_shapes_in_tf_function(
preprocess.bert_pack_inputs,
[tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
seq_length=256), expected_bert_input_shapes(batch_size, 256),
"with batch_size=%s" % batch_size)
self.assertEqual(
_result_shapes_in_tf_function(preprocess,
tf.TensorSpec([batch_size], tf.string)),
expected_bert_input_shapes(batch_size, 128),
"with batch_size=%s" % batch_size)
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_reexport(self, use_sp_model):
"""Test that preprocess keeps working after another save/load cycle."""
path1 = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True, default_seq_length=10,
tokenize_with_offsets=False,
experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model)
path2 = path1.rstrip("/") + ".2"
model1 = tf.saved_model.load(path1)
tf.saved_model.save(model1, path2)
# Delete the first SavedModel to test that the second one loads by itself.
# https://github.com/tensorflow/tensorflow/issues/46456 reports such a
# failure case for BertTokenizer.
tf.io.gfile.rmtree(path1)
model2 = tf.saved_model.load(path2)
inputs = tf.constant(["abc d ef", "ABC D EF d"])
bert_inputs = model2(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"],
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"],
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_special_tokens_in_estimator(self, use_sp_model):
"""Tests getting special tokens without an Eager init context."""
preprocess_export_path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
use_sp_model=use_sp_model, tokenize_with_offsets=False)
def _get_special_tokens_dict(obj):
"""Returns special tokens of restored tokenizer as Python values."""
if tf.executing_eagerly():
special_tokens_numpy = {k: v.numpy()
for k, v in obj.get_special_tokens_dict().items()}
else:
with tf.Graph().as_default():
# This code expects `get_special_tokens_dict()` to be a tf.function
# with no dependencies (bound args) from the context it was loaded in,
# and boldly assumes that it can just be called in a different context.
special_tokens_tensors = obj.get_special_tokens_dict()
with tf.compat.v1.Session() as sess:
special_tokens_numpy = sess.run(special_tokens_tensors)
return {k: v.item() # Numpy to Python.
for k, v in special_tokens_numpy.items()}
def input_fn():
self.assertFalse(tf.executing_eagerly())
# Build a preprocessing Model.
sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
preprocess = tf.saved_model.load(preprocess_export_path)
tokenize = hub.KerasLayer(preprocess.tokenize)
special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
for k, v in special_tokens_dict.items():
self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
tokens = tokenize(sentences)
packed_inputs = layers.BertPackInputs(
4, special_tokens_dict=special_tokens_dict)(tokens)
preprocessing = tf.keras.Model(sentences, packed_inputs)
# Map the dataset.
ds = tf.data.Dataset.from_tensors(
(tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
ds = ds.map(lambda features, labels: (preprocessing(features), labels))
return ds
def model_fn(features, labels, mode):
del labels # Unused.
return tf.estimator.EstimatorSpec(mode=mode,
predictions=features["input_word_ids"])
estimator = tf.estimator.Estimator(model_fn=model_fn)
outputs = list(estimator.predict(input_fn))
self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
[2, 4, 5, 3]]))
# TODO(b/175369555): Remove that code and its test.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_check_no_assert(self, use_sp_model):
"""Tests the self-check during export without assertions."""
preprocess_export_path = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
use_sp_model=use_sp_model, tokenize_with_offsets=False,
experimental_disable_assert=False)
with self.assertRaisesRegex(AssertionError,
r"failed to suppress \d+ Assert ops"):
export_tfhub_lib._check_no_assert(preprocess_export_path)
def _result_shapes_in_tf_function(fn, *args, **kwargs):
"""Returns shapes (as lists) observed on the result of `fn`.
Args:
fn: A callable.
*args: TensorSpecs for Tensor-valued arguments and actual values
for Python-valued arguments to fn.
**kwargs: Same for keyword arguments.
Returns:
The nest of partial tensor shapes (as lists) that is statically known inside
tf.function(fn)(*args, **kwargs) for the nest of its results.
"""
# Use a captured mutable container for a side output from the wrapper.
uninitialized = "uninitialized!"
result_shapes_container = [uninitialized]
assert result_shapes_container[0] is uninitialized
@tf.function
def shape_reporting_wrapper(*args, **kwargs):
result = fn(*args, **kwargs)
result_shapes_container[0] = tf.nest.map_structure(
lambda x: x.shape.as_list(), result)
return result
shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
assert result_shapes_container[0] is not uninitialized
return result_shapes_container[0]
if __name__ == "__main__":
tf.test.main()