Commit 28720cfa authored by A. Unique TensorFlower

Test call to .mlm subobject with preprocessed inputs.

PiperOrigin-RevId: 359696240
parent 052a2543
@@ -21,6 +21,7 @@ from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sentencepiece import SentencePieceTrainer
from official.modeling import tf_utils
@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
num_hidden_layers):
num_hidden_layers, vocab_size=100):
"""Returns config args for export_tfhub_lib._create_model()."""
if use_bert_config:
bert_config = configs.BertConfig(
vocab_size=100,
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
encoder_config = encoders.EncoderConfig(
type="albert",
albert=encoders.AlbertEncoderConfig(
vocab_size=100,
vocab_size=vocab_size,
embedding_width=16,
hidden_size=hidden_size,
intermediate_size=32,
@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_"
class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _make_vocab_file(self, vocab, filename="vocab.txt"):
def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
"""Creates wordpiece vocab file with given words plus special tokens.
The tokens of the resulting model are, in this order:
[PAD], [UNK], [CLS], [SEP], ...vocab...
[PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
*=if requested by args.
This function also accepts wordpieces that start with the ## continuation
marker, but avoiding those makes this function interchangeable with
@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
model's vocabulary. Do not include special tokens here.
filename: Optionally, a filename (relative to the temporary directory
created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns:
The absolute filename of the created vocab file.
"""
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
] + ["[MASK]"]*add_mask_token + vocab
path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time.
prefix=_STRING_NOT_TO_LEAK),
@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
f.write("\n".join(full_vocab + [""]))
return path
def _make_sp_model_file(self, vocab, prefix="spm"):
def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
"""Creates Sentencepiece word model with given words plus special tokens.
The tokens of the resulting model are, in this order:
<pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>
<pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
*=if requested by args.
The words in the input vocab are plain text, without the whitespace marker.
That makes this function interchangeable with _make_vocab_file().
@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
vocabulary. Do not include special tokens here.
prefix: an optional string, to change the filename prefix for the model
(relative to the temporary directory created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns:
The absolute filename of the created Sentencepiece model file.
@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
input_text.append(" ".join([token] * (len(vocab) - i)))
with tf.io.gfile.GFile(input_file, "w") as f:
f.write("\n".join(input_text + [""]))
control_symbols = "[CLS],[SEP]"
full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
if add_mask_token:
control_symbols += ",[MASK]"
full_vocab_size += 1
flags = dict(
model_prefix=model_prefix,
model_type="word",
input=input_file,
pad_id=0, unk_id=1, control_symbols="[CLS],[SEP]",
pad_id=0, unk_id=1, control_symbols=control_symbols,
vocab_size=full_vocab_size,
bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
SentencePieceTrainer.Train(
@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _do_export(self, vocab, do_lower_case, default_seq_length=128,
tokenize_with_offsets=True, use_sp_model=False,
experimental_disable_assert=False):
experimental_disable_assert=False, add_mask_token=False):
"""Runs SavedModel export and returns the export_path."""
export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
vocab_file = sp_model_file = None
if use_sp_model:
sp_model_file = self._make_sp_model_file(vocab)
sp_model_file = self._make_sp_model_file(vocab,
add_mask_token=add_mask_token)
else:
vocab_file = self._make_vocab_file(vocab)
vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
export_tfhub_lib.export_preprocessing(
export_path,
vocab_file=vocab_file,
@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
@parameterized.named_parameters(("Bert", True), ("Albert", False))
def test_preprocessing_for_mlm(self, use_bert):
"""Combines both SavedModel types and TF.text helpers for MLM."""
# Create the preprocessing SavedModel with a [MASK] token.
preprocess = tf.saved_model.load(self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True,
tokenize_with_offsets=use_bert, # TODO(b/149576200): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
add_mask_token=True, use_sp_model=not use_bert))
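# 4 words in the test vocab plus 5 special tokens ([PAD], [UNK], [CLS],
# [SEP], [MASK]) for the wordpiece vocab, or plus 7 for the sentencepiece
# model, which also reserves <s> and </s>.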
vocab_size = 4+5 if use_bert else 4+7
# Create the encoder SavedModel with an .mlm subobject.
hidden_size = 16
num_hidden_layers = 2
bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers, vocab_size)
_, pretrainer = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy( # Not used below.
self.get_temp_dir(), use_sp_model=not use_bert)
encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
export_tfhub_lib.export_model(
export_path=encoder_export_path,
bert_config=bert_config,
encoder_config=encoder_config,
model_checkpoint_path=model_checkpoint_path,
with_mlm=True,
vocab_file=vocab_file,
sp_model_file=sp_model_file,
do_lower_case=True)
encoder = tf.saved_model.load(encoder_export_path)
# Get special tokens from the vocab (and vocab size).
special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
padding_id = int(special_tokens_dict["padding_id"])
self.assertEqual(padding_id, 0)
start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
self.assertEqual(start_of_sequence_id, 2)
end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
self.assertEqual(end_of_segment_id, 3)
mask_id = int(special_tokens_dict["mask_id"])
self.assertEqual(mask_id, 4)
# A batch of 3 segment pairs.
raw_segments = [tf.constant(["hello", "nice movie", "quick brown fox"]),
tf.constant(["world", "great actors", "lazy dog"])]
batch_size = 3
# Misc hyperparameters.
seq_length = 12
max_selections_per_seq = 2
# Tokenize inputs.
tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
# Trim inputs to eventually fit seq_length.
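# combine_segments() below adds one [CLS] plus one [SEP] per segment.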
num_special_tokens = len(raw_segments) + 1
trimmed_segments = text.WaterfallTrimmer(
seq_length - num_special_tokens).trim(tokenized_segments)
# Combine input segments into one input sequence.
input_ids, segment_ids = text.combine_segments(
trimmed_segments,
start_of_sequence_id=start_of_sequence_id,
end_of_segment_id=end_of_segment_id)
# Apply random masking controlled by policy objects.
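# MaskValuesChooser follows the usual BERT recipe: 80% of the selected
# items become [MASK], 10% a random token id, and the remainder keep
# their original id.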
(masked_input_ids, masked_lm_positions,
masked_ids) = text.mask_language_model(
input_ids=input_ids,
item_selector=text.RandomItemSelector(
max_selections_per_seq,
selection_rate=0.15,
unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
mask_values_chooser=text.MaskValuesChooser(vocab_size=vocab_size,
mask_token=mask_id,
mask_token_rate=0.8,
random_token_rate=0.1))
# Pad to fixed-length Transformer encoder inputs.
input_word_ids, _ = text.pad_model_inputs(masked_input_ids,
seq_length,
pad_value=padding_id)
input_type_ids, input_mask = text.pad_model_inputs(segment_ids, seq_length,
pad_value=0)
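# The second output of pad_model_inputs() is the 0/1 padding mask, which
# doubles here as the encoder's input_mask.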
masked_lm_positions, _ = text.pad_model_inputs(masked_lm_positions,
max_selections_per_seq,
pad_value=0)
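# Cast to int32 to match the integer dtype the exported model expects.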
masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
num_predictions = int(tf.shape(masked_lm_positions)[1])
# Call the MLM head of the Transformer encoder.
mlm_inputs = dict(
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids,
masked_lm_positions=masked_lm_positions,
)
mlm_outputs = encoder.mlm(mlm_inputs)
self.assertEqual(mlm_outputs["pooled_output"].shape,
(batch_size, hidden_size))
self.assertEqual(mlm_outputs["sequence_output"].shape,
(batch_size, seq_length, hidden_size))
self.assertEqual(mlm_outputs["mlm_logits"].shape,
(batch_size, num_predictions, vocab_size))
self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)
# A real trainer would now compute the loss of mlm_logits
# trying to predict the masked_ids.
del masked_ids # Unused.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_special_tokens_in_estimator(self, use_sp_model):
"""Tests getting special tokens without an Eager init context."""
......