Commit 56109286 authored by A. Unique TensorFlower's avatar A. Unique TensorFlower
Browse files

Test call to .mlm subobject with preprocessed inputs.

PiperOrigin-RevId: 359696240
parent 8fba84f8
...@@ -21,6 +21,7 @@ from absl.testing import parameterized ...@@ -21,6 +21,7 @@ from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
import tensorflow_hub as hub import tensorflow_hub as hub
import tensorflow_text as text
from sentencepiece import SentencePieceTrainer from sentencepiece import SentencePieceTrainer
from official.modeling import tf_utils from official.modeling import tf_utils
...@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib ...@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size, def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
num_hidden_layers): num_hidden_layers, vocab_size=100):
"""Returns config args for export_tfhub_lib._create_model().""" """Returns config args for export_tfhub_lib._create_model()."""
if use_bert_config: if use_bert_config:
bert_config = configs.BertConfig( bert_config = configs.BertConfig(
vocab_size=100, vocab_size=vocab_size,
hidden_size=hidden_size, hidden_size=hidden_size,
intermediate_size=32, intermediate_size=32,
max_position_embeddings=128, max_position_embeddings=128,
...@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size, ...@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
encoder_config = encoders.EncoderConfig( encoder_config = encoders.EncoderConfig(
type="albert", type="albert",
albert=encoders.AlbertEncoderConfig( albert=encoders.AlbertEncoderConfig(
vocab_size=100, vocab_size=vocab_size,
embedding_width=16, embedding_width=16,
hidden_size=hidden_size, hidden_size=hidden_size,
intermediate_size=32, intermediate_size=32,
...@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_" ...@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_"
class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _make_vocab_file(self, vocab, filename="vocab.txt"): def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
"""Creates wordpiece vocab file with given words plus special tokens. """Creates wordpiece vocab file with given words plus special tokens.
The tokens of the resulting model are, in this order: The tokens of the resulting model are, in this order:
[PAD], [UNK], [CLS], [SEP], ...vocab... [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
*=if requested by args.
This function also accepts wordpieces that start with the ## continuation This function also accepts wordpieces that start with the ## continuation
marker, but avoiding those makes this function interchangeable with marker, but avoiding those makes this function interchangeable with
...@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
model's vocabulary. Do not include special tokens here. model's vocabulary. Do not include special tokens here.
filename: Optionally, a filename (relative to the temporary directory filename: Optionally, a filename (relative to the temporary directory
created by this function). created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns: Returns:
The absolute filename of the created vocab file. The absolute filename of the created vocab file.
""" """
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
] + ["[MASK]"]*add_mask_token + vocab
path = os.path.join( path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time. tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time.
prefix=_STRING_NOT_TO_LEAK), prefix=_STRING_NOT_TO_LEAK),
...@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
f.write("\n".join(full_vocab + [""])) f.write("\n".join(full_vocab + [""]))
return path return path
def _make_sp_model_file(self, vocab, prefix="spm"): def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
"""Creates Sentencepiece word model with given words plus special tokens. """Creates Sentencepiece word model with given words plus special tokens.
The tokens of the resulting model are, in this order: The tokens of the resulting model are, in this order:
<pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s> <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
*=if requested by args.
The words in the input vocab are plain text, without the whitespace marker. The words in the input vocab are plain text, without the whitespace marker.
That makes this function interchangeable with _make_vocab_file(). That makes this function interchangeable with _make_vocab_file().
...@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
vocabulary. Do not include special tokens here. vocabulary. Do not include special tokens here.
prefix: an optional string, to change the filename prefix for the model prefix: an optional string, to change the filename prefix for the model
(relative to the temporary directory created by this function). (relative to the temporary directory created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns: Returns:
The absolute filename of the created Sentencepiece model file. The absolute filename of the created Sentencepiece model file.
...@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
input_text.append(" ".join([token] * (len(vocab) - i))) input_text.append(" ".join([token] * (len(vocab) - i)))
with tf.io.gfile.GFile(input_file, "w") as f: with tf.io.gfile.GFile(input_file, "w") as f:
f.write("\n".join(input_text + [""])) f.write("\n".join(input_text + [""]))
control_symbols = "[CLS],[SEP]"
full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>. full_vocab_size = len(vocab) + 6 # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
if add_mask_token:
control_symbols += ",[MASK]"
full_vocab_size += 1
flags = dict( flags = dict(
model_prefix=model_prefix, model_prefix=model_prefix,
model_type="word", model_type="word",
input=input_file, input=input_file,
pad_id=0, unk_id=1, control_symbols="[CLS],[SEP]", pad_id=0, unk_id=1, control_symbols=control_symbols,
vocab_size=full_vocab_size, vocab_size=full_vocab_size,
bos_id=full_vocab_size-2, eos_id=full_vocab_size-1) bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
SentencePieceTrainer.Train( SentencePieceTrainer.Train(
...@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def _do_export(self, vocab, do_lower_case, default_seq_length=128, def _do_export(self, vocab, do_lower_case, default_seq_length=128,
tokenize_with_offsets=True, use_sp_model=False, tokenize_with_offsets=True, use_sp_model=False,
experimental_disable_assert=False): experimental_disable_assert=False, add_mask_token=False):
"""Runs SavedModel export and returns the export_path.""" """Runs SavedModel export and returns the export_path."""
export_path = tempfile.mkdtemp(dir=self.get_temp_dir()) export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
vocab_file = sp_model_file = None vocab_file = sp_model_file = None
if use_sp_model: if use_sp_model:
sp_model_file = self._make_sp_model_file(vocab) sp_model_file = self._make_sp_model_file(vocab,
add_mask_token=add_mask_token)
else: else:
vocab_file = self._make_vocab_file(vocab) vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
export_tfhub_lib.export_preprocessing( export_tfhub_lib.export_preprocessing(
export_path, export_path,
vocab_file=vocab_file, vocab_file=vocab_file,
...@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
  @parameterized.named_parameters(("Bert", True), ("Albert", False))
  def test_preprocessing_for_mlm(self, use_bert):
    """Combines both SavedModel types and TF.text helpers for MLM.

    End-to-end check that the exported preprocessing SavedModel and the
    exported encoder SavedModel (with its `.mlm` subobject) compose with
    tensorflow_text helpers: tokenize, trim, combine, mask, pad, then run
    the masked-language-model head and verify output shapes.
    """
    # Create the preprocessing SavedModel with a [MASK] token.
    preprocess = tf.saved_model.load(self._do_export(
        ["d", "ef", "abc", "xy"], do_lower_case=True,
        tokenize_with_offsets=use_bert,  # TODO(b/149576200): drop this.
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        add_mask_token=True, use_sp_model=not use_bert))
    # 4 vocab words plus special tokens: BERT wordpiece vocab adds
    # [PAD], [UNK], [CLS], [SEP], [MASK] (=5); the Sentencepiece model
    # additionally has <s> and </s> (=7). See _make_vocab_file and
    # _make_sp_model_file.
    vocab_size = 4+5 if use_bert else 4+7
    # Create the encoder SavedModel with an .mlm subobject.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers, vocab_size)
    _, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    # Save a checkpoint of the (randomly initialized) pretrainer so that
    # export_model() below has weights to restore.
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
        self.get_temp_dir(), use_sp_model=not use_bert)
    encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
    export_tfhub_lib.export_model(
        export_path=encoder_export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    encoder = tf.saved_model.load(encoder_export_path)
    # Get special tokens from the vocab (and vocab size).
    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
    padding_id = int(special_tokens_dict["padding_id"])
    self.assertEqual(padding_id, 0)
    start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
    self.assertEqual(start_of_sequence_id, 2)
    end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
    self.assertEqual(end_of_segment_id, 3)
    # [MASK] comes right after the four tokens above in both vocab layouts.
    mask_id = int(special_tokens_dict["mask_id"])
    self.assertEqual(mask_id, 4)
    # A batch of 3 segment pairs.
    raw_segments = [tf.constant(["hello", "nice movie", "quick brown fox"]),
                    tf.constant(["world", "great actors", "lazy dog"])]
    batch_size = 3
    # Misc hyperparameters.
    seq_length = 12
    max_selections_per_seq = 2
    # Tokenize inputs.
    tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
    # Trim inputs to eventually fit seq_length.
    # One [CLS] plus one [SEP] per segment.
    num_special_tokens = len(raw_segments) + 1
    trimmed_segments = text.WaterfallTrimmer(
        seq_length - num_special_tokens).trim(tokenized_segments)
    # Combine input segments into one input sequence.
    input_ids, segment_ids = text.combine_segments(
        trimmed_segments,
        start_of_sequence_id=start_of_sequence_id,
        end_of_segment_id=end_of_segment_id)
    # Apply random masking controlled by policy objects.
    (masked_input_ids, masked_lm_positions,
     masked_ids) = text.mask_language_model(
         input_ids=input_ids,
         item_selector=text.RandomItemSelector(
             max_selections_per_seq,
             selection_rate=0.15,
             unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
         mask_values_chooser=text.MaskValuesChooser(vocab_size=vocab_size,
                                                    mask_token=mask_id,
                                                    mask_token_rate=0.8,
                                                    random_token_rate=0.1))
    # Pad to fixed-length Transformer encoder inputs.
    input_word_ids, _ = text.pad_model_inputs(masked_input_ids,
                                              seq_length,
                                              pad_value=padding_id)
    input_type_ids, input_mask = text.pad_model_inputs(segment_ids, seq_length,
                                                       pad_value=0)
    masked_lm_positions, _ = text.pad_model_inputs(masked_lm_positions,
                                                   max_selections_per_seq,
                                                   pad_value=0)
    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
    num_predictions = int(tf.shape(masked_lm_positions)[1])
    # Call the MLM head of the Transformer encoder.
    mlm_inputs = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
        masked_lm_positions=masked_lm_positions,
    )
    mlm_outputs = encoder.mlm(mlm_inputs)
    self.assertEqual(mlm_outputs["pooled_output"].shape,
                     (batch_size, hidden_size))
    self.assertEqual(mlm_outputs["sequence_output"].shape,
                     (batch_size, seq_length, hidden_size))
    self.assertEqual(mlm_outputs["mlm_logits"].shape,
                     (batch_size, num_predictions, vocab_size))
    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)
    # A real trainer would now compute the loss of mlm_logits
    # trying to predict the masked_ids.
    del masked_ids  # Unused.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True)) @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_special_tokens_in_estimator(self, use_sp_model): def test_special_tokens_in_estimator(self, use_sp_model):
"""Tests getting special tokens without an Eager init context.""" """Tests getting special tokens without an Eager init context."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment