ModelZoo / ResNet50_tensorflow
Commit 28720cfa authored Feb 25, 2021 by A. Unique TensorFlower
Test call to .mlm subobject with preprocessed inputs.
PiperOrigin-RevId: 359696240
parent 052a2543
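For context, the exported encoder SavedModel exposes an .mlm subobject that takes a dict of already-preprocessed, fixed-length MLM inputs. A minimal sketch of that call (the load path and the zero-filled tensors are illustrative placeholders, not part of this commit; the new test below builds real inputs with TF.text instead):

import tensorflow as tf

# Illustrative export location; the test below exports to a temp dir instead.
encoder = tf.saved_model.load("/tmp/encoder_export")

batch_size, seq_length, max_predictions = 3, 12, 2
mlm_outputs = encoder.mlm(dict(
    input_word_ids=tf.zeros([batch_size, seq_length], tf.int32),
    input_mask=tf.zeros([batch_size, seq_length], tf.int32),
    input_type_ids=tf.zeros([batch_size, seq_length], tf.int32),
    masked_lm_positions=tf.zeros([batch_size, max_predictions], tf.int32)))
# mlm_outputs contains "pooled_output", "sequence_output", "mlm_logits", and
# "encoder_outputs"; the new test asserts their shapes.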
Showing 1 changed file with 135 additions and 12 deletions (+135 / -12)
official/nlp/tools/export_tfhub_lib_test.py
@@ -21,6 +21,7 @@ from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf
 import tensorflow_hub as hub
+import tensorflow_text as text
 from sentencepiece import SentencePieceTrainer

 from official.modeling import tf_utils
@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib
 def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
-                                       num_hidden_layers):
+                                       num_hidden_layers, vocab_size=100):
   """Returns config args for export_tfhub_lib._create_model()."""
   if use_bert_config:
     bert_config = configs.BertConfig(
-        vocab_size=100,
+        vocab_size=vocab_size,
         hidden_size=hidden_size,
         intermediate_size=32,
         max_position_embeddings=128,
@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
     encoder_config = encoders.EncoderConfig(
         type="albert",
         albert=encoders.AlbertEncoderConfig(
-            vocab_size=100,
+            vocab_size=vocab_size,
             embedding_width=16,
             hidden_size=hidden_size,
             intermediate_size=32,
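The new vocab_size argument lets a test size the encoder's embeddings and MLM output layer to a small test vocabulary instead of the hard-coded 100. A brief usage sketch, with the values used by test_preprocessing_for_mlm() later in this diff:

# BERT case of the new MLM test: 4 test words + 5 wordpiece special tokens.
bert_config, encoder_config = _get_bert_config_or_encoder_config(
    use_bert_config=True, hidden_size=16, num_hidden_layers=2, vocab_size=9)
# The Albert case passes use_bert_config=False and vocab_size=11
# (4 words + 7 sentencepiece special tokens).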
@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_"
 class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
 
-  def _make_vocab_file(self, vocab, filename="vocab.txt"):
+  def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
     """Creates wordpiece vocab file with given words plus special tokens.
 
     The tokens of the resulting model are, in this order:
-        [PAD], [UNK], [CLS], [SEP], ...vocab...
+        [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
+    *=if requested by args.
 
     This function also accepts wordpieces that start with the ## continuation
     marker, but avoiding those makes this function interchangeable with
@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
       model's vocabulary. Do not include special tokens here.
       filename: Optionally, a filename (relative to the temporary directory
         created by this function).
+      add_mask_token: an optional bool, whether to include a [MASK] token.
 
     Returns:
       The absolute filename of the created vocab file.
     """
-    full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab
+    full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
+                 ] + ["[MASK]"] * add_mask_token + vocab
     path = os.path.join(
         tempfile.mkdtemp(dir=self.get_temp_dir(),  # New subdir each time.
                          prefix=_STRING_NOT_TO_LEAK),
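A side note on the new full_vocab expression: multiplying a list by a bool works because False == 0 and True == 1, so ["[MASK]"] * add_mask_token contributes either zero or one entry. For the 4-word vocab used by the new MLM test, the resulting wordpiece file is ordered like this (a quick illustration, not part of the diff):

vocab = ["d", "ef", "abc", "xy"]  # as in test_preprocessing_for_mlm()
add_mask_token = True
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
             ] + ["[MASK]"] * add_mask_token + vocab
# -> ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "d", "ef", "abc", "xy"]
# ids:    0        1        2        3        4      5     6      7     8
assert len(full_vocab) == 4 + 5  # the test's vocab_size of 9 in the BERT case
# This is why the test expects padding_id == 0, start_of_sequence_id == 2,
# end_of_segment_id == 3, and mask_id == 4.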
@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
       f.write("\n".join(full_vocab + [""]))
     return path
 
-  def _make_sp_model_file(self, vocab, prefix="spm"):
+  def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
     """Creates Sentencepiece word model with given words plus special tokens.
 
     The tokens of the resulting model are, in this order:
-        <pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>
+        <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
+    *=if requested by args.
 
     The words in the input vocab are plain text, without the whitespace marker.
     That makes this function interchangeable with _make_vocab_file().
@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
       vocabulary. Do not include special tokens here.
       prefix: an optional string, to change the filename prefix for the model
         (relative to the temporary directory created by this function).
+      add_mask_token: an optional bool, whether to include a [MASK] token.
 
     Returns:
       The absolute filename of the created Sentencepiece model file.
@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
       input_text.append(" ".join([token] * (len(vocab) - i)))
     with tf.io.gfile.GFile(input_file, "w") as f:
       f.write("\n".join(input_text + [""]))
+    control_symbols = "[CLS],[SEP]"
     full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
+    if add_mask_token:
+      control_symbols += ",[MASK]"
+      full_vocab_size += 1
     flags = dict(
         model_prefix=model_prefix,
         model_type="word",
         input=input_file,
-        pad_id=0, unk_id=1, control_symbols="[CLS],[SEP]",
+        pad_id=0, unk_id=1, control_symbols=control_symbols,
         vocab_size=full_vocab_size,
         bos_id=full_vocab_size-2, eos_id=full_vocab_size-1)
     SentencePieceTrainer.Train(
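The SentencePiece counterpart keeps the same leading ids by registering [MASK] as an extra control symbol and growing the trained vocab by one; per the docstring above, the token order is <pad>, <unk>, [CLS], [SEP], [MASK], ...vocab..., <s>, </s>. A small sanity check of that bookkeeping for the test's 4-word vocab (illustrative, not part of the diff):

vocab = ["d", "ef", "abc", "xy"]
add_mask_token = True
full_vocab_size = len(vocab) + 6 + int(add_mask_token)
assert full_vocab_size == 4 + 7  # the test's vocab_size of 11 in the Albert case
# ids: 0 <pad>, 1 <unk>, 2 [CLS], 3 [SEP], 4 [MASK], 5..8 vocab words,
#      9 <s> (bos_id = 11 - 2), 10 </s> (eos_id = 11 - 1)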
@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
 
   def _do_export(self, vocab, do_lower_case, default_seq_length=128,
                  tokenize_with_offsets=True, use_sp_model=False,
-                 experimental_disable_assert=False):
+                 experimental_disable_assert=False, add_mask_token=False):
     """Runs SavedModel export and returns the export_path."""
     export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
     vocab_file = sp_model_file = None
     if use_sp_model:
-      sp_model_file = self._make_sp_model_file(vocab)
+      sp_model_file = self._make_sp_model_file(vocab,
+                                               add_mask_token=add_mask_token)
     else:
-      vocab_file = self._make_vocab_file(vocab)
+      vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
     export_tfhub_lib.export_preprocessing(
         export_path,
         vocab_file=vocab_file,
@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
         tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
 
+  @parameterized.named_parameters(("Bert", True), ("Albert", False))
+  def test_preprocessing_for_mlm(self, use_bert):
+    """Combines both SavedModel types and TF.text helpers for MLM."""
+    # Create the preprocessing SavedModel with a [MASK] token.
+    preprocess = tf.saved_model.load(self._do_export(
+        ["d", "ef", "abc", "xy"],
+        do_lower_case=True,
+        tokenize_with_offsets=use_bert,  # TODO(b/149576200): drop this.
+        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
+        add_mask_token=True,
+        use_sp_model=not use_bert))
+    vocab_size = 4 + 5 if use_bert else 4 + 7
+
+    # Create the encoder SavedModel with an .mlm subobject.
+    hidden_size = 16
+    num_hidden_layers = 2
+    bert_config, encoder_config = _get_bert_config_or_encoder_config(
+        use_bert, hidden_size, num_hidden_layers, vocab_size)
+    _, pretrainer = export_tfhub_lib._create_model(
+        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
+    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
+    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
+    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
+    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
+    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
+        self.get_temp_dir(), use_sp_model=not use_bert)
+    encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
+    export_tfhub_lib.export_model(
+        export_path=encoder_export_path,
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        model_checkpoint_path=model_checkpoint_path,
+        with_mlm=True,
+        vocab_file=vocab_file,
+        sp_model_file=sp_model_file,
+        do_lower_case=True)
+    encoder = tf.saved_model.load(encoder_export_path)
+
+    # Get special tokens from the vocab (and vocab size).
+    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
+    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
+    padding_id = int(special_tokens_dict["padding_id"])
+    self.assertEqual(padding_id, 0)
+    start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
+    self.assertEqual(start_of_sequence_id, 2)
+    end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
+    self.assertEqual(end_of_segment_id, 3)
+    mask_id = int(special_tokens_dict["mask_id"])
+    self.assertEqual(mask_id, 4)
+
+    # A batch of 3 segment pairs.
+    raw_segments = [tf.constant(["hello", "nice movie", "quick brown fox"]),
+                    tf.constant(["world", "great actors", "lazy dog"])]
+    batch_size = 3
+
+    # Misc hyperparameters.
+    seq_length = 12
+    max_selections_per_seq = 2
+
+    # Tokenize inputs.
+    tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
+    # Trim inputs to eventually fit seq_length.
+    num_special_tokens = len(raw_segments) + 1
+    trimmed_segments = text.WaterfallTrimmer(
+        seq_length - num_special_tokens).trim(tokenized_segments)
+    # Combine input segments into one input sequence.
+    input_ids, segment_ids = text.combine_segments(
+        trimmed_segments,
+        start_of_sequence_id=start_of_sequence_id,
+        end_of_segment_id=end_of_segment_id)
+    # Apply random masking controlled by policy objects.
+    (masked_input_ids, masked_lm_positions,
+     masked_ids) = text.mask_language_model(
+         input_ids=input_ids,
+         item_selector=text.RandomItemSelector(
+             max_selections_per_seq,
+             selection_rate=0.15,
+             unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
+         mask_values_chooser=text.MaskValuesChooser(
+             vocab_size=vocab_size, mask_token=mask_id,
+             mask_token_rate=0.8, random_token_rate=0.1))
+    # Pad to fixed-length Transformer encoder inputs.
+    input_word_ids, _ = text.pad_model_inputs(masked_input_ids, seq_length,
+                                              pad_value=padding_id)
+    input_type_ids, input_mask = text.pad_model_inputs(segment_ids, seq_length,
+                                                       pad_value=0)
+    masked_lm_positions, _ = text.pad_model_inputs(masked_lm_positions,
+                                                   max_selections_per_seq,
+                                                   pad_value=0)
+    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
+    num_predictions = int(tf.shape(masked_lm_positions)[1])
+
+    # Call the MLM head of the Transformer encoder.
+    mlm_inputs = dict(
+        input_word_ids=input_word_ids,
+        input_mask=input_mask,
+        input_type_ids=input_type_ids,
+        masked_lm_positions=masked_lm_positions,
+    )
+    mlm_outputs = encoder.mlm(mlm_inputs)
+    self.assertEqual(mlm_outputs["pooled_output"].shape,
+                     (batch_size, hidden_size))
+    self.assertEqual(mlm_outputs["sequence_output"].shape,
+                     (batch_size, seq_length, hidden_size))
+    self.assertEqual(mlm_outputs["mlm_logits"].shape,
+                     (batch_size, num_predictions, vocab_size))
+    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)
+
+    # A real trainer would now compute the loss of mlm_logits
+    # trying to predict the masked_ids.
+    del masked_ids  # Unused.
+
   @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
   def test_special_tokens_in_estimator(self, use_sp_model):
     """Tests getting special tokens without an Eager init context."""
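The new test stops at the MLM forward pass ("A real trainer would now compute the loss..."). A rough sketch of that missing step, assuming masked_ids is padded the same way as masked_lm_positions and the padding mask is used as per-position loss weights (names like masked_lm_weights are placeholders, not from this commit):

import tensorflow as tf
import tensorflow_text as text

# Continuing from the variables in test_preprocessing_for_mlm():
# pad the original ids at the masked positions; the second return value is a
# 0/1 mask marking which slots are real selections vs. padding.
masked_lm_ids, masked_lm_weights = text.pad_model_inputs(
    masked_ids, max_selections_per_seq, pad_value=0)
per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
    masked_lm_ids, mlm_outputs["mlm_logits"], from_logits=True)
weights = tf.cast(masked_lm_weights, per_position_loss.dtype)
# Average the loss over the real (non-padded) masked positions only.
mlm_loss = tf.reduce_sum(per_position_loss * weights) / (
    tf.reduce_sum(weights) + 1e-5)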