Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
56109286
Commit
56109286
authored
Feb 25, 2021
by
A. Unique TensorFlower
Browse files
Test call to .mlm subobject with preprocessed inputs.
PiperOrigin-RevId: 359696240
parent
8fba84f8
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
135 additions
and
12 deletions
+135
-12
official/nlp/tools/export_tfhub_lib_test.py
official/nlp/tools/export_tfhub_lib_test.py
+135
-12
No files found.
official/nlp/tools/export_tfhub_lib_test.py
View file @
56109286
...
@@ -21,6 +21,7 @@ from absl.testing import parameterized
...
@@ -21,6 +21,7 @@ from absl.testing import parameterized
import
numpy
as
np
import
numpy
as
np
import
tensorflow
as
tf
import
tensorflow
as
tf
import
tensorflow_hub
as
hub
import
tensorflow_hub
as
hub
import
tensorflow_text
as
text
from
sentencepiece
import
SentencePieceTrainer
from
sentencepiece
import
SentencePieceTrainer
from
official.modeling
import
tf_utils
from
official.modeling
import
tf_utils
...
@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib
...
@@ -32,11 +33,11 @@ from official.nlp.tools import export_tfhub_lib
def
_get_bert_config_or_encoder_config
(
use_bert_config
,
hidden_size
,
def
_get_bert_config_or_encoder_config
(
use_bert_config
,
hidden_size
,
num_hidden_layers
):
num_hidden_layers
,
vocab_size
=
100
):
"""Returns config args for export_tfhub_lib._create_model()."""
"""Returns config args for export_tfhub_lib._create_model()."""
if
use_bert_config
:
if
use_bert_config
:
bert_config
=
configs
.
BertConfig
(
bert_config
=
configs
.
BertConfig
(
vocab_size
=
100
,
vocab_size
=
vocab_size
,
hidden_size
=
hidden_size
,
hidden_size
=
hidden_size
,
intermediate_size
=
32
,
intermediate_size
=
32
,
max_position_embeddings
=
128
,
max_position_embeddings
=
128
,
...
@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
...
@@ -48,7 +49,7 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
encoder_config
=
encoders
.
EncoderConfig
(
encoder_config
=
encoders
.
EncoderConfig
(
type
=
"albert"
,
type
=
"albert"
,
albert
=
encoders
.
AlbertEncoderConfig
(
albert
=
encoders
.
AlbertEncoderConfig
(
vocab_size
=
100
,
vocab_size
=
vocab_size
,
embedding_width
=
16
,
embedding_width
=
16
,
hidden_size
=
hidden_size
,
hidden_size
=
hidden_size
,
intermediate_size
=
32
,
intermediate_size
=
32
,
...
@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_"
...
@@ -450,11 +451,12 @@ _STRING_NOT_TO_LEAK = "private_path_component_"
class
ExportPreprocessingTest
(
tf
.
test
.
TestCase
,
parameterized
.
TestCase
):
class
ExportPreprocessingTest
(
tf
.
test
.
TestCase
,
parameterized
.
TestCase
):
def
_make_vocab_file
(
self
,
vocab
,
filename
=
"vocab.txt"
):
def
_make_vocab_file
(
self
,
vocab
,
filename
=
"vocab.txt"
,
add_mask_token
=
False
):
"""Creates wordpiece vocab file with given words plus special tokens.
"""Creates wordpiece vocab file with given words plus special tokens.
The tokens of the resulting model are, in this order:
The tokens of the resulting model are, in this order:
[PAD], [UNK], [CLS], [SEP], ...vocab...
[PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
*=if requested by args.
This function also accepts wordpieces that start with the ## continuation
This function also accepts wordpieces that start with the ## continuation
marker, but avoiding those makes this function interchangeable with
marker, but avoiding those makes this function interchangeable with
...
@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -465,11 +467,13 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
model's vocabulary. Do not include special tokens here.
model's vocabulary. Do not include special tokens here.
filename: Optionally, a filename (relative to the temporary directory
filename: Optionally, a filename (relative to the temporary directory
created by this function).
created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns:
Returns:
The absolute filename of the created vocab file.
The absolute filename of the created vocab file.
"""
"""
full_vocab
=
[
"[PAD]"
,
"[UNK]"
,
"[CLS]"
,
"[SEP]"
]
+
vocab
full_vocab
=
[
"[PAD]"
,
"[UNK]"
,
"[CLS]"
,
"[SEP]"
]
+
[
"[MASK]"
]
*
add_mask_token
+
vocab
path
=
os
.
path
.
join
(
path
=
os
.
path
.
join
(
tempfile
.
mkdtemp
(
dir
=
self
.
get_temp_dir
(),
# New subdir each time.
tempfile
.
mkdtemp
(
dir
=
self
.
get_temp_dir
(),
# New subdir each time.
prefix
=
_STRING_NOT_TO_LEAK
),
prefix
=
_STRING_NOT_TO_LEAK
),
...
@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -478,11 +482,12 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
f
.
write
(
"
\n
"
.
join
(
full_vocab
+
[
""
]))
f
.
write
(
"
\n
"
.
join
(
full_vocab
+
[
""
]))
return
path
return
path
def
_make_sp_model_file
(
self
,
vocab
,
prefix
=
"spm"
):
def
_make_sp_model_file
(
self
,
vocab
,
prefix
=
"spm"
,
add_mask_token
=
False
):
"""Creates Sentencepiece word model with given words plus special tokens.
"""Creates Sentencepiece word model with given words plus special tokens.
The tokens of the resulting model are, in this order:
The tokens of the resulting model are, in this order:
<pad>, <unk>, [CLS], [SEP], ...vocab..., <s>, </s>
<pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
*=if requested by args.
The words in the input vocab are plain text, without the whitespace marker.
The words in the input vocab are plain text, without the whitespace marker.
That makes this function interchangeable with _make_vocab_file().
That makes this function interchangeable with _make_vocab_file().
...
@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -492,6 +497,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
vocabulary. Do not include special tokens here.
vocabulary. Do not include special tokens here.
prefix: an optional string, to change the filename prefix for the model
prefix: an optional string, to change the filename prefix for the model
(relative to the temporary directory created by this function).
(relative to the temporary directory created by this function).
add_mask_token: an optional bool, whether to include a [MASK] token.
Returns:
Returns:
The absolute filename of the created Sentencepiece model file.
The absolute filename of the created Sentencepiece model file.
...
@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -507,12 +513,16 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
input_text
.
append
(
" "
.
join
([
token
]
*
(
len
(
vocab
)
-
i
)))
input_text
.
append
(
" "
.
join
([
token
]
*
(
len
(
vocab
)
-
i
)))
with
tf
.
io
.
gfile
.
GFile
(
input_file
,
"w"
)
as
f
:
with
tf
.
io
.
gfile
.
GFile
(
input_file
,
"w"
)
as
f
:
f
.
write
(
"
\n
"
.
join
(
input_text
+
[
""
]))
f
.
write
(
"
\n
"
.
join
(
input_text
+
[
""
]))
control_symbols
=
"[CLS],[SEP]"
full_vocab_size
=
len
(
vocab
)
+
6
# <pad>, <unk>, [CLS], [SEP], <s>, </s>.
full_vocab_size
=
len
(
vocab
)
+
6
# <pad>, <unk>, [CLS], [SEP], <s>, </s>.
if
add_mask_token
:
control_symbols
+=
",[MASK]"
full_vocab_size
+=
1
flags
=
dict
(
flags
=
dict
(
model_prefix
=
model_prefix
,
model_prefix
=
model_prefix
,
model_type
=
"word"
,
model_type
=
"word"
,
input
=
input_file
,
input
=
input_file
,
pad_id
=
0
,
unk_id
=
1
,
control_symbols
=
"[CLS],[SEP]"
,
pad_id
=
0
,
unk_id
=
1
,
control_symbols
=
control_symbols
,
vocab_size
=
full_vocab_size
,
vocab_size
=
full_vocab_size
,
bos_id
=
full_vocab_size
-
2
,
eos_id
=
full_vocab_size
-
1
)
bos_id
=
full_vocab_size
-
2
,
eos_id
=
full_vocab_size
-
1
)
SentencePieceTrainer
.
Train
(
SentencePieceTrainer
.
Train
(
...
@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -521,14 +531,15 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def
_do_export
(
self
,
vocab
,
do_lower_case
,
default_seq_length
=
128
,
def
_do_export
(
self
,
vocab
,
do_lower_case
,
default_seq_length
=
128
,
tokenize_with_offsets
=
True
,
use_sp_model
=
False
,
tokenize_with_offsets
=
True
,
use_sp_model
=
False
,
experimental_disable_assert
=
False
):
experimental_disable_assert
=
False
,
add_mask_token
=
False
):
"""Runs SavedModel export and returns the export_path."""
"""Runs SavedModel export and returns the export_path."""
export_path
=
tempfile
.
mkdtemp
(
dir
=
self
.
get_temp_dir
())
export_path
=
tempfile
.
mkdtemp
(
dir
=
self
.
get_temp_dir
())
vocab_file
=
sp_model_file
=
None
vocab_file
=
sp_model_file
=
None
if
use_sp_model
:
if
use_sp_model
:
sp_model_file
=
self
.
_make_sp_model_file
(
vocab
)
sp_model_file
=
self
.
_make_sp_model_file
(
vocab
,
add_mask_token
=
add_mask_token
)
else
:
else
:
vocab_file
=
self
.
_make_vocab_file
(
vocab
)
vocab_file
=
self
.
_make_vocab_file
(
vocab
,
add_mask_token
=
add_mask_token
)
export_tfhub_lib
.
export_preprocessing
(
export_tfhub_lib
.
export_preprocessing
(
export_path
,
export_path
,
vocab_file
=
vocab_file
,
vocab_file
=
vocab_file
,
...
@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -751,6 +762,118 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
tf
.
constant
([[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
],
tf
.
constant
([[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
],
[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]]))
[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]]))
@parameterized.named_parameters(("Bert", True), ("Albert", False))
def test_preprocessing_for_mlm(self, use_bert):
    """Feeds exported preprocessing through TF.text MLM helpers into .mlm.

    The test exports a preprocessing SavedModel whose vocabulary contains a
    [MASK] token and an encoder SavedModel with an .mlm subobject. It then
    tokenizes a small batch of segment pairs, trims, combines, masks and
    pads them with TF.text helpers, and finally checks the shapes of the
    outputs returned by encoder.mlm().

    Args:
      use_bert: bool. If true, a BertConfig and a wordpiece vocab file are
        used; otherwise an EncoderConfig and a Sentencepiece model are used.
    """
    use_sp_model = not use_bert

    # Export and load the preprocessing SavedModel (vocab has a [MASK] token).
    preprocess_export_path = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        tokenize_with_offsets=use_bert,  # TODO(b/149576200): drop this.
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        add_mask_token=True,
        use_sp_model=use_sp_model)
    preprocess = tf.saved_model.load(preprocess_export_path)
    # Four words plus the special tokens: the wordpiece vocab file adds
    # [PAD], [UNK], [CLS], [SEP], [MASK]; the Sentencepiece model adds
    # <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
    if use_bert:
        vocab_size = 4 + 5
    else:
        vocab_size = 4 + 7

    # Build a pretrainer, checkpoint it, and export the encoder SavedModel
    # with an .mlm subobject from that checkpoint.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers, vocab_size)
    _, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config,
        encoder_config=encoder_config,
        with_mlm=True)
    temp_root = self.get_temp_dir()
    model_checkpoint_dir = os.path.join(temp_root, "checkpoint")
    tf.train.Checkpoint(**pretrainer.checkpoint_items).save(
        os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
    # The dummy vocab / Sentencepiece files are only passed to the export;
    # they are not used below.
    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        temp_root, use_sp_model=use_sp_model)
    encoder_export_path = os.path.join(temp_root, "encoder_export")
    export_tfhub_lib.export_model(
        export_path=encoder_export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    encoder = tf.saved_model.load(encoder_export_path)

    # Read the vocab size and special token ids back from the preprocessor
    # and check each one as soon as it has been read.
    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
    special_ids = {}
    for token_key, expected_id in (("padding_id", 0),
                                   ("start_of_sequence_id", 2),
                                   ("end_of_segment_id", 3),
                                   ("mask_id", 4)):
        special_ids[token_key] = int(special_tokens_dict[token_key])
        self.assertEqual(special_ids[token_key], expected_id)
    padding_id = special_ids["padding_id"]
    start_of_sequence_id = special_ids["start_of_sequence_id"]
    end_of_segment_id = special_ids["end_of_segment_id"]
    mask_id = special_ids["mask_id"]

    # A batch of 3 segment pairs.
    first_segments = tf.constant(["hello", "nice movie", "quick brown fox"])
    second_segments = tf.constant(["world", "great actors", "lazy dog"])
    raw_segments = [first_segments, second_segments]
    batch_size = 3

    # Misc hyperparameters.
    seq_length = 12
    max_selections_per_seq = 2

    # Tokenize inputs.
    tokenized_segments = [
        preprocess.tokenize(segment) for segment in raw_segments
    ]
    # Trim inputs to eventually fit seq_length: one start-of-sequence token
    # plus one end-of-segment token per segment are added when combining.
    num_special_tokens = len(raw_segments) + 1
    trimmer = text.WaterfallTrimmer(seq_length - num_special_tokens)
    trimmed_segments = trimmer.trim(tokenized_segments)
    # Combine input segments into one input sequence.
    input_ids, segment_ids = text.combine_segments(
        trimmed_segments,
        start_of_sequence_id=start_of_sequence_id,
        end_of_segment_id=end_of_segment_id)
    # Apply random masking controlled by policy objects.
    item_selector = text.RandomItemSelector(
        max_selections_per_seq,
        selection_rate=0.15,
        unselectable_ids=[start_of_sequence_id, end_of_segment_id])
    mask_values_chooser = text.MaskValuesChooser(
        vocab_size=vocab_size,
        mask_token=mask_id,
        mask_token_rate=0.8,
        random_token_rate=0.1)
    masked_input_ids, masked_lm_positions, masked_ids = (
        text.mask_language_model(
            input_ids=input_ids,
            item_selector=item_selector,
            mask_values_chooser=mask_values_chooser))
    # Pad to fixed-length Transformer encoder inputs.
    input_word_ids, _ = text.pad_model_inputs(
        masked_input_ids, seq_length, pad_value=padding_id)
    input_type_ids, input_mask = text.pad_model_inputs(
        segment_ids, seq_length, pad_value=0)
    masked_lm_positions, _ = text.pad_model_inputs(
        masked_lm_positions, max_selections_per_seq, pad_value=0)
    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
    num_predictions = int(tf.shape(masked_lm_positions)[1])

    # Call the MLM head of the Transformer encoder.
    mlm_inputs = {
        "input_word_ids": input_word_ids,
        "input_mask": input_mask,
        "input_type_ids": input_type_ids,
        "masked_lm_positions": masked_lm_positions,
    }
    mlm_outputs = encoder.mlm(mlm_inputs)
    expected_shapes = (
        ("pooled_output", (batch_size, hidden_size)),
        ("sequence_output", (batch_size, seq_length, hidden_size)),
        ("mlm_logits", (batch_size, num_predictions, vocab_size)),
    )
    for output_name, expected_shape in expected_shapes:
        self.assertEqual(mlm_outputs[output_name].shape, expected_shape)
    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)

    # A real trainer would now compute the loss of mlm_logits
    # trying to predict the masked_ids.
    del masked_ids  # Unused.
@
parameterized
.
named_parameters
((
"Bert"
,
False
),
(
"Sentencepiece"
,
True
))
@
parameterized
.
named_parameters
((
"Bert"
,
False
),
(
"Sentencepiece"
,
True
))
def
test_special_tokens_in_estimator
(
self
,
use_sp_model
):
def
test_special_tokens_in_estimator
(
self
,
use_sp_model
):
"""Tests getting special tokens without an Eager init context."""
"""Tests getting special tokens without an Eager init context."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment