ModelZoo / ResNet50_tensorflow, commit 90dd6310

Authored Dec 21, 2021 by Frederick Liu; committed by A. Unique TensorFlower, Dec 21, 2021

Internal change

PiperOrigin-RevId: 417673004
Parent: ddaca60a
Changes: 27 files in the commit. Showing 7 changed files with 1850 additions and 0 deletions (+1850, -0).
official/legacy/transformer/transformer_main_test.py (+193, -0)
official/legacy/transformer/transformer_test.py (+98, -0)
official/legacy/transformer/translate.py (+190, -0)
official/legacy/transformer/utils/__init__.py (+14, -0)
official/legacy/transformer/utils/metrics.py (+491, -0)
official/legacy/transformer/utils/tokenizer.py (+660, -0)
official/legacy/transformer/utils/tokenizer_test.py (+204, -0)
official/legacy/transformer/transformer_main_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import os
import re
import sys
import unittest

from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports

from official.legacy.transformer import misc
from official.legacy.transformer import transformer_main

FLAGS = flags.FLAGS

FIXED_TIMESTAMP = 'my_time_stamp'
WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')


def _generate_file(filepath, lines):
  with open(filepath, 'w') as f:
    for l in lines:
      f.write('{}\n'.format(l))


class TransformerTaskTest(tf.test.TestCase):
  local_flags = None

  def setUp(self):  # pylint: disable=g-missing-super-call
    temp_dir = self.get_temp_dir()
    if TransformerTaskTest.local_flags is None:
      misc.define_transformer_flags()
      # Loads flags, array cannot be blank.
      flags.FLAGS(['foo'])
      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
    else:
      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
    FLAGS.param_set = 'tiny'
    FLAGS.use_synthetic_data = True
    FLAGS.steps_between_evals = 1
    FLAGS.train_steps = 1
    FLAGS.validation_steps = 1
    FLAGS.batch_size = 4
    FLAGS.max_length = 1
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.dtype = 'fp32'
    self.model_dir = FLAGS.model_dir
    self.temp_dir = temp_dir
    self.vocab_file = os.path.join(temp_dir, 'vocab')
    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
    self.orig_policy = (
        tf.compat.v2.keras.mixed_precision.global_policy())

  def tearDown(self):  # pylint: disable=g-missing-super-call
    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)

  def _assert_exists(self, filepath):
    self.assertTrue(os.path.exists(filepath))

  def test_train_no_dist_strat(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_save_full_model(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.save_weights_only = False
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_static_batch(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.distribution_strategy = 'one_device'
    if tf.test.is_built_with_cuda():
      FLAGS.num_gpus = 1
    else:
      FLAGS.num_gpus = 0
    FLAGS.static_batch = True
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_1_gpu_with_dist_strat(self):
    FLAGS.distribution_strategy = 'one_device'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_fp16(self):
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu_fp16(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def _prepare_files_and_flags(self, *extra_flags):
    # Make log dir.
    if not os.path.exists(self.temp_dir):
      os.makedirs(self.temp_dir)

    # Fake vocab, bleu_source and bleu_ref.
    tokens = [
        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'",
        "'b_'", "'c_'", "'d_'"
    ]
    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
    _generate_file(self.vocab_file, tokens)
    _generate_file(self.bleu_source, ['a b', 'c d'])
    _generate_file(self.bleu_ref, ['a b', 'd c'])

    # Update flags.
    update_flags = [
        'ignored_program_name',
        '--vocab_file={}'.format(self.vocab_file),
        '--bleu_source={}'.format(self.bleu_source),
        '--bleu_ref={}'.format(self.bleu_ref),
    ]
    if extra_flags:
      update_flags.extend(extra_flags)
    FLAGS(update_flags)

  def test_predict(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_predict_fp16(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags('--dtype=fp16')
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  def test_eval(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    if 'test_xla' in sys.argv[0]:
      self.skipTest('TODO(xla): Make this test faster under XLA.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.eval()


if __name__ == '__main__':
  tf.test.main()
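Note: the flag handling in setUp() above is the standard absl flagsaver pattern: parse the flags once with a dummy argv, snapshot the parsed values, then restore that snapshot before every test so flag mutations cannot leak between tests. A minimal self-contained sketch of just that pattern (the flag name my_flag is hypothetical, not part of this commit):

from absl import flags
from absl.testing import flagsaver

flags.DEFINE_string('my_flag', 'default', 'A hypothetical example flag.')
flags.FLAGS(['program_name'])  # Parse once; the argv array cannot be blank.

saved = flagsaver.save_flag_values()    # Snapshot all current flag values.
flags.FLAGS.my_flag = 'mutated-by-a-test'
flagsaver.restore_flag_values(saved)    # Roll back, as setUp() does above.
assert flags.FLAGS.my_flag == 'default'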
official/legacy/transformer/transformer_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import tensorflow as tf

from official.legacy.transformer import model_params
from official.legacy.transformer import transformer


class TransformerV2Test(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["use_synthetic_data"] = True
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 2
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 2
    params["beam_size"] = 3
    params["dtype"] = tf.float32

  def test_create_model_train(self):
    model = transformer.create_model(self.params, True)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 2)
    self.assertEqual(len(outputs), 1)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(inputs[1].shape.as_list(), [None, None])
    self.assertEqual(inputs[1].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
    self.assertEqual(outputs[0].dtype, tf.float32)

  def test_create_model_not_train(self):
    model = transformer.create_model(self.params, False)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 1)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None])
    self.assertEqual(outputs[0].dtype, tf.int32)
    self.assertEqual(outputs[1].shape.as_list(), [None])
    self.assertEqual(outputs[1].dtype, tf.float32)

  def test_export(self):
    model = transformer.Transformer(self.params, name="transformer_v2")
    export_dir = self.get_temp_dir()
    batch_size = 5
    max_length = 6

    class SaveModule(tf.Module):

      def __init__(self, model):
        super(SaveModule, self).__init__()
        self.model = model

      @tf.function
      def serve(self, x):
        return self.model.call([x], training=False)

    save_module = SaveModule(model)
    tensor_shape = (None, None)
    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
    _ = save_module.serve(sample_input)
    signatures = dict(
        serving_default=save_module.serve.get_concrete_function(
            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
    tf.saved_model.save(save_module, export_dir, signatures=signatures)

    imported = tf.saved_model.load(export_dir)
    serving_fn = imported.signatures["serving_default"]
    all_outputs = serving_fn(sample_input)
    output = all_outputs["outputs"]
    output_shapes = output.shape.as_list()
    self.assertEqual(output_shapes[0], batch_size)
    self.assertEqual(output_shapes[1],
                     max_length + model.params["extra_decode_length"])


if __name__ == "__main__":
  tf.test.main()
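For readers unfamiliar with the export path that test_export exercises, the sequence is: wrap the model in a tf.Module, trace a tf.function serving entry point, export with tf.saved_model.save(), and reload through the "serving_default" signature. A stripped-down sketch with a toy module standing in for the Transformer (the doubling op and the export path are illustrative assumptions, not part of this commit):

import tensorflow as tf


class ToyModule(tf.Module):

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="x")
  ])
  def serve(self, x):
    # Stand-in for model.call([x], training=False).
    return {"outputs": x * 2}


module = ToyModule()
export_dir = "/tmp/toy_export"  # Hypothetical path.
tf.saved_model.save(
    module, export_dir, signatures={"serving_default": module.serve})

imported = tf.saved_model.load(export_dir)
serving_fn = imported.signatures["serving_default"]
print(serving_fn(x=tf.zeros((5, 6), dtype=tf.int64))["outputs"].shape)  # (5, 6)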
official/legacy/transformer/translate.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translate text or files using trained transformer model."""
# Import libraries
from absl import logging
import numpy as np
import tensorflow as tf

from official.legacy.transformer.utils import tokenizer

_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6


def _get_sorted_inputs(filename):
  """Read and sort lines from the file sorted by decreasing length.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
  with tf.io.gfile.GFile(filename) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)

  sorted_inputs = [None] * len(sorted_input_lens)
  sorted_keys = [0] * len(sorted_input_lens)
  for i, (index, _) in enumerate(sorted_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
  return sorted_inputs, sorted_keys


def _encode_and_add_eos(line, subtokenizer):
  """Encode line with subtokenizer, and add EOS id to the end."""
  return subtokenizer.encode(line) + [tokenizer.EOS_ID]


def _trim_and_decode(ids, subtokenizer):
  """Trim EOS and PAD tokens from ids, and decode to return a string."""
  try:
    index = list(ids).index(tokenizer.EOS_ID)
    return subtokenizer.decode(ids[:index])
  except ValueError:  # No EOS found in sequence
    return subtokenizer.decode(ids)


def translate_file(model,
                   params,
                   subtokenizer,
                   input_file,
                   output_file=None,
                   print_all_translations=True,
                   distribution_strategy=None):
  """Translate lines in file, and save to output file if specified.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    input_file: A file containing lines to translate.
    output_file: A file that stores the generated translations.
    print_all_translations: A bool. If true, all translations are printed to
      stdout.
    distribution_strategy: A distribution strategy, used to perform inference
      directly with tf.function instead of Keras model.predict().

  Raises:
    ValueError: if output file is invalid.
  """
  batch_size = params["decode_batch_size"]

  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  total_samples = len(sorted_inputs)
  num_decode_batches = (total_samples - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i in range(num_decode_batches):
      lines = [
          sorted_inputs[j + i * batch_size]
          for j in range(batch_size)
          if j + i * batch_size < total_samples
      ]
      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
      if distribution_strategy:
        for j in range(batch_size - len(lines)):
          lines.append([tokenizer.EOS_ID])
      batch = tf.keras.preprocessing.sequence.pad_sequences(
          lines,
          maxlen=params["decode_max_length"],
          dtype="int32",
          padding="post")
      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
      yield batch

  @tf.function
  def predict_step(inputs):
    """Decoding step function for TPU runs."""

    def _step_fn(inputs):
      """Per replica step function."""
      tag = inputs[0]
      val_inputs = inputs[1]
      val_outputs, _ = model([val_inputs], training=False)
      return tag, val_outputs

    return distribution_strategy.run(_step_fn, args=(inputs,))

  translations = []
  if distribution_strategy:
    num_replicas = distribution_strategy.num_replicas_in_sync
    local_batch_size = params["decode_batch_size"] // num_replicas
  for i, text in enumerate(input_generator()):
    if distribution_strategy:
      text = np.reshape(text, [num_replicas, local_batch_size, -1])
      # Add tag to the input of each replica with the reordering logic after
      # outputs, to ensure the output order matches the input order.
      text = tf.constant(text)

      @tf.function
      def text_as_per_replica():
        replica_context = tf.distribute.get_replica_context()
        replica_id = replica_context.replica_id_in_sync_group
        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop

      text = distribution_strategy.run(text_as_per_replica)
      outputs = distribution_strategy.experimental_local_results(
          predict_step(text))
      val_outputs = [output for _, output in outputs]
      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
    else:
      val_outputs, _ = model.predict(text)

    length = len(val_outputs)
    for j in range(length):
      if j + i * batch_size < total_samples:
        translation = _trim_and_decode(val_outputs[j], subtokenizer)
        translations.append(translation)
        if print_all_translations:
          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
                       sorted_inputs[j + i * batch_size], translation)

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    if tf.io.gfile.isdir(output_file):
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    logging.info("Writing to file %s", output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])


def translate_from_text(model, subtokenizer, txt):
  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
  result = model.predict(encoded_txt)
  outputs = result["outputs"]
  logging.info("Original: \"%s\"", txt)
  translate_from_input(outputs, subtokenizer)


def translate_from_input(outputs, subtokenizer):
  translation = _trim_and_decode(outputs, subtokenizer)
  logging.info("Translation: \"%s\"", translation)
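To make the reorder bookkeeping in _get_sorted_inputs and translate_file concrete, here is the same logic run by hand on a three-line toy input; plain Python, nothing from this commit required:

inputs = ["a", "c d e", "b f"]  # Original file order.
input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
# sorted_input_lens == [(1, 3), (2, 2), (0, 1)]

sorted_inputs = [None] * len(inputs)
sorted_keys = [0] * len(inputs)
for i, (index, _) in enumerate(sorted_input_lens):
  sorted_inputs[i] = inputs[index]
  sorted_keys[index] = i

print(sorted_inputs)  # ['c d e', 'b f', 'a'], longest first for tight batches.
print(sorted_keys)    # [2, 0, 1]; translations[sorted_keys[k]] is line k's output.

Decoding longest-first keeps each padded batch tightly packed, and sorted_keys lets translate_file write results back in the original line order.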
official/legacy/transformer/utils/__init__.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/legacy/transformer/utils/metrics.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math

import numpy as np
import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf


def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]

    max_length = tf.maximum(x_length, y_length)

    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
    return x, y


def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
          confidence * tf.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights


def _convert_to_eval_metric(metric_fn):
  """Wrap a metric fn that returns scores and weights as an eval metric fn.

  The input metric_fn returns values for the current batch. The wrapper
  aggregates the return values collected over all of the batches evaluated.

  Args:
    metric_fn: function that returns scores and weights for the current batch's
      logits and predicted labels.

  Returns:
    function that aggregates the scores and weights from metric_fn.
  """

  def problem_metric_fn(*args):
    """Returns an aggregation of the metric_fn's returned values."""
    (scores, weights) = metric_fn(*args)

    # The tf.metrics.mean function assures correct aggregation.
    return tf.metrics.mean(scores, weights)

  return problem_metric_fn


def get_eval_metrics(logits, labels, params):
  """Return dictionary of model evaluation metrics."""
  metrics = {
      "accuracy":
          _convert_to_eval_metric(padded_accuracy)(logits, labels),
      "accuracy_top5":
          _convert_to_eval_metric(padded_accuracy_top5)(logits, labels),
      "accuracy_per_sequence":
          _convert_to_eval_metric(padded_sequence_accuracy)(logits, labels),
      "neg_log_perplexity":
          _convert_to_eval_metric(padded_neg_log_perplexity)(
              logits, labels, params["vocab_size"]),
  }

  if not params["use_tpu"]:
    # TPU does not support tf.py_func
    metrics.update({
        "approx_bleu_score":
            _convert_to_eval_metric(bleu_score)(logits, labels),
        "rouge_2_fscore":
            _convert_to_eval_metric(rouge_2_fscore)(logits, labels),
        "rouge_L_fscore":
            _convert_to_eval_metric(rouge_l_fscore)(logits, labels),
    })

  # Prefix each of the metric names with "metrics/". This allows the metric
  # graphs to display under the "metrics" category in TensorBoard.
  metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)}
  return metrics


def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights


def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, outputs = tf.nn.top_k(logits, k=effective_k)
    outputs = tf.cast(outputs, tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    padded_labels = tf.expand_dims(padded_labels, axis=-1)
    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
    same_topk = tf.reduce_sum(same, axis=-1)
    return same_topk, weights


def padded_accuracy_top5(logits, labels):
  return padded_accuracy_topk(logits, labels, 5)


def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    not_correct = (
        tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights)
    axis = list(range(1, len(outputs.get_shape())))
    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
    return correct_seq, tf.constant(1.0)


def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing."""
  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
  return -num, den


def bleu_score(logits, labels):
  """Approximate BLEU score computation between labels and predictions.

  An approximate BLEU scoring method since we do not glue word pieces or
  decode the ids and tokenize the output. By default, we use ngram order of 4
  and use brevity penalty. Also, this does not have beam search.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]

  Returns:
    bleu: float32, approx bleu score
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
  return bleu, tf.constant(1.0)


def _get_ngrams_with_counter(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
      method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in xrange(1, max_order + 1):
    for i in xrange(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 use_bp=True):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of references for each translation. Each reference
      should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
      should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    use_bp: boolean, whether to apply brevity penalty.

  Returns:
    BLEU score.
  """
  reference_length = 0
  translation_length = 0
  bp = 1.0
  geo_mean = 0

  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  precisions = []

  for (references, translations) in zip(reference_corpus, translation_corpus):
    reference_length += len(references)
    translation_length += len(translations)
    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
    translation_ngram_counts = _get_ngrams_with_counter(
        translations, max_order)

    overlap = dict((ngram, min(count, translation_ngram_counts[ngram]))
                   for ngram, count in ref_ngram_counts.items())

    for ngram in overlap:
      matches_by_order[len(ngram) - 1] += overlap[ngram]
    for ngram in translation_ngram_counts:
      possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
          ngram]

  precisions = [0] * max_order
  smooth = 1.0

  for i in xrange(0, max_order):
    if possible_matches_by_order[i] > 0:
      precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
      if matches_by_order[i] > 0:
        precisions[i] = float(
            matches_by_order[i]) / possible_matches_by_order[i]
      else:
        smooth *= 2
        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
    else:
      precisions[i] = 0.0

  if max(precisions) > 0:
    p_log_sum = sum(math.log(p) for p in precisions if p)
    geo_mean = math.exp(p_log_sum / max_order)

  if use_bp:
    ratio = translation_length / reference_length
    bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
  bleu = geo_mean * bp
  return np.float32(bleu)


def rouge_2_fscore(logits, labels):
  """ROUGE-2 F1 score computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    logits: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge2_fscore: approx rouge-2 f1 score.
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
  return rouge_2_f_score, tf.constant(1.0)


def _get_ngrams(n, text):
  """Calculates n-grams.

  Args:
    n: which n-grams to calculate
    text: An array of tokens

  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set


def rouge_n(eval_sentences, ref_sentences, n=2):
  """Computes ROUGE-N f1 score of two text collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Args:
    eval_sentences: Predicted sentences.
    ref_sentences: Sentences from the reference set
    n: Size of ngram. Defaults to 2.

  Returns:
    f1 score for ROUGE-N
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    eval_ngrams = _get_ngrams(n, eval_sentence)
    ref_ngrams = _get_ngrams(n, ref_sentence)
    ref_count = len(ref_ngrams)
    eval_count = len(eval_ngrams)

    # Count the overlapping ngrams between evaluated and reference
    overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if eval_count == 0:
      precision = 0.0
    else:
      precision = float(overlapping_count) / eval_count
    if ref_count == 0:
      recall = 0.0
    else:
      recall = float(overlapping_count) / ref_count
    f1_scores.append(2.0 * ((precision * recall) /
                            (precision + recall + 1e-8)))

  # return overlapping_count / reference_count
  return np.mean(f1_scores, dtype=np.float32)


def rouge_l_fscore(predictions, labels):
  """ROUGE scores computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    predictions: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge_l_fscore: approx rouge-l f1 score.
  """
  outputs = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
  rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
                               tf.float32)
  return rouge_l_f_score, tf.constant(1.0)


def rouge_l_sentence_level(eval_sentences, ref_sentences):
  """Computes ROUGE-L (sentence level) of two collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Calculated according to:
    R_lcs = LCS(X, Y) / m
    P_lcs = LCS(X, Y) / n
    F_lcs = ((1 + beta^2) * R_lcs * P_lcs) / (R_lcs + (beta^2) * P_lcs)

  where:
    X = reference summary
    Y = candidate summary
    m = length of reference summary
    n = length of candidate summary

  Args:
    eval_sentences: The sentences that have been picked by the summarizer
    ref_sentences: The sentences from the reference set

  Returns:
    A float: F_lcs
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    m = float(len(ref_sentence))
    n = float(len(eval_sentence))
    lcs = _len_lcs(eval_sentence, ref_sentence)
    f1_scores.append(_f_lcs(lcs, m, n))
  return np.mean(f1_scores, dtype=np.float32)


def _len_lcs(x, y):
  """Returns the length of the Longest Common Subsequence between two seqs.

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: sequence of words
    y: sequence of words

  Returns:
    integer: Length of LCS between x and y
  """
  table = _lcs(x, y)
  n, m = len(x), len(y)
  return table[n, m]


def _lcs(x, y):
  """Computes the length of the LCS between two seqs.

  The implementation below uses a dynamic-programming algorithm and runs
  in O(nm) time where n = len(x) and m = len(y).

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: collection of words
    y: collection of words

  Returns:
    Table (dictionary) mapping each coordinate pair to its LCS length.
  """
  n, m = len(x), len(y)
  table = dict()
  for i in range(n + 1):
    for j in range(m + 1):
      if i == 0 or j == 0:
        table[i, j] = 0
      elif x[i - 1] == y[j - 1]:
        table[i, j] = table[i - 1, j - 1] + 1
      else:
        table[i, j] = max(table[i - 1, j], table[i, j - 1])
  return table


def _f_lcs(llcs, m, n):
  """Computes the LCS-based F-measure score.

  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
  rouge-working-note-v1.3.1.pdf

  Args:
    llcs: Length of LCS
    m: number of words in reference summary
    n: number of words in candidate summary

  Returns:
    Float. LCS-based F-measure score
  """
  r_lcs = llcs / m
  p_lcs = llcs / n
  beta = p_lcs / (r_lcs + 1e-12)
  num = (1 + (beta**2)) * r_lcs * p_lcs
  denom = r_lcs + ((beta**2) * p_lcs)
  f_lcs = num / (denom + 1e-12)
  return f_lcs
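As a quick sanity check of compute_bleu above: identical reference and hypothesis token lists score exactly 1.0 (every n-gram precision is 1 and the brevity penalty is 1), while any mismatch drops below 1.0 through the smoothed higher-order precisions. A sketch, assuming this file is importable as official.legacy.transformer.utils.metrics:

from official.legacy.transformer.utils import metrics

ref = [["the", "cat", "sat"]]

print(metrics.compute_bleu(ref, [["the", "cat", "sat"]]))  # 1.0
# Unigram precision 2/3, bigram 1/2, smoothed trigram 1/2 enter the
# geometric mean, so this prints a value below 1.0.
print(metrics.compute_bleu(ref, [["the", "cat", "ran"]]))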
official/legacy/transformer/utils/tokenizer.py (new file, mode 100644)

(Diff collapsed on the page; contents not shown.)
official/legacy/transformer/utils/tokenizer_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Subtokenizer and string helper methods."""
import collections
import tempfile

import tensorflow as tf

from official.legacy.transformer.utils import tokenizer


class SubtokenizerTest(tf.test.TestCase):

  def _init_subtokenizer(self, vocab_list):
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.io.gfile.GFile(temp_file.name, "w") as w:
      for subtoken in vocab_list:
        w.write("'%s'" % subtoken)
        w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])

  def test_encode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    s = "testing 123"
    encoded_list = subtokenizer.encode(s)
    self.assertEqual([1, 2, 0], encoded_list)

  def test_decode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    decoded_str = subtokenizer.decode(encoded_list)
    self.assertEqual("testing 123", decoded_str)

  def test_subtoken_ids_to_tokens(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
    self.assertEqual([u"testing", u"123"], token_list)


class StringHelperTest(tf.test.TestCase):

  def test_split_string_to_tokens(self):
    text = "test? testing 123."
    tokens = tokenizer._split_string_to_tokens(
        text, tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)

  def test_join_tokens_to_string(self):
    tokens = ["test", "? ", "testing", "123", "."]
    s = tokenizer._join_tokens_to_string(
        tokens, tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual("test? testing 123.", s)

  def test_escape_token(self):
    token = u"abc_\\4"
    alphabet = set("abc_\\u;")
    escaped_token = tokenizer._escape_token(token, alphabet)
    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)

  def test_unescape_token(self):
    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
    unescaped_token = tokenizer._unescape_token(escaped_token)
    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4",
                     unescaped_token)

  def test_list_to_index_dict(self):
    lst = ["test", "strings"]
    d = tokenizer._list_to_index_dict(lst)
    self.assertDictEqual({"test": 0, "strings": 1}, d)

  def test_split_token_to_subtokens(self):
    token = "abc"
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
    max_subtoken_length = 2
    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
                                                    max_subtoken_length)
    self.assertEqual(["ab", "c"], subtokens)

  def test_generate_alphabet_dict(self):
    s = ["testing", "123"]
    reserved_tokens = ["???"]
    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
    self.assertIn("?", alphabet)
    self.assertIn("t", alphabet)
    self.assertIn("e", alphabet)
    self.assertIn("s", alphabet)
    self.assertIn("i", alphabet)
    self.assertIn("n", alphabet)
    self.assertIn("g", alphabet)
    self.assertIn("1", alphabet)
    self.assertIn("2", alphabet)
    self.assertIn("3", alphabet)

  def test_count_and_gen_subtokens(self):
    token_counts = {"abc": 5}
    alphabet = set("abc_")
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
    max_subtoken_length = 2

    subtoken_counts = tokenizer._count_and_gen_subtokens(
        token_counts, alphabet, subtoken_dict, max_subtoken_length)

    self.assertIsInstance(subtoken_counts, collections.defaultdict)
    self.assertDictEqual(
        {
            "a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
            "abc": 5, "bc_": 5, "abc_": 5
        }, subtoken_counts)

  def test_filter_and_bucket_subtokens(self):
    subtoken_counts = collections.defaultdict(
        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
    min_count = 3

    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
        subtoken_counts, min_count)

    self.assertEqual(len(subtoken_buckets[0]), 0)
    self.assertEqual(set("b"), subtoken_buckets[1])
    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
    self.assertEqual(len(subtoken_buckets[3]), 0)
    self.assertEqual(set(["abbc"]), subtoken_buckets[4])

  def test_gen_new_subtoken_list(self):
    subtoken_counts = collections.defaultdict(
        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
    min_count = 5
    alphabet = set("translate")
    reserved_tokens = ["reserved", "tokens"]

    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

    # Check that "tra" isn't in the list (its count should be decremented to 2,
    # so it should not be added to the candidate list).
    self.assertNotIn("tra", subtoken_list)
    self.assertIn("tr", subtoken_list)
    self.assertIn("t", subtoken_list)
    self.assertEqual(len("translate"), max_token_length)

  def test_generate_subtokens(self):
    token_counts = {"ab": 1, "bc": 3, "abc": 5}
    alphabet = set("abc_")
    min_count = 100
    num_iterations = 1
    reserved_tokens = ["reserved", "tokens"]

    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
                                               min_count, num_iterations,
                                               reserved_tokens)

    # Check that reserved tokens are at the front of the list.
    self.assertEqual(vocab_list[:2], reserved_tokens)
    # Check that each character in alphabet is in the vocab list.
    for c in alphabet:
      self.assertIn(c, vocab_list)


if __name__ == "__main__":
  tf.test.main()
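The greedy longest-match behavior pinned down by test_split_token_to_subtokens can be reproduced in a few lines. The following is a hedged re-implementation that mirrors the behavior the test expects, not necessarily the exact code in tokenizer.py (whose diff is collapsed above); greedy_split and its error message are illustrative only:

def greedy_split(token, subtoken_dict, max_subtoken_length):
  """Greedily split token into the longest subtokens found in the vocab."""
  subtokens = []
  start = 0
  while start < len(token):
    # Try the longest candidate first, shrinking until one is in the vocab.
    for end in range(min(len(token), start + max_subtoken_length), start, -1):
      candidate = token[start:end]
      if candidate in subtoken_dict:
        subtokens.append(candidate)
        start = end
        break
    else:  # No candidate matched, even at length 1.
      raise ValueError("Token substring not found in subtoken vocabulary.")
  return subtokens

assert greedy_split("abc", {"a": 0, "b": 1, "c": 2, "ab": 3}, 2) == ["ab", "c"]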