ModelZoo / ResNet50_tensorflow · Commits · 32e4ca51

Commit 32e4ca51, authored Nov 28, 2023 by qianyj
Update code to v2.11.0
Parents: 9485aa1d, 71060f67
Changes: showing 20 of 772 changed files, with 0 additions and 4470 deletions (+0, -4470).
official/nlp/transformer/data_pipeline.py             +0  -330
official/nlp/transformer/embedding_layer.py           +0  -102
official/nlp/transformer/ffn_layer.py                 +0  -71
official/nlp/transformer/metrics.py                   +0  -180
official/nlp/transformer/misc.py                      +0  -288
official/nlp/transformer/model_params.py              +0  -96
official/nlp/transformer/model_utils.py               +0  -121
official/nlp/transformer/model_utils_test.py          +0  -55
official/nlp/transformer/optimizer.py                 +0  -64
official/nlp/transformer/transformer.py               +0  -549
official/nlp/transformer/transformer_forward_test.py  +0  -157
official/nlp/transformer/transformer_layers_test.py   +0  -125
official/nlp/transformer/transformer_main.py          +0  -482
official/nlp/transformer/transformer_main_test.py     +0  -193
official/nlp/transformer/transformer_test.py          +0  -98
official/nlp/transformer/translate.py                 +0  -190
official/nlp/transformer/utils/__init__.py            +0  -14
official/nlp/transformer/utils/metrics.py             +0  -491
official/nlp/transformer/utils/tokenizer.py           +0  -660
official/nlp/transformer/utils/tokenizer_test.py      +0  -204
official/nlp/transformer/data_pipeline.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Input pipeline for the transformer model to read, filter, and batch examples.
Two things to note in the pipeline:
1. Batching scheme
The examples encoded in the TFRecord files contain data in the format:
{"inputs": [variable length array of integers],
"targets": [variable length array of integers]}
Where integers in the arrays refer to tokens in the English and German vocab
file (named `vocab.ende.32768`).
Prior to batching, elements in the dataset are grouped by length (max between
"inputs" and "targets" length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{"inputs": [group_batch_size, padded_input_length],
"targets": [group_batch_size, padded_target_length]}
Lengths are padded to the longest "inputs" or "targets" sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
2. Shuffling
While training, the dataset is shuffled in two places in the code. The first
is the list of training files. Second, while reading records using
`parallel_interleave`, the `sloppy` argument is used to generate randomness
in the order of the examples.
"""
import os

from absl import logging
import tensorflow as tf

from official.utils.misc import model_helpers

# Buffer size for reading records from a TFRecord file. Each training file is
# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
_READ_RECORD_BUFFER = 8 * 1000 * 1000

# Example grouping constants. Defines length boundaries for each group.
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY = 8
_BOUNDARY_SCALE = 1.1


def _load_records(filename):
  """Read file and return a dataset of tf.Examples."""
  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)


def _parse_example(serialized_example):
  """Return inputs and targets Tensors from a serialized tf.Example."""
  data_fields = {
      "inputs": tf.io.VarLenFeature(tf.int64),
      "targets": tf.io.VarLenFeature(tf.int64)
  }
  parsed = tf.io.parse_single_example(serialized_example, data_fields)
  inputs = tf.sparse.to_dense(parsed["inputs"])
  targets = tf.sparse.to_dense(parsed["targets"])
  return inputs, targets


def _filter_max_length(example, max_length=256):
  """Indicates whether the example's length is lower than the maximum length."""
  return tf.logical_and(
      tf.size(example[0]) <= max_length,
      tf.size(example[1]) <= max_length)


def _get_example_length(example):
  """Returns the maximum length between the example inputs and targets."""
  length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
  return length


def _create_min_max_boundaries(max_length,
                               min_boundary=_MIN_BOUNDARY,
                               boundary_scale=_BOUNDARY_SCALE):
  """Create min and max boundary lists up to max_length.

  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
  returned values will be:
    buckets_min = [0, 4, 8, 16, 24]
    buckets_max = [4, 8, 16, 24, 25]

  Args:
    max_length: The maximum length of example in dataset.
    min_boundary: Minimum length in boundary.
    boundary_scale: Amount to scale consecutive boundaries in the list.

  Returns:
    min and max boundary lists
  """
  # Create bucket boundaries list by scaling the previous boundary or adding 1
  # (to ensure increasing boundary sizes).
  bucket_boundaries = []
  x = min_boundary
  while x < max_length:
    bucket_boundaries.append(x)
    x = max(x + 1, int(x * boundary_scale))

  # Create min and max boundary lists from the initial list.
  buckets_min = [0] + bucket_boundaries
  buckets_max = bucket_boundaries + [max_length + 1]
  return buckets_min, buckets_max


def _batch_examples(dataset, batch_size, max_length):
  """Group examples by similar lengths, and return batched dataset.

  Each batch of similar-length examples are padded to the same length, and may
  have different number of elements in each batch, such that:
    group_batch_size * padded_length <= batch_size.

  This decreases the number of padding tokens per batch, which improves the
  training speed.

  Args:
    dataset: Dataset of unbatched examples.
    batch_size: Max number of tokens per batch of examples.
    max_length: Max number of tokens in an example input or target sequence.

  Returns:
    Dataset of batched examples with similar lengths.
  """
  # Get min and max boundary lists for each example. These are used to calculate
  # the `bucket_id`, which is the index at which:
  #   buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
  # Note that using both min and max lists improves the performance.
  buckets_min, buckets_max = _create_min_max_boundaries(max_length)

  # Create list of batch sizes for each bucket_id, so that
  #   bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
  bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]

  # bucket_id will be a tensor, so convert this list to a tensor as well.
  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)

  def example_to_bucket_id(example_input, example_target):
    """Return int64 bucket id for this example, calculated based on length."""
    seq_length = _get_example_length((example_input, example_target))

    # TODO(xunkai): investigate if removing code branching improves performance.
    conditions_c = tf.logical_and(
        tf.less_equal(buckets_min, seq_length),
        tf.less(seq_length, buckets_max))
    bucket_id = tf.reduce_min(tf.where(conditions_c))
    return bucket_id

  def window_size_fn(bucket_id):
    """Return number of examples to be grouped when given a bucket id."""
    return bucket_batch_sizes[bucket_id]

  def batching_fn(bucket_id, grouped_dataset):
    """Batch and add padding to a dataset of elements with similar lengths."""
    bucket_batch_size = window_size_fn(bucket_id)

    # Batch the dataset and add padding so that all input sequences in the
    # examples have the same length, and all target sequences have the same
    # lengths as well. Resulting lengths of inputs and targets can differ.
    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))

  return dataset.apply(
      tf.data.experimental.group_by_window(
          key_func=example_to_bucket_id,
          reduce_func=batching_fn,
          window_size=None,
          window_size_func=window_size_fn))


def _read_and_batch_from_files(file_pattern,
                               batch_size,
                               max_length,
                               max_io_parallelism,
                               shuffle,
                               repeat,
                               static_batch=False,
                               num_replicas=1,
                               ctx=None):
  """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per global batch of examples.
    max_length: Maximum number of tokens per example.
    max_io_parallelism: Max number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.
    static_batch: Whether the batches in the dataset should have static shapes.
      If True, the input is batched so that every batch has the shape
      [batch_size // max_length, max_length]. If False, the input is grouped by
      length, and batched so that batches may have different shapes [N, M],
      where N * M <= batch_size and M <= max_length. In general, this setting
      should be False. Dynamic shapes allow the inputs to be grouped so that
      the number of padding tokens is minimized, and helps model training. In
      cases where the input shape must be static (e.g. running on TPU), this
      setting should be set to True.
    num_replicas: Number of GPUs or other workers. We will generate global
      batches, and each global batch is equally divisible by number of
      replicas. Currently it is only effective when static_batch==True. TODO:
      make it effective when static_batch=False.
    ctx: Input context.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)
  if ctx and ctx.num_input_pipelines > 1:
    logging.info("Shard %d of the dataset.", ctx.input_pipeline_id)
    dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)

  # Read files and interleave results. When training, the order of the examples
  # will be non-deterministic.
  options = tf.data.Options()
  options.experimental_deterministic = False
  dataset = dataset.interleave(
      _load_records,
      cycle_length=max_io_parallelism,
      num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(options)

  # Parse each tf.Example into a dictionary
  # TODO: Look into prefetch_input_elements for performance optimization.  # pylint: disable=g-bad-todo
  dataset = dataset.map(
      _parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Remove examples where the input or target length exceeds the maximum length.
  dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))

  if static_batch:
    dataset = dataset.padded_batch(
        # First calculate batch size (token number) per worker, then divide it
        # into sentences, and finally expand to a global batch. This keeps
        # the global batch divisible for the distribution strategy.
        int(batch_size // num_replicas // max_length * num_replicas),
        ([max_length], [max_length]),
        drop_remainder=True)
  else:
    # Group and batch such that each batch has examples of similar length.
    # TODO(xunkai): _batch_examples might need to do something special for
    # num_replicas.
    dataset = _batch_examples(dataset, batch_size, max_length)

  dataset = dataset.repeat(repeat)

  # Prefetch the next element to improve speed of input pipeline.
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
  return dataset


def _generate_synthetic_data(params):
  """Create synthetic data based on the parameter batch size."""
  batch_size = int(params["batch_size"] // params["max_length"])
  length = params["max_length"]
  dataset = model_helpers.generate_synthetic_data(
      input_shape=tf.TensorShape([length]),
      input_value=1,
      input_dtype=tf.int64,
      label_shape=tf.TensorShape([length]),
      label_value=1,
      label_dtype=tf.int64,
  )
  if params["static_batch"]:
    dataset = dataset.batch(batch_size, drop_remainder=True)
  else:
    dataset = dataset.padded_batch(batch_size, ([None], [None]))
  return dataset


def train_input_fn(params, ctx=None):
  """Load and return dataset of batched examples for use during training."""
  file_pattern = os.path.join(params["data_dir"] or "", "*train*")
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  return _read_and_batch_from_files(
      file_pattern,
      params["batch_size"],
      params["max_length"],
      params["max_io_parallelism"],
      shuffle=True,
      repeat=params["repeat_dataset"],
      static_batch=params["static_batch"],
      num_replicas=params["num_gpus"],
      ctx=ctx)


def eval_input_fn(params, ctx=None):
  """Load and return dataset of batched examples for use during evaluation."""
  file_pattern = os.path.join(params["data_dir"] or "", "*dev*")
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  return _read_and_batch_from_files(
      file_pattern,
      params["batch_size"],
      params["max_length"],
      params["max_io_parallelism"],
      shuffle=False,
      repeat=1,
      static_batch=params["static_batch"],
      num_replicas=params["num_gpus"],
      ctx=ctx)


def map_data_for_transformer_fn(x, y):
  """Maps data for training, and handles weird behaviors for different versions."""
  # Will transform input x and targets y into tuple(x, y) as new model inputs.
  # For TF v2, the 2nd parameter is omitted to make Keras training work.
  return ((x, y),)
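
Note on the token-budget batching used by _batch_examples above: batch_size is a budget of tokens per batch, not a sentence count, so longer buckets get fewer sentences. A minimal plain-Python sketch with made-up bucket bounds (not the module defaults):

# Toy illustration: per-bucket sentence counts derived from a token budget.
batch_size = 4096
buckets_max = [9, 17, 33, 65, 257]                        # hypothetical bucket upper bounds
bucket_batch_sizes = [batch_size // m for m in buckets_max]
# -> [455, 240, 124, 63, 15] sentences per batch, so for every bucket
#    sentences_per_batch * padded_length <= 4096 tokens.
for m, n in zip(buckets_max, bucket_batch_sizes):
  assert n * m <= batch_size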
official/nlp/transformer/embedding_layer.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of embedding layer with shared weights."""
import tensorflow as tf


class EmbeddingSharedWeights(tf.keras.layers.Layer):
  """Calculates input embeddings and pre-softmax linear with shared weights."""

  def __init__(self, vocab_size, hidden_size):
    """Specify characteristic parameters of embedding layer.

    Args:
      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
    """
    super(EmbeddingSharedWeights, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size

  def build(self, input_shape):
    """Build embedding layer."""
    with tf.name_scope("embedding_and_softmax"):
      # Create and initialize weights. The random normal initializer was chosen
      # arbitrarily, and works well.
      self.shared_weights = self.add_weight(
          "weights",
          shape=[self.vocab_size, self.hidden_size],
          dtype=tf.float32,
          initializer=tf.random_normal_initializer(
              mean=0., stddev=self.hidden_size**-0.5))
    super(EmbeddingSharedWeights, self).build(input_shape)

  def get_config(self):
    return {
        "vocab_size": self.vocab_size,
        "hidden_size": self.hidden_size,
    }

  def call(self, inputs, mode="embedding"):
    """Get token embeddings of inputs.

    Args:
      inputs: An int64 tensor with shape [batch_size, length]
      mode: string, a valid value is one of "embedding" and "linear".

    Returns:
      outputs: (1) If mode == "embedding", output embedding tensor, float32
        with shape [batch_size, length, embedding_size]; (2) mode == "linear",
        output linear tensor, float32 with shape [batch_size, length,
        vocab_size].

    Raises:
      ValueError: if mode is not valid.
    """
    if mode == "embedding":
      return self._embedding(inputs)
    elif mode == "linear":
      return self._linear(inputs)
    else:
      raise ValueError("mode {} is not valid.".format(mode))

  def _embedding(self, inputs):
    """Applies embedding based on inputs tensor."""
    with tf.name_scope("embedding"):
      # Create binary mask of size [batch_size, length]
      embeddings = tf.gather(self.shared_weights, inputs)
      # mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype)
      # embeddings *= tf.expand_dims(mask, -1)
      # Scale embedding by the sqrt of the hidden size
      embeddings *= self.hidden_size**0.5

      return embeddings

  def _linear(self, inputs):
    """Computes logits by running inputs through a linear layer.

    Args:
      inputs: A float32 tensor with shape [batch_size, length, hidden_size]

    Returns:
      float32 tensor with shape [batch_size, length, vocab_size].
    """
    with tf.name_scope("presoftmax_linear"):
      batch_size = tf.shape(inputs)[0]
      length = tf.shape(inputs)[1]

      x = tf.reshape(inputs, [-1, self.hidden_size])
      logits = tf.matmul(x, self.shared_weights, transpose_b=True)

      return tf.reshape(logits, [batch_size, length, self.vocab_size])
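
A minimal usage sketch of the shared-weights layer above; the vocabulary and hidden sizes are made up for illustration, assuming the class is importable as defined:

import tensorflow as tf

layer = EmbeddingSharedWeights(vocab_size=100, hidden_size=16)
tokens = tf.constant([[5, 9, 0, 0]], dtype=tf.int64)   # [batch=1, length=4]

embedded = layer(tokens)                   # -> [1, 4, 16], scaled by sqrt(16)
logits = layer(embedded, mode="linear")    # -> [1, 4, 100], same weight matrix reused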
official/nlp/transformer/ffn_layer.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of fully connected network."""
import tensorflow as tf


class FeedForwardNetwork(tf.keras.layers.Layer):
  """Fully connected feedforward network."""

  def __init__(self, hidden_size, filter_size, relu_dropout):
    """Initialize FeedForwardNetwork.

    Args:
      hidden_size: int, output dim of hidden layer.
      filter_size: int, filter size for the inner (first) dense layer.
      relu_dropout: float, dropout rate for training.
    """
    super(FeedForwardNetwork, self).__init__()
    self.hidden_size = hidden_size
    self.filter_size = filter_size
    self.relu_dropout = relu_dropout

  def build(self, input_shape):
    self.filter_dense_layer = tf.keras.layers.Dense(
        self.filter_size,
        use_bias=True,
        activation=tf.nn.relu,
        name="filter_layer")
    self.output_dense_layer = tf.keras.layers.Dense(
        self.hidden_size, use_bias=True, name="output_layer")
    super(FeedForwardNetwork, self).build(input_shape)

  def get_config(self):
    return {
        "hidden_size": self.hidden_size,
        "filter_size": self.filter_size,
        "relu_dropout": self.relu_dropout,
    }

  def call(self, x, training):
    """Return outputs of the feedforward network.

    Args:
      x: tensor with shape [batch_size, length, hidden_size]
      training: boolean, whether in training mode or not.

    Returns:
      Output of the feedforward network.
      tensor with shape [batch_size, length, hidden_size]
    """
    # Retrieve dynamically known shapes
    output = self.filter_dense_layer(x)
    if training:
      output = tf.nn.dropout(output, rate=self.relu_dropout)
    output = self.output_dense_layer(output)

    return output
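
A small shape sketch for the layer above (hypothetical sizes, illustration only): the FFN expands to filter_size internally and projects back to hidden_size, so the output shape matches the input.

import tensorflow as tf

ffn = FeedForwardNetwork(hidden_size=16, filter_size=64, relu_dropout=0.1)
x = tf.random.uniform([2, 5, 16])      # [batch=2, length=5, hidden=16]
y = ffn(x, training=False)             # -> [2, 5, 16], via a 64-wide inner layer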
official/nlp/transformer/metrics.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
import functools

import tensorflow as tf


def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]

    max_length = tf.maximum(x_length, y_length)

    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
    return x, y


def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy"):
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
          confidence * tf.math.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.math.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights


def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.name_scope("padded_accuracy"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights


def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.name_scope("padded_accuracy_topk"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, outputs = tf.nn.top_k(logits, k=effective_k)
    outputs = tf.cast(outputs, tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    padded_labels = tf.expand_dims(padded_labels, axis=-1)
    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
    same_topk = tf.reduce_sum(same, axis=-1)
    return same_topk, weights


def padded_accuracy_top5(logits, labels):
  return padded_accuracy_topk(logits, labels, 5)


def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.name_scope("padded_sequence_accuracy"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    not_correct = tf.cast(tf.not_equal(outputs, padded_labels),
                          tf.float32) * weights
    axis = list(range(1, len(outputs.get_shape())))
    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
    return correct_seq, tf.constant(1.0)


def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing."""
  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
  return -num, den


class MetricLayer(tf.keras.layers.Layer):
  """Custom layer of metrics for the Transformer model."""

  def __init__(self, vocab_size):
    super(MetricLayer, self).__init__()
    self.vocab_size = vocab_size
    self.metric_mean_fns = []

  def build(self, input_shape):
    """Builds metric layer."""
    neg_log_perplexity = functools.partial(
        padded_neg_log_perplexity, vocab_size=self.vocab_size)
    self.metric_mean_fns = [
        (tf.keras.metrics.Mean("accuracy"), padded_accuracy),
        (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5),
        (tf.keras.metrics.Mean("accuracy_per_sequence"),
         padded_sequence_accuracy),
        (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity),
    ]
    super(MetricLayer, self).build(input_shape)

  def get_config(self):
    return {"vocab_size": self.vocab_size}

  def call(self, inputs):
    logits, targets = inputs[0], inputs[1]
    for mean, fn in self.metric_mean_fns:
      m = mean(*fn(logits, targets))
      self.add_metric(m)
    return logits


def transformer_loss(logits, labels, smoothing, vocab_size):
  """Calculates total loss containing cross entropy with padding ignored.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    A scalar float tensor for loss.
  """
  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
                                                vocab_size)
  return tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
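
The label-smoothing arithmetic in padded_cross_entropy_loss is easy to check by hand. A plain-Python sketch with a toy vocabulary size (illustrative numbers only):

# With smoothing=0.1 and vocab_size=6, the soft target puts 0.9 on the true
# token and 0.1 / 5 = 0.02 on each of the other 5 tokens, summing to 1.
smoothing, vocab_size = 0.1, 6
confidence = 1.0 - smoothing
low_confidence = (1.0 - confidence) / (vocab_size - 1)
assert abs(confidence + (vocab_size - 1) * low_confidence - 1.0) < 1e-9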
official/nlp/transformer/misc.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Misc for Transformer."""
# pylint: disable=g-bad-import-order
from absl import flags
import tensorflow as tf

from official.nlp.transformer import model_params
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils

FLAGS = flags.FLAGS

PARAMS_MAP = {
    'tiny': model_params.TINY_PARAMS,
    'base': model_params.BASE_PARAMS,
    'big': model_params.BIG_PARAMS,
}


def get_model_params(param_set, num_gpus):
  """Gets predefined model params."""
  if num_gpus > 1:
    if param_set == 'big':
      return model_params.BIG_MULTI_GPU_PARAMS.copy()
    elif param_set == 'base':
      return model_params.BASE_MULTI_GPU_PARAMS.copy()
    else:
      raise ValueError('Not valid params: param_set={} num_gpus={}'.format(
          param_set, num_gpus))

  return PARAMS_MAP[param_set].copy()


def define_transformer_flags():
  """Add flags and flag validators for running transformer_main."""
  # Add common flags (data_dir, model_dir, etc.).
  flags_core.define_base(num_gpu=True, distribution_strategy=True)
  flags_core.define_performance(
      num_parallel_calls=True,
      inter_op=False,
      intra_op=False,
      synthetic_data=True,
      max_train_steps=False,
      dtype=True,
      loss_scale=True,
      all_reduce_alg=True,
      num_packs=True,
      tf_gpu_thread_mode=True,
      datasets_num_private_threads=True,
      enable_xla=True,
      fp16_implementation=True)

  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  flags.DEFINE_integer(
      name='train_steps',
      short_name='ts',
      default=300000,
      help=flags_core.help_wrap('The number of steps used to train.'))
  flags.DEFINE_integer(
      name='steps_between_evals',
      short_name='sbe',
      default=5000,
      help=flags_core.help_wrap(
          'The number of training steps to run between evaluations. This is '
          'used if --train_steps is defined.'))
  flags.DEFINE_boolean(
      name='enable_time_history',
      default=True,
      help='Whether to enable TimeHistory callback.')
  flags.DEFINE_boolean(
      name='enable_tensorboard',
      default=False,
      help='Whether to enable Tensorboard callback.')
  flags.DEFINE_boolean(
      name='enable_metrics_in_training',
      default=False,
      help='Whether to enable metrics during training.')
  flags.DEFINE_boolean(
      name='enable_mlir_bridge',
      default=False,
      help='Whether to enable the TF to XLA bridge.')
  # Set flags from the flags_core module as 'key flags' so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name='param_set',
      short_name='mp',
      default='big',
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          'Parameter set to use when creating and training the model. The '
          'parameters define the input shape (batch size and max length), '
          'model configuration (size of embedding, # of hidden layers, etc.), '
          'and various other settings. The big parameter set increases the '
          'default batch size, embedding/hidden size, and filter size. For a '
          'complete list of parameters, please see model/model_params.py.'))
  flags.DEFINE_bool(
      name='static_batch',
      short_name='sb',
      default=False,
      help=flags_core.help_wrap(
          'Whether the batches in the dataset should have static shapes. In '
          'general, this setting should be False. Dynamic shapes allow the '
          'inputs to be grouped so that the number of padding tokens is '
          'minimized, and helps model training. In cases where the input shape '
          'must be static (e.g. running on TPU), this setting will be ignored '
          'and static batching will always be used.'))
  flags.DEFINE_integer(
      name='max_length',
      short_name='ml',
      default=256,
      help=flags_core.help_wrap(
          'Max sentence length for Transformer. Default is 256. Note: Usually '
          'it is more effective to use a smaller max length if static_batch is '
          'enabled, e.g. 64.'))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name='validation_steps',
      short_name='vs',
      default=64,
      help=flags_core.help_wrap('The number of steps used in validation.'))

  # BLEU score computation
  flags.DEFINE_string(
      name='bleu_source',
      short_name='bls',
      default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text translate when calculating the '
          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
      ))
  flags.DEFINE_string(
      name='bleu_ref',
      short_name='blr',
      default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text translate when calculating the '
          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
      ))
  flags.DEFINE_string(
      name='vocab_file',
      short_name='vf',
      default=None,
      help=flags_core.help_wrap(
          'Path to subtoken vocabulary file. If data_download.py was used to '
          'download and encode the training data, look in the data_dir to find '
          'the vocab file.'))
  flags.DEFINE_string(
      name='mode',
      default='train',
      help=flags_core.help_wrap('mode: train, eval, or predict'))
  flags.DEFINE_bool(
      name='use_ctl',
      default=False,
      help=flags_core.help_wrap(
          'Whether the model runs with custom training loop.'))
  flags.DEFINE_integer(
      name='decode_batch_size',
      default=32,
      help=flags_core.help_wrap(
          'Global batch size used for Transformer autoregressive decoding on '
          'TPU.'))
  flags.DEFINE_integer(
      name='decode_max_length',
      default=97,
      help=flags_core.help_wrap(
          'Max sequence length of the decode/eval data. This is used by '
          'Transformer autoregressive decoding on TPU to have minimum '
          'paddings.'))
  flags.DEFINE_bool(
      name='padded_decode',
      default=False,
      help=flags_core.help_wrap(
          'Whether the autoregressive decoding runs with input data padded to '
          'the decode_max_length. For TPU/XLA-GPU runs, this flag has to be '
          'set due the static shape requirement. Although CPU/GPU could also '
          'use padded_decode, it has not been tested. In addition, this method '
          'will introduce unnecessary overheads which grow quadratically with '
          'the max sequence length.'))
  flags.DEFINE_bool(
      name='enable_checkpointing',
      default=True,
      help=flags_core.help_wrap(
          'Whether to do checkpointing during training. When running under '
          'benchmark harness, we will avoid checkpointing.'))
  flags.DEFINE_bool(
      name='save_weights_only',
      default=True,
      help=flags_core.help_wrap(
          'Only used when above `enable_checkpointing` is True. '
          'If True, then only the model\'s weights will be saved '
          '(`model.save_weights(filepath)`), else the full model is saved '
          '(`model.save(filepath)`)'))

  flags_core.set_defaults(
      data_dir='/tmp/translate_ende',
      model_dir='/tmp/transformer_model',
      batch_size=None)

  # pylint: disable=unused-variable
  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref'],
      message='Both or neither --bleu_source and --bleu_ref must be defined.')
  def _check_bleu_files(flags_dict):
    return (flags_dict['bleu_source'] is None) == (
        flags_dict['bleu_ref'] is None)

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref', 'vocab_file'],
      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
      'are defined.')
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
      return flags_dict['vocab_file'] is not None
    return True

  # pylint: enable=unused-variable


def get_callbacks():
  """Returns common callbacks."""
  callbacks = []
  if FLAGS.enable_time_history:
    time_callback = keras_utils.TimeHistory(
        FLAGS.batch_size,
        FLAGS.log_steps,
        logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
    callbacks.append(time_callback)

  if FLAGS.enable_tensorboard:
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=FLAGS.model_dir)
    callbacks.append(tensorboard_callback)

  return callbacks


def update_stats(history, stats, callbacks):
  """Normalizes and updates dictionary of stats.

  Args:
    history: Results of the training step.
    stats: Dict with pre-existing training stats.
    callbacks: a list of callbacks which might include a time history callback
      used during keras.fit.
  """
  if history and history.history:
    train_hist = history.history
    # Gets final loss from training.
    stats['loss'] = float(train_hist['loss'][-1])

  if not callbacks:
    return

  # Look for the time history callback which was used during keras.fit
  for callback in callbacks:
    if isinstance(callback, keras_utils.TimeHistory):
      timestamp_log = callback.timestamp_log
      stats['step_timestamp_log'] = timestamp_log
      stats['train_finish_time'] = callback.train_finish_time
      if len(timestamp_log) > 1:
        stats['avg_exp_per_second'] = (
            callback.batch_size * callback.log_steps *
            (len(callback.timestamp_log) - 1) /
            (timestamp_log[-1].timestamp - timestamp_log[0].timestamp))
official/nlp/transformer/model_params.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines Transformer model parameters."""
import collections

BASE_PARAMS = collections.defaultdict(
    lambda: None,  # Set default value to None.

    # Input params
    default_batch_size=2048,  # Maximum number of tokens per batch of examples.
    default_batch_size_tpu=32768,
    max_length=256,  # Maximum number of tokens per example.

    # Model params
    initializer_gain=1.0,  # Used in trainable variable initialization.
    vocab_size=33708,  # Number of tokens defined in the vocabulary file.
    hidden_size=512,  # Model dimension in the hidden layers.
    num_hidden_layers=6,  # Number of layers in the encoder and decoder stacks.
    num_heads=8,  # Number of heads to use in multi-headed attention.
    filter_size=2048,  # Inner layer dimension in the feedforward network.

    # Dropout values (only used when training)
    layer_postprocess_dropout=0.1,
    attention_dropout=0.1,
    relu_dropout=0.1,

    # Training params
    label_smoothing=0.1,
    learning_rate=2.0,
    learning_rate_decay_rate=1.0,
    learning_rate_warmup_steps=16000,

    # Optimizer params
    optimizer_adam_beta1=0.9,
    optimizer_adam_beta2=0.997,
    optimizer_adam_epsilon=1e-09,

    # Default prediction params
    extra_decode_length=50,
    beam_size=4,
    alpha=0.6,  # used to calculate length normalization in beam search

    # TPU specific parameters
    use_tpu=False,
    static_batch=False,
    allow_ffn_pad=True,
)

BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update(
    default_batch_size=4096,
    # default batch size is smaller than for BASE_PARAMS due to memory limits.
    default_batch_size_tpu=16384,
    hidden_size=1024,
    filter_size=4096,
    num_heads=16,
)

# Parameters for running the model in multi gpu. These should not change the
# params that modify the model shape (such as the hidden_size or num_heads).
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update(learning_rate_warmup_steps=8000)

BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update(
    layer_postprocess_dropout=0.3, learning_rate_warmup_steps=8000)

# Parameters for testing the model
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update(
    default_batch_size=1024,
    default_batch_size_tpu=1024,
    hidden_size=32,
    num_heads=4,
    filter_size=256,
)
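
Illustration of how the parameter sets above compose: each variant starts from a copy of BASE_PARAMS and overrides only a few keys, so unlisted keys are inherited.

params = BIG_PARAMS.copy()
assert params["hidden_size"] == 1024 and params["num_heads"] == 16
assert params["vocab_size"] == 33708        # inherited from BASE_PARAMS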
official/nlp/transformer/model_utils.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer model helper methods."""
import math

import numpy as np
import tensorflow as tf

# Very low numbers to represent -infinity. We do not actually use -Inf, since we
# want to be able to multiply these values by zero to get zero. (-Inf * 0 = NaN)
_NEG_INF_FP32 = -1e9
_NEG_INF_FP16 = np.finfo(np.float16).min


def get_position_encoding(length,
                          hidden_size,
                          min_timescale=1.0,
                          max_timescale=1.0e4):
  """Return positional encoding.

  Calculates the position encoding as a mix of sine and cosine functions with
  geometrically increasing wavelengths.
  Defined and formulized in Attention is All You Need, section 3.5.

  Args:
    length: Sequence length.
    hidden_size: Size of the hidden dimension.
    min_timescale: Minimum scale that will be applied at each position
    max_timescale: Maximum scale that will be applied at each position

  Returns:
    Tensor with shape [length, hidden_size]
  """
  # We compute the positional encoding in float32 even if the model uses
  # float16, as many of the ops used, like log and exp, are numerically unstable
  # in float16.
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  return signal


def get_decoder_self_attention_bias(length, dtype=tf.float32):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.
    dtype: The dtype of the return value.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  neg_inf = _NEG_INF_FP16 if dtype == tf.float16 else _NEG_INF_FP32
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.linalg.band_part(
        tf.ones([length, length], dtype=dtype), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = neg_inf * (1.0 - valid_locs)
  return decoder_bias


def get_padding(x, padding_value=0, dtype=tf.float32):
  """Return float tensor representing the padding values in x.

  Args:
    x: int tensor with any shape
    padding_value: int which represents padded values in input
    dtype: The dtype of the return value.

  Returns:
    float tensor with same shape as x containing values 0 or 1.
      0 -> non-padding, 1 -> padding
  """
  with tf.name_scope("padding"):
    return tf.cast(tf.equal(x, padding_value), dtype)


def get_padding_bias(x, padding_value=0, dtype=tf.float32):
  """Calculate bias tensor from padding values in tensor.

  Bias tensor that is added to the pre-softmax multi-headed attention logits,
  which has shape [batch_size, num_heads, length, length]. The tensor is zero
  at non-padding locations, and -1e9 (negative infinity) at padding locations.

  Args:
    x: int tensor with shape [batch_size, length]
    padding_value: int which represents padded values in input
    dtype: The dtype of the return value

  Returns:
    Attention bias tensor of shape [batch_size, 1, 1, length].
  """
  with tf.name_scope("attention_bias"):
    padding = get_padding(x, padding_value, dtype)
    attention_bias = padding * _NEG_INF_FP32
    attention_bias = tf.expand_dims(
        tf.expand_dims(attention_bias, axis=1), axis=1)
  return attention_bias
official/nlp/transformer/model_utils_test.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model helper methods."""
import tensorflow as tf

from official.nlp.transformer import model_utils

NEG_INF = -1e9


class ModelUtilsTest(tf.test.TestCase):

  def test_get_padding(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    padding = model_utils.get_padding(x, padding_value=0)

    self.assertAllEqual([[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]],
                        padding)

  def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(bias)
    flattened_bias = tf.reshape(bias, [3, 5])

    self.assertAllEqual(
        [[0, NEG_INF, NEG_INF, NEG_INF, 0],
         [0, 0, NEG_INF, NEG_INF, NEG_INF],
         [NEG_INF, 0, 0, NEG_INF, 0]], flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)

  def test_get_decoder_self_attention_bias(self):
    length = 5
    bias = model_utils.get_decoder_self_attention_bias(length)

    self.assertAllEqual(
        [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
           [0, 0, NEG_INF, NEG_INF, NEG_INF],
           [0, 0, 0, NEG_INF, NEG_INF],
           [0, 0, 0, 0, NEG_INF],
           [0, 0, 0, 0, 0]]]], bias)


if __name__ == "__main__":
  tf.test.main()
official/nlp/transformer/optimizer.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer from addons and learning rate scheduler."""
import tensorflow as tf


class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule."""

  def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: A float, the initial learning rate.
      hidden_size: An integer, the model dimension in the hidden layers.
      warmup_steps: An integer, the number of steps required for linear warmup.
    """
    super(LearningRateSchedule, self).__init__()
    self.initial_learning_rate = initial_learning_rate
    self.hidden_size = hidden_size
    self.warmup_steps = warmup_steps
    self.warmup_steps_tensor = tf.cast(warmup_steps, tf.float32)

  def __call__(self, global_step):
    """Calculate learning rate with linear warmup and rsqrt decay.

    Args:
      global_step: An integer, the current global step used for learning rate
        calculation.

    Returns:
      A float, the learning rate needs to be used for current global step.
    """
    with tf.name_scope('learning_rate_schedule'):
      global_step = tf.cast(global_step, tf.float32)
      learning_rate = self.initial_learning_rate
      learning_rate *= (self.hidden_size**-0.5)
      # Apply linear warmup
      learning_rate *= tf.minimum(1.0, global_step / self.warmup_steps_tensor)
      # Apply rsqrt decay
      learning_rate /= tf.sqrt(
          tf.maximum(global_step, self.warmup_steps_tensor))
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        'initial_learning_rate': self.initial_learning_rate,
        'hidden_size': self.hidden_size,
        'warmup_steps': self.warmup_steps,
    }
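
A rough feel for the schedule above, re-derived in plain Python with hidden_size=512, warmup_steps=16000, and initial_learning_rate=2.0 (the BASE_PARAMS values); illustration only, not part of the deleted module:

import math

def lr(step, initial_learning_rate=2.0, hidden_size=512, warmup_steps=16000.0):
  rate = initial_learning_rate * hidden_size**-0.5
  rate *= min(1.0, step / warmup_steps)        # linear warmup
  rate /= math.sqrt(max(step, warmup_steps))   # rsqrt decay
  return rate

# lr(1600) is roughly 7.0e-5, lr(16000) roughly 7.0e-4 (the peak),
# and lr(64000) roughly 3.5e-4.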
official/nlp/transformer/transformer.py (deleted, 100644 → 0)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines the Transformer model in TF 2.0.
Model paper: https://arxiv.org/pdf/1706.03762.pdf
Transformer model code source: https://github.com/tensorflow/tensor2tensor
"""
import
tensorflow
as
tf
from
official.nlp.modeling.layers
import
position_embedding
from
official.nlp.modeling.ops
import
beam_search
from
official.nlp.transformer
import
attention_layer
from
official.nlp.transformer
import
embedding_layer
from
official.nlp.transformer
import
ffn_layer
from
official.nlp.transformer
import
metrics
from
official.nlp.transformer
import
model_utils
from
official.nlp.transformer.utils.tokenizer
import
EOS_ID
# Disable the not-callable lint error, since it claims many objects are not
# callable when they actually are.
# pylint: disable=not-callable
def
create_model
(
params
,
is_train
):
"""Creates transformer model."""
with
tf
.
name_scope
(
"model"
):
if
is_train
:
inputs
=
tf
.
keras
.
layers
.
Input
((
None
,),
dtype
=
"int64"
,
name
=
"inputs"
)
targets
=
tf
.
keras
.
layers
.
Input
((
None
,),
dtype
=
"int64"
,
name
=
"targets"
)
internal_model
=
Transformer
(
params
,
name
=
"transformer_v2"
)
logits
=
internal_model
([
inputs
,
targets
],
training
=
is_train
)
vocab_size
=
params
[
"vocab_size"
]
label_smoothing
=
params
[
"label_smoothing"
]
if
params
[
"enable_metrics_in_training"
]:
logits
=
metrics
.
MetricLayer
(
vocab_size
)([
logits
,
targets
])
logits
=
tf
.
keras
.
layers
.
Lambda
(
lambda
x
:
x
,
name
=
"logits"
,
dtype
=
tf
.
float32
)(
logits
)
model
=
tf
.
keras
.
Model
([
inputs
,
targets
],
logits
)
loss
=
metrics
.
transformer_loss
(
logits
,
targets
,
label_smoothing
,
vocab_size
)
model
.
add_loss
(
loss
)
return
model
else
:
inputs
=
tf
.
keras
.
layers
.
Input
((
None
,),
dtype
=
"int64"
,
name
=
"inputs"
)
internal_model
=
Transformer
(
params
,
name
=
"transformer_v2"
)
ret
=
internal_model
([
inputs
],
training
=
is_train
)
outputs
,
scores
=
ret
[
"outputs"
],
ret
[
"scores"
]
return
tf
.
keras
.
Model
(
inputs
,
[
outputs
,
scores
])
class
Transformer
(
tf
.
keras
.
Model
):
"""Transformer model with Keras.
Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
The Transformer model consists of an encoder and decoder. The input is an int
sequence (or a batch of sequences). The encoder produces a continuous
representation, and the decoder uses the encoder output to generate
probabilities for the output sequence.
"""
def
__init__
(
self
,
params
,
name
=
None
):
"""Initialize layers to build Transformer model.
Args:
params: hyperparameter object defining layer sizes, dropout values, etc.
name: name of the model.
"""
super
(
Transformer
,
self
).
__init__
(
name
=
name
)
self
.
params
=
params
self
.
embedding_softmax_layer
=
embedding_layer
.
EmbeddingSharedWeights
(
params
[
"vocab_size"
],
params
[
"hidden_size"
])
self
.
encoder_stack
=
EncoderStack
(
params
)
self
.
decoder_stack
=
DecoderStack
(
params
)
self
.
position_embedding
=
position_embedding
.
RelativePositionEmbedding
(
hidden_size
=
self
.
params
[
"hidden_size"
])
def
get_config
(
self
):
return
{
"params"
:
self
.
params
,
}
def
call
(
self
,
inputs
,
training
):
"""Calculate target logits or inferred target sequences.
Args:
inputs: input tensor list of size 1 or 2.
First item, inputs: int tensor with shape [batch_size, input_length].
Second item (optional), targets: None or int tensor with shape
[batch_size, target_length].
training: boolean, whether in training mode or not.
Returns:
If targets is defined, then return logits for each word in the target
sequence. float tensor with shape [batch_size, target_length, vocab_size]
If target is none, then generate output sequence one token at a time.
returns a dictionary {
outputs: int tensor with shape [batch_size, decoded_length]
scores: float tensor with shape [batch_size]}
Even when float16 is used, the output tensor(s) are always float32.
Raises:
NotImplementedError: If try to use padded decode method on CPU/GPUs.
"""
    inputs = inputs if isinstance(inputs, list) else [inputs]
    if len(inputs) == 2:
      inputs, targets = inputs[0], inputs[1]
    else:
      # Decoding path.
      inputs, targets = inputs[0], None
      if self.params["padded_decode"]:
        if not self.params["num_replicas"]:
          raise NotImplementedError(
              "Padded decoding on CPU/GPUs is not supported.")
        decode_batch_size = int(self.params["decode_batch_size"] /
                                self.params["num_replicas"])
        inputs.set_shape([decode_batch_size, self.params["decode_max_length"]])

    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    with tf.name_scope("Transformer"):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias, training)
      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias, training)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias, training)
        return logits

  def encode(self, inputs, attention_bias, training):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
      training: boolean, whether in training mode or not.

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      embedded_inputs = tf.cast(embedded_inputs, self.params["dtype"])
      inputs_padding = model_utils.get_padding(inputs)
      attention_bias = tf.cast(attention_bias, self.params["dtype"])

      with tf.name_scope("add_pos_encoding"):
        pos_encoding = self.position_embedding(inputs=embedded_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        encoder_inputs = embedded_inputs + pos_encoding

      if training:
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, rate=self.params["layer_postprocess_dropout"])

      return self.encoder_stack(
          encoder_inputs, attention_bias, inputs_padding, training=training)

  def decode(self, targets, encoder_outputs, attention_bias, training):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float
        tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
      training: boolean, whether in training mode or not.

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
      decoder_inputs = self.embedding_softmax_layer(targets)
      decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
      attention_bias = tf.cast(attention_bias, self.params["dtype"])
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        pos_encoding = self.position_embedding(decoder_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        decoder_inputs += pos_encoding
      if training:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, rate=self.params["layer_postprocess_dropout"])

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length, dtype=self.params["dtype"])
      outputs = self.decoder_stack(
          decoder_inputs,
          encoder_outputs,
          decoder_self_attention_bias,
          attention_bias,
          training=training)
      logits = self.embedding_softmax_layer(outputs, mode="linear")
      logits = tf.cast(logits, tf.float32)
      return logits

  def _get_symbols_to_logits_fn(self, max_decode_length, training):
    """Returns a decoding function that calculates logits of the next tokens."""
    timing_signal = self.position_embedding(
        inputs=None, length=max_decode_length + 1)
    timing_signal = tf.cast(timing_signal, self.params["dtype"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length, dtype=self.params["dtype"])

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences. int tensor with shape [batch_size *
          beam_size, i + 1].
        i: Loop index.
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i]

      if self.params["padded_decode"]:
        bias_shape = decoder_self_attention_bias.shape.as_list()
        self_attention_bias = tf.slice(
            decoder_self_attention_bias, [0, 0, i, 0],
            [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
      else:
        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

      decoder_outputs = self.decoder_stack(
          decoder_input,
          cache.get("encoder_outputs"),
          self_attention_bias,
          cache.get("encoder_decoder_attention_bias"),
          training=training,
          cache=cache,
          decode_loop_step=i if self.params["padded_decode"] else None)
      logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache

    return symbols_to_logits_fn

  def predict(self, encoder_outputs, encoder_decoder_attention_bias, training):
    """Return predicted sequence."""
    encoder_outputs = tf.cast(encoder_outputs, self.params["dtype"])
    if self.params["padded_decode"]:
      batch_size = encoder_outputs.shape.as_list()[0]
      input_length = encoder_outputs.shape.as_list()[1]
    else:
      batch_size = tf.shape(encoder_outputs)[0]
      input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params["extra_decode_length"]
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             self.params["dtype"])

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(
        max_decode_length, training)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    # pylint: disable=g-complex-comprehension
    init_decode_length = (
        max_decode_length if self.params["padded_decode"] else 0)
    num_heads = self.params["num_heads"]
    dim_per_head = self.params["hidden_size"] // num_heads
    cache = {
        "layer_%d" % layer: {
            "k":
                tf.zeros(
                    [batch_size, init_decode_length, num_heads, dim_per_head],
                    dtype=self.params["dtype"]),
            "v":
                tf.zeros(
                    [batch_size, init_decode_length, num_heads, dim_per_head],
                    dtype=self.params["dtype"])
        } for layer in range(self.params["num_hidden_layers"])
    }
    # pylint: enable=g-complex-comprehension

    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    # Use beam search to find the top beam_size sequences and scores.
    decoded_ids, scores = beam_search.sequence_beam_search(
        symbols_to_logits_fn=symbols_to_logits_fn,
        initial_ids=initial_ids,
        initial_cache=cache,
        vocab_size=self.params["vocab_size"],
        beam_size=self.params["beam_size"],
        alpha=self.params["alpha"],
        max_decode_length=max_decode_length,
        eos_id=EOS_ID,
        padded_decode=self.params["padded_decode"],
        dtype=self.params["dtype"])

    # Get the top sequence for each batch element
    top_decoded_ids = decoded_ids[:, 0, 1:]
    top_scores = scores[:, 0]

    return {"outputs": top_decoded_ids, "scores": top_scores}
class PrePostProcessingWrapper(tf.keras.layers.Layer):
  """Wrapper class that applies layer pre-processing and post-processing."""

  def __init__(self, layer, params):
    super(PrePostProcessingWrapper, self).__init__()
    self.layer = layer
    self.params = params
    self.postprocess_dropout = params["layer_postprocess_dropout"]

  def build(self, input_shape):
    # Create normalization layer
    self.layer_norm = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(PrePostProcessingWrapper, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, x, *args, **kwargs):
    """Calls wrapped layer with same parameters."""
    # Preprocessing: apply layer normalization
    training = kwargs["training"]

    y = self.layer_norm(x)

    # Get layer output
    y = self.layer(y, *args, **kwargs)

    # Postprocessing: apply dropout and residual connection
    if training:
      y = tf.nn.dropout(y, rate=self.postprocess_dropout)
    return x + y


class EncoderStack(tf.keras.layers.Layer):
  """Transformer encoder stack.

  The encoder stack is made up of N identical layers. Each layer is composed
  of the sublayers:
    1. Self-attention layer
    2. Feedforward network (which is 2 fully-connected layers)
  """

  def __init__(self, params):
    super(EncoderStack, self).__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the encoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])

    # Create final layer normalization layer.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(EncoderStack, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, encoder_inputs, attention_bias, inputs_padding, training):
    """Return the output of the encoder layer stacks.

    Args:
      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: bias for the encoder self-attention layer. [batch_size,
        1, 1, input_length]
      inputs_padding: tensor with shape [batch_size, input_length], inputs with
        zero paddings.
      training: boolean, whether in training mode or not.

    Returns:
      Output of encoder layer stack.
      float32 tensor with shape [batch_size, input_length, hidden_size]
    """
    for n, layer in enumerate(self.layers):
      # Run inputs through the sublayers.
      self_attention_layer = layer[0]
      feed_forward_network = layer[1]

      with tf.name_scope("layer_%d" % n):
        with tf.name_scope("self_attention"):
          encoder_inputs = self_attention_layer(
              encoder_inputs, attention_bias, training=training)
        with tf.name_scope("ffn"):
          encoder_inputs = feed_forward_network(
              encoder_inputs, training=training)

    return self.output_normalization(encoder_inputs)


class DecoderStack(tf.keras.layers.Layer):
  """Transformer decoder stack.

  Like the encoder stack, the decoder stack is made up of N identical layers.
  Each layer is composed of the sublayers:
    1. Self-attention layer
    2. Multi-headed attention layer combining encoder outputs with results from
       the previous self-attention layer.
    3. Feedforward network (2 fully-connected layers)
  """

  def __init__(self, params):
    super(DecoderStack, self).__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the decoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      enc_dec_attention_layer = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(enc_dec_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(DecoderStack, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self,
           decoder_inputs,
           encoder_outputs,
           decoder_self_attention_bias,
           attention_bias,
           training,
           cache=None,
           decode_loop_step=None):
    """Return the output of the decoder layer stacks.

    Args:
      decoder_inputs: A tensor with shape [batch_size, target_length,
        hidden_size].
      encoder_outputs: A tensor with shape [batch_size, input_length,
        hidden_size]
      decoder_self_attention_bias: A tensor with shape [1, 1, target_len,
        target_length], the bias for decoder self-attention layer.
      attention_bias: A tensor with shape [batch_size, 1, 1, input_length], the
        bias for encoder-decoder attention layer.
      training: A bool, whether in training mode or not.
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
          {layer_n: {"k": A tensor with shape [batch_size, i, key_channels],
                     "v": A tensor with shape [batch_size, i, value_channels]},
           ...}
      decode_loop_step: An integer, the step number of the decoding loop. Used
        only for autoregressive inference on TPU.

    Returns:
      Output of decoder layer stack.
      float32 tensor with shape [batch_size, target_length, hidden_size]
    """
    for n, layer in enumerate(self.layers):
      self_attention_layer = layer[0]
      enc_dec_attention_layer = layer[1]
      feed_forward_network = layer[2]

      # Run inputs through the sublayers.
      layer_name = "layer_%d" % n
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.name_scope(layer_name):
        with tf.name_scope("self_attention"):
          decoder_inputs = self_attention_layer(
              decoder_inputs,
              decoder_self_attention_bias,
              training=training,
              cache=layer_cache,
              decode_loop_step=decode_loop_step)
        with tf.name_scope("encdec_attention"):
          decoder_inputs = enc_dec_attention_layer(
              decoder_inputs, encoder_outputs, attention_bias,
              training=training)
        with tf.name_scope("ffn"):
          decoder_inputs = feed_forward_network(
              decoder_inputs, training=training)

    return self.output_normalization(decoder_inputs)
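Note: for orientation, the following is a minimal usage sketch of the Transformer model defined above; it is not part of the deleted file. It assumes the official.nlp.transformer package from this tree is importable, and the toy tensors and TINY_PARAMS overrides are illustrative only.

import tensorflow as tf
from official.nlp.transformer import model_params, transformer

params = dict(model_params.TINY_PARAMS)   # copy so the shared dict is not mutated
params["dtype"] = tf.float32
params["padded_decode"] = False

model = transformer.Transformer(params, name="transformer_v2")

inputs = tf.constant([[5, 2, 1, 0]], dtype=tf.int64)   # [batch_size, input_length]
targets = tf.constant([[4, 3, 4, 0]], dtype=tf.int64)  # [batch_size, target_length]

# Training path: passing [inputs, targets] returns per-token logits
# with shape [batch_size, target_length, vocab_size].
logits = model([inputs, targets], training=True)

# Inference path: passing only [inputs] runs beam search and returns a dict.
ret = model([inputs], training=False)
decoded_ids, scores = ret["outputs"], ret["scores"]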
official/nlp/transformer/transformer_forward_test.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Forward pass test for Transformer model refactoring."""
import numpy as np
import tensorflow as tf

from official.nlp.modeling import models
from official.nlp.transformer import metrics
from official.nlp.transformer import model_params
from official.nlp.transformer import transformer


def _count_params(layer, trainable_only=True):
  """Returns the count of all model parameters, or just trainable ones."""
  if not trainable_only:
    return layer.count_params()
  else:
    return int(
        np.sum([
            tf.keras.backend.count_params(p) for p in layer.trainable_weights
        ]))


def _create_model(params, is_train):
  """Creates transformer model."""

  encdec_kwargs = dict(
      num_layers=params["num_hidden_layers"],
      num_attention_heads=params["num_heads"],
      intermediate_size=params["filter_size"],
      activation="relu",
      dropout_rate=params["relu_dropout"],
      attention_dropout_rate=params["attention_dropout"],
      use_bias=False,
      norm_first=True,
      norm_epsilon=1e-6,
      intermediate_dropout=params["relu_dropout"])
  encoder_layer = models.TransformerEncoder(**encdec_kwargs)
  decoder_layer = models.TransformerDecoder(**encdec_kwargs)

  model_kwargs = dict(
      vocab_size=params["vocab_size"],
      embedding_width=params["hidden_size"],
      dropout_rate=params["layer_postprocess_dropout"],
      padded_decode=params["padded_decode"],
      decode_max_length=params["decode_max_length"],
      dtype=params["dtype"],
      extra_decode_length=params["extra_decode_length"],
      beam_size=params["beam_size"],
      alpha=params["alpha"],
      encoder_layer=encoder_layer,
      decoder_layer=decoder_layer,
      name="transformer_v2")

  if is_train:
    inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
    internal_model = models.Seq2SeqTransformer(**model_kwargs)
    logits = internal_model(
        dict(inputs=inputs, targets=targets), training=is_train)
    vocab_size = params["vocab_size"]
    label_smoothing = params["label_smoothing"]
    if params["enable_metrics_in_training"]:
      logits = metrics.MetricLayer(vocab_size)([logits, targets])
    logits = tf.keras.layers.Lambda(
        lambda x: x, name="logits", dtype=tf.float32)(
            logits)
    model = tf.keras.Model([inputs, targets], logits)
    loss = metrics.transformer_loss(logits, targets, label_smoothing,
                                    vocab_size)
    model.add_loss(loss)
    return model

  batch_size = params["decode_batch_size"] if params["padded_decode"] else None
  inputs = tf.keras.layers.Input((None,),
                                 batch_size=batch_size,
                                 dtype="int64",
                                 name="inputs")
  internal_model = models.Seq2SeqTransformer(**model_kwargs)
  ret = internal_model(dict(inputs=inputs), training=is_train)
  outputs, scores = ret["outputs"], ret["scores"]
  return tf.keras.Model(inputs, [outputs, scores])


class TransformerForwardTest(tf.test.TestCase):

  def setUp(self):
    super(TransformerForwardTest, self).setUp()
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 3
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 0
    params["beam_size"] = 3
    params["dtype"] = tf.float32
    params["layer_postprocess_dropout"] = 0.0
    params["attention_dropout"] = 0.0
    params["relu_dropout"] = 0.0

  def test_forward_pass_train(self):
    # Set input_len different from target_len
    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])
    targets = np.asarray([[4, 3, 4, 0], [13, 19, 17, 8], [20, 14, 1, 2],
                          [5, 7, 3, 0]])

    # src_model is the original model before refactored.
    src_model = transformer.create_model(self.params, True)
    src_num_weights = _count_params(src_model)
    src_weights = src_model.get_weights()
    src_model_output = src_model([inputs, targets], training=True)

    # dest_model is the refactored model.
    dest_model = _create_model(self.params, True)
    dest_num_weights = _count_params(dest_model)
    self.assertEqual(src_num_weights, dest_num_weights)
    dest_model.set_weights(src_weights)
    dest_model_output = dest_model([inputs, targets], training=True)
    self.assertAllEqual(src_model_output, dest_model_output)

  def test_forward_pass_not_train(self):
    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])

    # src_model is the original model before refactored.
    src_model = transformer.create_model(self.params, False)
    src_num_weights = _count_params(src_model)
    src_weights = src_model.get_weights()
    src_model_output = src_model([inputs], training=False)

    # dest_model is the refactored model.
    dest_model = _create_model(self.params, False)
    dest_num_weights = _count_params(dest_model)
    self.assertEqual(src_num_weights, dest_num_weights)
    dest_model.set_weights(src_weights)
    dest_model_output = dest_model([inputs], training=False)
    self.assertAllEqual(src_model_output[0], dest_model_output[0])
    self.assertAllEqual(src_model_output[1], dest_model_output[1])


if __name__ == "__main__":
  tf.test.main()
official/nlp/transformer/transformer_layers_test.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for layers in Transformer."""
import tensorflow as tf

from official.nlp.transformer import attention_layer
from official.nlp.transformer import embedding_layer
from official.nlp.transformer import ffn_layer
from official.nlp.transformer import metrics


class TransformerLayersTest(tf.test.TestCase):

  def test_attention_layer(self):
    hidden_size = 64
    num_heads = 4
    dropout = 0.5
    dim_per_head = hidden_size // num_heads
    layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
    self.assertDictEqual(
        layer.get_config(), {
            "hidden_size": hidden_size,
            "num_heads": num_heads,
            "attention_dropout": dropout,
        })
    length = 2
    x = tf.ones([1, length, hidden_size])
    bias = tf.ones([1])
    cache = {
        "k": tf.zeros([1, 0, num_heads, dim_per_head]),
        "v": tf.zeros([1, 0, num_heads, dim_per_head]),
    }
    y = layer(x, bias, training=True, cache=cache)
    self.assertEqual(y.shape, (1, length, 64,))
    self.assertEqual(cache["k"].shape, (1, length, num_heads, dim_per_head,))
    self.assertEqual(cache["v"].shape, (1, length, num_heads, dim_per_head,))

  def test_embedding_shared_weights(self):
    vocab_size = 50
    hidden_size = 64
    length = 2
    layer = embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size)
    self.assertDictEqual(layer.get_config(), {
        "vocab_size": 50,
        "hidden_size": 64,
    })

    idx = tf.ones([1, length], dtype="int32")
    y = layer(idx)
    self.assertEqual(y.shape, (1, length, hidden_size,))
    x = tf.ones([1, length, hidden_size])
    output = layer(x, "linear")
    self.assertEqual(output.shape, (1, length, vocab_size,))

  def test_feed_forward_network(self):
    hidden_size = 64
    filter_size = 32
    relu_dropout = 0.5
    layer = ffn_layer.FeedForwardNetwork(hidden_size, filter_size, relu_dropout)
    self.assertDictEqual(
        layer.get_config(), {
            "hidden_size": hidden_size,
            "filter_size": filter_size,
            "relu_dropout": relu_dropout,
        })
    length = 2
    x = tf.ones([1, length, hidden_size])
    y = layer(x, training=True)
    self.assertEqual(y.shape, (1, length, hidden_size,))

  def test_metric_layer(self):
    vocab_size = 50
    logits = tf.keras.layers.Input((None, vocab_size),
                                   dtype="float32",
                                   name="logits")
    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
    output_logits = metrics.MetricLayer(vocab_size)([logits, targets])
    self.assertEqual(output_logits.shape.as_list(), [None, None, vocab_size,])


if __name__ == "__main__":
  tf.test.main()
official/nlp/transformer/transformer_main.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Train and evaluate the Transformer model.
See README for description of setting the training schedule and evaluating the
BLEU score.
"""
import os
import tempfile

# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf

from official.common import distribute_utils
from official.modeling import performance
from official.nlp.transformer import compute_bleu
from official.nlp.transformer import data_pipeline
from official.nlp.transformer import metrics
from official.nlp.transformer import misc
from official.nlp.transformer import optimizer
from official.nlp.transformer import transformer
from official.nlp.transformer import translate
from official.nlp.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils

# pylint:disable=logging-format-interpolation

INF = int(1e9)
BLEU_DIR = "bleu"
_SINGLE_SAMPLE = 1


def translate_and_compute_bleu(model,
                               params,
                               subtokenizer,
                               bleu_source,
                               bleu_ref,
                               distribution_strategy=None):
  """Translate file and report the cased and uncased bleu scores.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    distribution_strategy: A platform distribution strategy, used for TPU based
      translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  # Create temporary file to store translation.
  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp_filename = tmp.name

  translate.translate_file(
      model,
      params,
      subtokenizer,
      bleu_source,
      output_file=tmp_filename,
      print_all_translations=False,
      distribution_strategy=distribution_strategy)

  # Compute uncased and cased bleu scores.
  uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
  cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
  os.remove(tmp_filename)
  return uncased_score, cased_score


def evaluate_and_log_bleu(model,
                          params,
                          bleu_source,
                          bleu_ref,
                          vocab_file,
                          distribution_strategy=None):
  """Calculate and record the BLEU score.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    vocab_file: A file containing the vocabulary for translation.
    distribution_strategy: A platform distribution strategy, used for TPU based
      translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      model, params, subtokenizer, bleu_source, bleu_ref,
      distribution_strategy)

  logging.info("Bleu score (uncased): %s", uncased_score)
  logging.info("Bleu score (cased): %s", cased_score)
  return uncased_score, cased_score


class TransformerTask(object):
  """Main entry of Transformer model."""

  def __init__(self, flags_obj):
    """Init function of TransformerMain.

    Args:
      flags_obj: Object containing parsed flag values, i.e., FLAGS.

    Raises:
      ValueError: if not using static batch for input data on TPU.
    """
    self.flags_obj = flags_obj
    self.predict_model = None

    # Add flag-defined parameters to params object
    num_gpus = flags_core.get_num_gpus(flags_obj)
    self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus)

    params["num_gpus"] = num_gpus
    params["use_ctl"] = flags_obj.use_ctl
    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["static_batch"] = flags_obj.static_batch
    params["max_length"] = flags_obj.max_length
    params["decode_batch_size"] = flags_obj.decode_batch_size
    params["decode_max_length"] = flags_obj.decode_max_length
    params["padded_decode"] = flags_obj.padded_decode
    params["max_io_parallelism"] = (
        flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE)

    params["use_synthetic_data"] = flags_obj.use_synthetic_data
    params["batch_size"] = flags_obj.batch_size or params["default_batch_size"]
    params["repeat_dataset"] = None
    params["dtype"] = flags_core.get_tf_dtype(flags_obj)
    params["enable_tensorboard"] = flags_obj.enable_tensorboard
    params["enable_metrics_in_training"] = flags_obj.enable_metrics_in_training
    params["steps_between_evals"] = flags_obj.steps_between_evals
    params["enable_checkpointing"] = flags_obj.enable_checkpointing
    params["save_weights_only"] = flags_obj.save_weights_only

    self.distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu or "")
    if self.use_tpu:
      params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync
    else:
      logging.info("Running transformer with num_gpus = %d", num_gpus)

    if self.distribution_strategy:
      logging.info("For training, using distribution strategy: %s",
                   self.distribution_strategy)
    else:
      logging.info("Not using any distribution strategy.")

    performance.set_mixed_precision_policy(params["dtype"])

  @property
  def use_tpu(self):
    if self.distribution_strategy:
      return isinstance(self.distribution_strategy, tf.distribute.TPUStrategy)
    return False

  def train(self):
    """Trains the model."""
    params = self.params
    flags_obj = self.flags_obj
    # Sets config options.
    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    with distribute_utils.get_strategy_scope(self.distribution_strategy):
      model = transformer.create_model(params, is_train=True)
      opt = self._create_optimizer()

      current_step = 0
      checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
      latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
      if latest_checkpoint:
        checkpoint.restore(latest_checkpoint)
        logging.info("Loaded checkpoint %s", latest_checkpoint)
        current_step = opt.iterations.numpy()

      if params["use_ctl"]:
        train_loss_metric = tf.keras.metrics.Mean(
            "training_loss", dtype=tf.float32)
        if params["enable_tensorboard"]:
          summary_writer = tf.summary.create_file_writer(
              os.path.join(flags_obj.model_dir, "summary"))
        else:
          summary_writer = tf.summary.create_noop_writer()
        train_metrics = [train_loss_metric]
        if params["enable_metrics_in_training"]:
          train_metrics = train_metrics + model.metrics
      else:
        model.compile(opt)

    model.summary()

    if self.use_tpu:
      # Different from experimental_distribute_dataset,
      # distribute_datasets_from_function requires
      # per-replica/local batch size.
      params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
      train_ds = (
          self.distribution_strategy.distribute_datasets_from_function(
              lambda ctx: data_pipeline.train_input_fn(params, ctx)))
    else:
      train_ds = data_pipeline.train_input_fn(params)
      map_data_fn = data_pipeline.map_data_for_transformer_fn
      train_ds = train_ds.map(
          map_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if params["use_ctl"]:
      train_ds_iterator = iter(train_ds)

    callbacks = self._create_callbacks(flags_obj.model_dir, params)

    # Only TimeHistory callback is supported for CTL
    if params["use_ctl"]:
      callbacks = [
          cb for cb in callbacks if isinstance(cb, keras_utils.TimeHistory)
      ]

    @tf.function
    def train_steps(iterator, steps):
      """Training steps function for TPU runs.

      Args:
        iterator: The input iterator of the training dataset.
        steps: An integer, the number of training steps.

      Returns:
        A float, the loss value.
      """

      def _step_fn(inputs):
        """Per-replica step function."""
        inputs, targets = inputs
        with tf.GradientTape() as tape:
          logits = model([inputs, targets], training=True)
          loss = metrics.transformer_loss(logits, targets,
                                          params["label_smoothing"],
                                          params["vocab_size"])
          # Scales the loss, which results in using the average loss across all
          # of the replicas for backprop.
          scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync

        # De-dupes variables due to keras tracking issues.
        tvars = list({id(v): v for v in model.trainable_variables}.values())
        grads = tape.gradient(scaled_loss, tvars)
        opt.apply_gradients(zip(grads, tvars))
        # For reporting, the metric takes the mean of losses.
        train_loss_metric.update_state(loss)

      for _ in tf.range(steps):
        train_loss_metric.reset_states()
        self.distribution_strategy.run(_step_fn, args=(next(iterator),))

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    while current_step < flags_obj.train_steps:
      remaining_steps = flags_obj.train_steps - current_step
      train_steps_per_eval = (
          remaining_steps if remaining_steps < flags_obj.steps_between_evals
          else flags_obj.steps_between_evals)
      current_iteration = current_step // flags_obj.steps_between_evals

      logging.info(
          "Start train iteration at global step:{}".format(current_step))
      history = None
      if params["use_ctl"]:
        if not self.use_tpu:
          raise NotImplementedError(
              "Custom training loop on GPUs is not implemented.")

        # Runs training steps.
        with summary_writer.as_default():
          for cb in callbacks:
            cb.on_epoch_begin(current_iteration)
            cb.on_batch_begin(0)

          train_steps(
              train_ds_iterator,
              tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
          current_step += train_steps_per_eval
          train_loss = train_loss_metric.result().numpy().astype(float)
          logging.info("Train Step: %d/%d / loss = %s", current_step,
                       flags_obj.train_steps, train_loss)

          for cb in callbacks:
            cb.on_batch_end(train_steps_per_eval - 1)
            cb.on_epoch_end(current_iteration)

          if params["enable_tensorboard"]:
            for metric_obj in train_metrics:
              tf.summary.scalar(metric_obj.name, metric_obj.result(),
                                current_step)
              summary_writer.flush()

        for cb in callbacks:
          cb.on_train_end()

        if flags_obj.enable_checkpointing:
          # avoid check-pointing when running for benchmarking.
          checkpoint_name = checkpoint.save(
              os.path.join(flags_obj.model_dir,
                           "ctl_step_{}.ckpt".format(current_step)))
          logging.info("Saved checkpoint to %s", checkpoint_name)
      else:
        if self.use_tpu:
          raise NotImplementedError(
              "Keras model.fit on TPUs is not implemented.")
        history = model.fit(
            train_ds,
            initial_epoch=current_iteration,
            epochs=current_iteration + 1,
            steps_per_epoch=train_steps_per_eval,
            callbacks=callbacks,
            # If TimeHistory is enabled, progress bar would be messy. Increase
            # the verbose level to get rid of it.
            verbose=(2 if flags_obj.enable_time_history else 1))
        current_step += train_steps_per_eval
        logging.info("Train history: {}".format(history.history))

      logging.info(
          "End train iteration at global step:{}".format(current_step))

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()
        cased_score_history.append([current_iteration + 1, cased_score])
        uncased_score_history.append([current_iteration + 1, uncased_score])

    stats = ({"loss": train_loss} if history is None else {})
    misc.update_stats(history, stats, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
      stats["bleu_uncased_history"] = uncased_score_history
      stats["bleu_cased_history"] = cased_score_history
    return stats

  def eval(self):
    """Evaluates the model."""
    distribution_strategy = self.distribution_strategy if self.use_tpu else None

    # We only want to create the model under DS scope for TPU case.
    # When 'distribution_strategy' is None, a no-op DummyContextManager will
    # be used.
    with distribute_utils.get_strategy_scope(distribution_strategy):
      if not self.predict_model:
        self.predict_model = transformer.create_model(self.params, False)
      self._load_weights_if_possible(
          self.predict_model,
          tf.train.latest_checkpoint(self.flags_obj.model_dir))
      self.predict_model.summary()
    return evaluate_and_log_bleu(
        self.predict_model, self.params, self.flags_obj.bleu_source,
        self.flags_obj.bleu_ref, self.flags_obj.vocab_file,
        distribution_strategy)

  def predict(self):
    """Predicts result from the model."""
    params = self.params
    flags_obj = self.flags_obj

    with tf.name_scope("model"):
      model = transformer.create_model(params, is_train=False)
      self._load_weights_if_possible(
          model, tf.train.latest_checkpoint(self.flags_obj.model_dir))
      model.summary()
    subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

    ds = data_pipeline.eval_input_fn(params)
    ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
    ret = model.predict(ds)
    val_outputs, _ = ret
    length = len(val_outputs)
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)

  def _create_callbacks(self, cur_log_dir, params):
    """Creates a list of callbacks."""
    callbacks = misc.get_callbacks()
    if params["enable_checkpointing"]:
      ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
      callbacks.append(
          tf.keras.callbacks.ModelCheckpoint(
              ckpt_full_path, save_weights_only=params["save_weights_only"]))
    return callbacks

  def _load_weights_if_possible(self, model, init_weight_path=None):
    """Loads model weights when it is provided."""
    if init_weight_path:
      logging.info("Load weights: {}".format(init_weight_path))
      if self.use_tpu:
        checkpoint = tf.train.Checkpoint(
            model=model, optimizer=self._create_optimizer())
        checkpoint.restore(init_weight_path)
      else:
        model.load_weights(init_weight_path)
    else:
      logging.info("Weights not loaded from path:{}".format(init_weight_path))

  def _create_optimizer(self):
    """Creates optimizer."""
    params = self.params
    lr_schedule = optimizer.LearningRateSchedule(
        params["learning_rate"], params["hidden_size"],
        params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(
        lr_schedule,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    opt = performance.configure_optimizer(
        opt,
        use_float16=params["dtype"] == tf.float16,
        loss_scale=flags_core.get_loss_scale(
            self.flags_obj, default_for_fp16="dynamic"))

    return opt


def _ensure_dir(log_dir):
  """Makes log dir if not existed."""
  if not tf.io.gfile.exists(log_dir):
    tf.io.gfile.makedirs(log_dir)


def main(_):
  flags_obj = flags.FLAGS
  if flags_obj.enable_mlir_bridge:
    tf.config.experimental.enable_mlir_bridge()
  task = TransformerTask(flags_obj)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    keras_utils.set_gpu_thread_mode_and_count(
        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
        num_gpus=flags_obj.num_gpus,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads)

  if flags_obj.mode == "train":
    task.train()
  elif flags_obj.mode == "predict":
    task.predict()
  elif flags_obj.mode == "eval":
    task.eval()
  else:
    raise ValueError("Invalid mode {}".format(flags_obj.mode))


if __name__ == "__main__":
  logging.set_verbosity(logging.INFO)
  misc.define_transformer_flags()
  app.run(main)
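Note: a minimal sketch of driving TransformerTask directly from Python, mirroring the flag setup used in transformer_main_test.py below; the flag values and the model_dir path are illustrative, and a real run would point data_dir at preprocessed WMT TFRecords instead of synthetic data.

from absl import flags
from official.nlp.transformer import misc, transformer_main

misc.define_transformer_flags()
FLAGS = flags.FLAGS
FLAGS(["transformer_main"])          # parse defaults; a program name is required
FLAGS.param_set = "tiny"
FLAGS.use_synthetic_data = True      # smoke run without a data_dir
FLAGS.train_steps = 10
FLAGS.steps_between_evals = 10
FLAGS.batch_size = 16
FLAGS.model_dir = "/tmp/transformer_smoke"   # placeholder path
FLAGS.distribution_strategy = "off"
FLAGS.dtype = "fp32"

task = transformer_main.TransformerTask(FLAGS)
stats = task.train()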
official/nlp/transformer/transformer_main_test.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import os
import re
import sys
import unittest

from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
from tensorflow.python.eager import context
# pylint: disable=ungrouped-imports
from official.nlp.transformer import misc
from official.nlp.transformer import transformer_main

FLAGS = flags.FLAGS
FIXED_TIMESTAMP = 'my_time_stamp'
WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')


def _generate_file(filepath, lines):
  with open(filepath, 'w') as f:
    for l in lines:
      f.write('{}\n'.format(l))


class TransformerTaskTest(tf.test.TestCase):
  local_flags = None

  def setUp(self):
    # pylint: disable=g-missing-super-call
    temp_dir = self.get_temp_dir()
    if TransformerTaskTest.local_flags is None:
      misc.define_transformer_flags()
      # Loads flags, array cannot be blank.
      flags.FLAGS(['foo'])
      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
    else:
      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
    FLAGS.param_set = 'tiny'
    FLAGS.use_synthetic_data = True
    FLAGS.steps_between_evals = 1
    FLAGS.train_steps = 1
    FLAGS.validation_steps = 1
    FLAGS.batch_size = 4
    FLAGS.max_length = 1
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.dtype = 'fp32'
    self.model_dir = FLAGS.model_dir
    self.temp_dir = temp_dir
    self.vocab_file = os.path.join(temp_dir, 'vocab')
    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
    self.orig_policy = (
        tf.compat.v2.keras.mixed_precision.global_policy())

  def tearDown(self):
    # pylint: disable=g-missing-super-call
    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)

  def _assert_exists(self, filepath):
    self.assertTrue(os.path.exists(filepath))

  def test_train_no_dist_strat(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_save_full_model(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.save_weights_only = False
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_static_batch(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.distribution_strategy = 'one_device'
    if tf.test.is_built_with_cuda():
      FLAGS.num_gpus = 1
    else:
      FLAGS.num_gpus = 0
    FLAGS.static_batch = True
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_1_gpu_with_dist_strat(self):
    FLAGS.distribution_strategy = 'one_device'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_fp16(self):
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu_fp16(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def _prepare_files_and_flags(self, *extra_flags):
    # Make log dir.
    if not os.path.exists(self.temp_dir):
      os.makedirs(self.temp_dir)

    # Fake vocab, bleu_source and bleu_ref.
    tokens = [
        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'",
        "'b_'", "'c_'", "'d_'"
    ]
    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
    _generate_file(self.vocab_file, tokens)
    _generate_file(self.bleu_source, ['a b', 'c d'])
    _generate_file(self.bleu_ref, ['a b', 'd c'])

    # Update flags.
    update_flags = [
        'ignored_program_name',
        '--vocab_file={}'.format(self.vocab_file),
        '--bleu_source={}'.format(self.bleu_source),
        '--bleu_ref={}'.format(self.bleu_ref),
    ]
    if extra_flags:
      update_flags.extend(extra_flags)
    FLAGS(update_flags)

  def test_predict(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_predict_fp16(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags('--dtype=fp16')
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  def test_eval(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    if 'test_xla' in sys.argv[0]:
      self.skipTest('TODO(xla): Make this test faster under XLA.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.eval()


if __name__ == '__main__':
  tf.test.main()
official/nlp/transformer/transformer_test.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import tensorflow as tf

from official.nlp.transformer import model_params
from official.nlp.transformer import transformer


class TransformerV2Test(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["use_synthetic_data"] = True
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 2
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 2
    params["beam_size"] = 3
    params["dtype"] = tf.float32

  def test_create_model_train(self):
    model = transformer.create_model(self.params, True)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 2)
    self.assertEqual(len(outputs), 1)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(inputs[1].shape.as_list(), [None, None])
    self.assertEqual(inputs[1].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
    self.assertEqual(outputs[0].dtype, tf.float32)

  def test_create_model_not_train(self):
    model = transformer.create_model(self.params, False)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 1)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None])
    self.assertEqual(outputs[0].dtype, tf.int32)
    self.assertEqual(outputs[1].shape.as_list(), [None])
    self.assertEqual(outputs[1].dtype, tf.float32)

  def test_export(self):
    model = transformer.Transformer(self.params, name="transformer_v2")
    export_dir = self.get_temp_dir()
    batch_size = 5
    max_length = 6

    class SaveModule(tf.Module):

      def __init__(self, model):
        super(SaveModule, self).__init__()
        self.model = model

      @tf.function
      def serve(self, x):
        return self.model.call([x], training=False)

    save_module = SaveModule(model)
    tensor_shape = (None, None)
    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
    _ = save_module.serve(sample_input)
    signatures = dict(
        serving_default=save_module.serve.get_concrete_function(
            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
    tf.saved_model.save(save_module, export_dir, signatures=signatures)
    imported = tf.saved_model.load(export_dir)
    serving_fn = imported.signatures["serving_default"]
    all_outputs = serving_fn(sample_input)
    output = all_outputs["outputs"]
    output_shapes = output.shape.as_list()
    self.assertEqual(output_shapes[0], batch_size)
    self.assertEqual(output_shapes[1],
                     max_length + model.params["extra_decode_length"])


if __name__ == "__main__":
  tf.test.main()
official/nlp/transformer/translate.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translate text or files using trained transformer model."""
# Import libraries
from absl import logging
import numpy as np
import tensorflow as tf

from official.nlp.transformer.utils import tokenizer

_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6


def _get_sorted_inputs(filename):
  """Read and sort lines from the file sorted by decreasing length.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
  with tf.io.gfile.GFile(filename) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
  sorted_inputs = [None] * len(sorted_input_lens)
  sorted_keys = [0] * len(sorted_input_lens)
  for i, (index, _) in enumerate(sorted_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
  return sorted_inputs, sorted_keys


def _encode_and_add_eos(line, subtokenizer):
  """Encode line with subtokenizer, and add EOS id to the end."""
  return subtokenizer.encode(line) + [tokenizer.EOS_ID]


def _trim_and_decode(ids, subtokenizer):
  """Trim EOS and PAD tokens from ids, and decode to return a string."""
  try:
    index = list(ids).index(tokenizer.EOS_ID)
    return subtokenizer.decode(ids[:index])
  except ValueError:
    # No EOS found in sequence
    return subtokenizer.decode(ids)


def translate_file(model,
                   params,
                   subtokenizer,
                   input_file,
                   output_file=None,
                   print_all_translations=True,
                   distribution_strategy=None):
  """Translate lines in file, and save to output file if specified.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    input_file: A file containing lines to translate.
    output_file: A file that stores the generated translations.
    print_all_translations: A bool. If true, all translations are printed to
      stdout.
    distribution_strategy: A distribution strategy, used to perform inference
      directly with tf.function instead of Keras model.predict().

  Raises:
    ValueError: if output file is invalid.
  """
  batch_size = params["decode_batch_size"]

  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  total_samples = len(sorted_inputs)
  num_decode_batches = (total_samples - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i in range(num_decode_batches):
      lines = [
          sorted_inputs[j + i * batch_size]
          for j in range(batch_size)
          if j + i * batch_size < total_samples
      ]
      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
      if distribution_strategy:
        for j in range(batch_size - len(lines)):
          lines.append([tokenizer.EOS_ID])
      batch = tf.keras.preprocessing.sequence.pad_sequences(
          lines,
          maxlen=params["decode_max_length"],
          dtype="int32",
          padding="post")
      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
      yield batch

  @tf.function
  def predict_step(inputs):
    """Decoding step function for TPU runs."""

    def _step_fn(inputs):
      """Per replica step function."""
      tag = inputs[0]
      val_inputs = inputs[1]
      val_outputs, _ = model([val_inputs], training=False)
      return tag, val_outputs

    return distribution_strategy.run(_step_fn, args=(inputs,))

  translations = []
  if distribution_strategy:
    num_replicas = distribution_strategy.num_replicas_in_sync
    local_batch_size = params["decode_batch_size"] // num_replicas
  for i, text in enumerate(input_generator()):
    if distribution_strategy:
      text = np.reshape(text, [num_replicas, local_batch_size, -1])
      # Add tag to the input of each replica with the reordering logic after
      # outputs, to ensure the output order matches the input order.
      text = tf.constant(text)

      @tf.function
      def text_as_per_replica():
        replica_context = tf.distribute.get_replica_context()
        replica_id = replica_context.replica_id_in_sync_group
        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop

      text = distribution_strategy.run(text_as_per_replica)
      outputs = distribution_strategy.experimental_local_results(
          predict_step(text))
      val_outputs = [output for _, output in outputs]

      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
    else:
      val_outputs, _ = model.predict(text)

    length = len(val_outputs)
    for j in range(length):
      if j + i * batch_size < total_samples:
        translation = _trim_and_decode(val_outputs[j], subtokenizer)
        translations.append(translation)
        if print_all_translations:
          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
                       sorted_inputs[j + i * batch_size], translation)

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    if tf.io.gfile.isdir(output_file):
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    logging.info("Writing to file %s", output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])


def translate_from_text(model, subtokenizer, txt):
  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
  result = model.predict(encoded_txt)
  outputs = result["outputs"]
  logging.info("Original: \"%s\"", txt)
  translate_from_input(outputs, subtokenizer)


def translate_from_input(outputs, subtokenizer):
  translation = _trim_and_decode(outputs, subtokenizer)
  logging.info("Translation: \"%s\"", translation)
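Note: a hedged sketch of calling translate_file directly, mirroring how evaluate_and_log_bleu in transformer_main.py above drives it; the parameter set, checkpoint and file paths are placeholders and must match the trained model.

import tensorflow as tf
from official.nlp.transformer import misc, transformer, translate
from official.nlp.transformer.utils import tokenizer

params = misc.get_model_params("base", 0)
params["decode_batch_size"] = 32
params["decode_max_length"] = 97
params["padded_decode"] = False
params["dtype"] = tf.float32

model = transformer.create_model(params, is_train=False)
model.load_weights("/path/to/checkpoint")                # placeholder path

subtokenizer = tokenizer.Subtokenizer("/path/to/vocab")  # placeholder path
translate.translate_file(
    model, params, subtokenizer,
    input_file="/path/to/source.txt",
    output_file="/path/to/translations.txt",
    print_all_translations=False)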
official/nlp/transformer/utils/__init__.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/nlp/transformer/utils/metrics.py  deleted  100644 → 0
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math

import numpy as np
import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf


def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]

    max_length = tf.maximum(x_length, y_length)

    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
    return x, y


def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
          confidence * tf.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights
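
# Worked example of the label-smoothing arithmetic above (illustrative, not part
# of the original file): with smoothing=0.1 and vocab_size=6, confidence = 0.9
# and low_confidence = (1 - 0.9) / 5 = 0.02, so the soft target row for label
# id 2 is [0.02, 0.02, 0.9, 0.02, 0.02, 0.02]. The subtracted
# normalizing_constant is the entropy of that smoothed distribution, i.e. the
# lowest cross entropy a perfect model could achieve against it.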


def _convert_to_eval_metric(metric_fn):
  """Wrap a metric fn that returns scores and weights as an eval metric fn.

  The input metric_fn returns values for the current batch. The wrapper
  aggregates the return values collected over all of the batches evaluated.

  Args:
    metric_fn: function that returns scores and weights for the current batch's
      logits and predicted labels.

  Returns:
    function that aggregates the scores and weights from metric_fn.
  """

  def problem_metric_fn(*args):
    """Returns an aggregation of the metric_fn's returned values."""
    (scores, weights) = metric_fn(*args)

    # The tf.metrics.mean function assures correct aggregation.
    return tf.metrics.mean(scores, weights)

  return problem_metric_fn


def get_eval_metrics(logits, labels, params):
  """Return dictionary of model evaluation metrics."""
  metrics = {
      "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels),
      "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(
          logits, labels),
      "accuracy_per_sequence": _convert_to_eval_metric(
          padded_sequence_accuracy)(logits, labels),
      "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(
          logits, labels, params["vocab_size"]),
  }

  if not params["use_tpu"]:
    # TPU does not support tf.py_func
    metrics.update({
        "approx_bleu_score": _convert_to_eval_metric(bleu_score)(
            logits, labels),
        "rouge_2_fscore": _convert_to_eval_metric(rouge_2_fscore)(
            logits, labels),
        "rouge_L_fscore": _convert_to_eval_metric(rouge_l_fscore)(
            logits, labels),
    })

  # Prefix each of the metric names with "metrics/". This allows the metric
  # graphs to display under the "metrics" category in TensorBoard.
  metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)}
  return metrics


def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights


def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, outputs = tf.nn.top_k(logits, k=effective_k)
    outputs = tf.cast(outputs, tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    padded_labels = tf.expand_dims(padded_labels, axis=-1)
    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
    same_topk = tf.reduce_sum(same, axis=-1)
    return same_topk, weights


def padded_accuracy_top5(logits, labels):
  return padded_accuracy_topk(logits, labels, 5)


def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    not_correct = (
        tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights)
    axis = list(range(1, len(outputs.get_shape())))
    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
    return correct_seq, tf.constant(1.0)


def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing."""
  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
  return -num, den


def bleu_score(logits, labels):
  """Approximate BLEU score computation between labels and predictions.

  An approximate BLEU scoring method since we do not glue word pieces or
  decode the ids and tokenize the output. By default, we use ngram order of 4
  and use brevity penalty. Also, this does not have beam search.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch-size, length_labels]

  Returns:
    bleu: int, approx bleu score
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
  return bleu, tf.constant(1.0)


def _get_ngrams_with_counter(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
      methods.

  Returns:
    The Counter containing all n-grams upto max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in xrange(1, max_order + 1):
    for i in xrange(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 use_bp=True):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of references for each translation. Each
      reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
      should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    use_bp: boolean, whether to apply brevity penalty.

  Returns:
    BLEU score.
  """
  reference_length = 0
  translation_length = 0
  bp = 1.0
  geo_mean = 0

  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  precisions = []

  for (references, translations) in zip(reference_corpus, translation_corpus):
    reference_length += len(references)
    translation_length += len(translations)
    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)

    overlap = dict((ngram, min(count, translation_ngram_counts[ngram]))
                   for ngram, count in ref_ngram_counts.items())

    for ngram in overlap:
      matches_by_order[len(ngram) - 1] += overlap[ngram]
    for ngram in translation_ngram_counts:
      possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
          ngram]

  precisions = [0] * max_order
  smooth = 1.0

  for i in xrange(0, max_order):
    if possible_matches_by_order[i] > 0:
      precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
      if matches_by_order[i] > 0:
        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
            i]
      else:
        smooth *= 2
        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
    else:
      precisions[i] = 0.0

  if max(precisions) > 0:
    p_log_sum = sum(math.log(p) for p in precisions if p)
    geo_mean = math.exp(p_log_sum / max_order)

  if use_bp:
    ratio = translation_length / reference_length
    bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
  bleu = geo_mean * bp
  return np.float32(bleu)
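
# Illustrative sanity check for compute_bleu (not part of the original file).
# Both corpora are lists of already-tokenized id sequences, matching how
# bleu_score() feeds argmax ids through tf.py_func:
#
#   compute_bleu([[1, 2, 3, 4]], [[1, 2, 3, 4]])        # -> 1.0 (exact match)
#   compute_bleu([[1, 2, 3, 4]], [[1, 2, 4, 3]]) < 1.0  # reordering lowers the
#                                                       # higher-order precisions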


def rouge_2_fscore(logits, labels):
  """ROUGE-2 F1 score computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    logits: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge2_fscore: approx rouge-2 f1 score.
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
  return rouge_2_f_score, tf.constant(1.0)


def _get_ngrams(n, text):
  """Calculates n-grams.

  Args:
    n: which n-grams to calculate
    text: An array of tokens

  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set


def rouge_n(eval_sentences, ref_sentences, n=2):
  """Computes ROUGE-N f1 score of two text collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Args:
    eval_sentences: Predicted sentences.
    ref_sentences: Sentences from the reference set
    n: Size of ngram. Defaults to 2.

  Returns:
    f1 score for ROUGE-N
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    eval_ngrams = _get_ngrams(n, eval_sentence)
    ref_ngrams = _get_ngrams(n, ref_sentence)
    ref_count = len(ref_ngrams)
    eval_count = len(eval_ngrams)

    # Count the overlapping ngrams between evaluated and reference
    overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if eval_count == 0:
      precision = 0.0
    else:
      precision = float(overlapping_count) / eval_count
    if ref_count == 0:
      recall = 0.0
    else:
      recall = float(overlapping_count) / ref_count

    f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8)))

  # return overlapping_count / reference_count
  return np.mean(f1_scores, dtype=np.float32)
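
# Illustrative example for rouge_n (not part of the original file): with
# eval_sentences=[[1, 2, 3, 4]] and ref_sentences=[[1, 2, 3, 5]], the bigram
# sets are {(1,2), (2,3), (3,4)} and {(1,2), (2,3), (3,5)}, so precision and
# recall are both 2/3 and the returned F1 is ~0.667 (up to the 1e-8 smoothing
# term in the denominator).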


def rouge_l_fscore(predictions, labels):
  """ROUGE scores computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    predictions: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge_l_fscore: approx rouge-l f1 score.
  """
  outputs = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
  rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
                               tf.float32)
  return rouge_l_f_score, tf.constant(1.0)


def rouge_l_sentence_level(eval_sentences, ref_sentences):
  """Computes ROUGE-L (sentence level) of two collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Calculated according to:
    R_lcs = LCS(X,Y)/m
    P_lcs = LCS(X,Y)/n
    F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)

  where:
    X = reference summary
    Y = Candidate summary
    m = length of reference summary
    n = length of candidate summary

  Args:
    eval_sentences: The sentences that have been picked by the summarizer
    ref_sentences: The sentences from the reference set

  Returns:
    A float: F_lcs
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    m = float(len(ref_sentence))
    n = float(len(eval_sentence))
    lcs = _len_lcs(eval_sentence, ref_sentence)
    f1_scores.append(_f_lcs(lcs, m, n))
  return np.mean(f1_scores, dtype=np.float32)


def _len_lcs(x, y):
  """Returns the length of the Longest Common Subsequence between two seqs.

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: sequence of words
    y: sequence of words

  Returns
    integer: Length of LCS between x and y
  """
  table = _lcs(x, y)
  n, m = len(x), len(y)
  return table[n, m]


def _lcs(x, y):
  """Computes the length of the LCS between two seqs.

  The implementation below uses a DP programming algorithm and runs
  in O(nm) time where n = len(x) and m = len(y).

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: collection of words
    y: collection of words

  Returns:
    Table of dictionary of coord and len lcs
  """
  n, m = len(x), len(y)
  table = dict()
  for i in range(n + 1):
    for j in range(m + 1):
      if i == 0 or j == 0:
        table[i, j] = 0
      elif x[i - 1] == y[j - 1]:
        table[i, j] = table[i - 1, j - 1] + 1
      else:
        table[i, j] = max(table[i - 1, j], table[i, j - 1])
  return table
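
# Worked example of the DP table above (illustrative, not part of the original
# file): for x = [1, 2, 3, 4] and y = [2, 4, 3] the longest common subsequence
# has length 2 (e.g. [2, 3] or [2, 4]), so _len_lcs(x, y) == table[4, 3] == 2,
# and _f_lcs(2, m=3, n=4) then gives the sentence-level ROUGE-L F-measure.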


def _f_lcs(llcs, m, n):
  """Computes the LCS-based F-measure score.

  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
  rouge-working-note-v1.3.1.pdf

  Args:
    llcs: Length of LCS
    m: number of words in reference summary
    n: number of words in candidate summary

  Returns:
    Float. LCS-based F-measure score
  """
  r_lcs = llcs / m
  p_lcs = llcs / n
  beta = p_lcs / (r_lcs + 1e-12)
  num = (1 + (beta**2)) * r_lcs * p_lcs
  denom = r_lcs + ((beta**2) * p_lcs)
  f_lcs = num / (denom + 1e-12)
  return f_lcs
official/nlp/transformer/utils/tokenizer.py
deleted
100644 → 0
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines Subtokenizer class to encode and decode strings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
import sys
import unicodedata

from absl import logging
import numpy as np
import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# pylint: disable=g-complex-comprehension

PAD = "<pad>"
PAD_ID = 0
EOS = "<EOS>"
EOS_ID = 1
RESERVED_TOKENS = [PAD, EOS]

# Set of characters that will be used in the function _escape_token() (see func
# docstring for more details).
# This set is added to the alphabet list to ensure that all escaped tokens can
# be encoded.
_ESCAPE_CHARS = set(u"\\_u;0123456789")
# Regex for the function _unescape_token(), the inverse of _escape_token().
# This is used to find "\u", "\\", and "\###;" substrings in the token.
_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")

_UNDEFINED_UNICODE = u"\u3013"


def alphanumeric_char_set():
  return set(
      six.unichr(i)
      for i in xrange(sys.maxunicode)
      if (unicodedata.category(six.unichr(i)).startswith("L") or
          unicodedata.category(six.unichr(i)).startswith("N")))


# Set contains all letter and number characters.
_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()

# min_count is the minimum number of times a subtoken must appear in the data
# before it is added to the vocabulary. The value is found using binary
# search to obtain the target vocabulary size.
_MIN_MIN_COUNT = 1  # min value to use when binary searching for min_count
_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count


class Subtokenizer(object):
  """Encodes and decodes strings to/from integer IDs."""

  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
    """Initializes class, creating a vocab file if data_files is provided."""
    logging.info("Initializing Subtokenizer from file %s.", vocab_file)

    if master_char_set is None:
      master_char_set = _ALPHANUMERIC_CHAR_SET

    if reserved_tokens is None:
      reserved_tokens = RESERVED_TOKENS

    self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
    self.alphabet = _generate_alphabet_dict(self.subtoken_list)
    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)

    self.max_subtoken_length = 0
    for subtoken in self.subtoken_list:
      self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))

    # Create cache to speed up subtokenization
    self._cache_size = 2**20
    self._cache = [(None, None)] * self._cache_size

    self._master_char_set = master_char_set

  @staticmethod
  def init_from_files(vocab_file,
                      files,
                      target_vocab_size,
                      threshold,
                      min_count=None,
                      file_byte_limit=1e6,
                      reserved_tokens=None,
                      correct_strip=True,
                      master_char_set=None):
    """Create subtoken vocabulary based on files, and save vocab to file.

    Args:
      vocab_file: String name of vocab file to store subtoken vocabulary.
      files: List of file paths that will be used to generate vocabulary.
      target_vocab_size: target vocabulary size to generate.
      threshold: int threshold of vocabulary size to accept.
      min_count: int minimum count to use for generating the vocabulary. The min
        count is the minimum number of times a subtoken should appear in the
        files before it is added to the vocabulary. If set to none, this value
        is found using binary search.
      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
        will be drawn from the files.
      reserved_tokens: List of string tokens that are guaranteed to be at the
        beginning of the subtoken vocabulary list.
      correct_strip: Whether to convert text to unicode before strip.
      master_char_set: the char set.

    Returns:
      Subtokenizer object
    """
    if master_char_set is None:
      master_char_set = _ALPHANUMERIC_CHAR_SET
    if reserved_tokens is None:
      reserved_tokens = RESERVED_TOKENS

    if tf.io.gfile.exists(vocab_file):
      logging.info("Vocab file already exists (%s)", vocab_file)
    else:
      logging.info("Begin steps to create subtoken vocabulary...")
      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
                                   master_char_set)
      alphabet = _generate_alphabet_dict(token_counts)
      subtoken_list = _generate_subtokens_with_target_vocab_size(
          token_counts, alphabet, target_vocab_size, threshold, min_count,
          reserved_tokens)
      logging.info("Generated vocabulary with %d subtokens.",
                   len(subtoken_list))
      _save_vocab_file(vocab_file, subtoken_list)
    return Subtokenizer(vocab_file, master_char_set=master_char_set)

  def encode(self, raw_string, add_eos=False):
    """Encodes a string into a list of int subtoken ids."""
    ret = []
    tokens = _split_string_to_tokens(
        native_to_unicode(raw_string), self._master_char_set)
    for token in tokens:
      ret.extend(self._token_to_subtoken_ids(token))
    if add_eos:
      assert EOS in self.subtoken_list, \
          "Can't append 'EOS' because it is not in list of known subtokens."
      ret.append(EOS_ID)
    return ret

  def _token_to_subtoken_ids(self, token):
    """Encode a single token into a list of subtoken ids."""
    cache_location = hash(token) % self._cache_size
    cache_key, cache_value = self._cache[cache_location]
    if cache_key == token:
      return cache_value

    ret = _split_token_to_subtokens(
        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
        self.max_subtoken_length)
    ret = [self.subtoken_to_id_dict[subtoken_id] for subtoken_id in ret]

    self._cache[cache_location] = (token, ret)
    return ret

  def decode(self, subtokens):
    """Converts list of int subtokens ids into a string."""
    if isinstance(subtokens, np.ndarray):
      # Note that list(subtokens) converts subtokens to a python list, but the
      # items remain as np.int32. This converts both the array and its items.
      subtokens = subtokens.tolist()

    if not subtokens:
      return ""

    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
        "Subtokens argument passed into decode() must be a list of integers.")

    return _unicode_to_native(
        _join_tokens_to_string(
            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
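
  # Illustrative round trip (not part of the original file), assuming a vocab
  # file produced by init_from_files() or obtained with the WMT data scripts
  # (the path is a placeholder):
  #
  #   subtokenizer = Subtokenizer("/tmp/vocab.ende.32768")
  #   ids = subtokenizer.encode("hello world", add_eos=True)  # [..., EOS_ID]
  #   text = subtokenizer.decode(ids[:-1])                    # "hello world"
  #
  # EOS_ID is itself a vocabulary entry, so it is trimmed before decoding here,
  # mirroring what translate.py's _trim_and_decode does.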

  def _subtoken_ids_to_tokens(self, subtokens):
    """Convert list of int subtoken ids to a list of string tokens."""
    escaped_tokens = "".join([
        self.subtoken_list[s] for s in subtokens
        if s < len(self.subtoken_list)
    ])
    escaped_tokens = escaped_tokens.split("_")

    # All tokens in the vocabulary list have been escaped (see _escape_token())
    # so each token must be unescaped when decoding.
    ret = []
    for token in escaped_tokens:
      if token:
        ret.append(_unescape_token(token))
    return ret


def _save_vocab_file(vocab_file, subtoken_list):
  """Save subtokens to file."""
  with tf.io.gfile.GFile(vocab_file, mode="w") as f:
    for subtoken in subtoken_list:
      f.write("'%s'\n" % _unicode_to_native(subtoken))


def _load_vocab_file(vocab_file, reserved_tokens=None):
  """Load vocabulary while ensuring reserved tokens are at the top."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  subtoken_list = []
  with tf.io.gfile.GFile(vocab_file, mode="r") as f:
    for line in f:
      subtoken = native_to_unicode(line.strip())
      subtoken = subtoken[1:-1]  # Remove surrounding single-quotes
      if subtoken in reserved_tokens:
        continue
      subtoken_list.append(native_to_unicode(subtoken))
  return reserved_tokens + subtoken_list


def native_to_unicode(s):
  """Convert string to unicode (required in Python 2)."""
  try:  # Python 2
    return s if isinstance(s, unicode) else s.decode("utf-8")
  except NameError:  # Python 3
    return s


def _unicode_to_native(s):
  """Convert string from unicode to native format (required in Python 2)."""
  try:  # Python 2
    return s.encode("utf-8") if isinstance(s, unicode) else s
  except NameError:  # Python 3
    return s


def _split_string_to_tokens(text, master_char_set):
  """Splits text to a list of string tokens."""
  if not text:
    return []
  ret = []
  token_start = 0
  # Classify each character in the input string
  is_master = [c in master_char_set for c in text]
  for pos in xrange(1, len(text)):
    if is_master[pos] != is_master[pos - 1]:
      token = text[token_start:pos]
      if token != u" " or token_start == 0:
        ret.append(token)
      token_start = pos
  final_token = text[token_start:]
  ret.append(final_token)
  return ret


def _join_tokens_to_string(tokens, master_char_set):
  """Join a list of string tokens into a single string."""
  token_is_master = [t[0] in master_char_set for t in tokens]
  ret = []
  for i, token in enumerate(tokens):
    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
      ret.append(u" ")
    ret.append(token)
  return "".join(ret)


def _escape_token(token, alphabet):
  r"""Replace characters that aren't in the alphabet and append "_" to token.

  Apply three transformations to the token:
    1. Replace underline character "_" with "\u", and backslash "\" with "\\".
    2. Replace characters outside of the alphabet with "\###;", where ### is the
       character's Unicode code point.
    3. Appends "_" to mark the end of a token.

  Args:
    token: unicode string to be escaped
    alphabet: list of all known characters

  Returns:
    escaped string
  """
  token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
  ret = [c if c in alphabet and c != u"\n" else r"\%d;" % ord(c) for c in token]
  return u"".join(ret) + "_"


def _unescape_token(token):
  r"""Replaces escaped characters in the token with their unescaped versions.

  Applies inverse transformations as _escape_token():
    1. Replace "\u" with "_", and "\\" with "\".
    2. Replace "\###;" with the unicode character the ### refers to.

  Args:
    token: escaped string

  Returns:
    unescaped string
  """

  def match(m):
    r"""Returns replacement string for matched object.

    Matched objects contain one of the strings that matches the regex pattern:
      r"\\u|\\\\|\\([0-9]+);"
    The strings can be '\u', '\\', or '\###;' (### is any digit number).

    m.group(0) refers to the entire matched string ('\u', '\\', or '\###;').
    m.group(1) refers to the first parenthesized subgroup ('###').

    m.group(0) exists for all match objects, while m.group(1) exists only for
    the string '\###;'.

    This function looks to see if m.group(1) exists. If it doesn't, then the
    matched string must be '\u' or '\\'. In this case, the corresponding
    replacement ('_' and '\') are returned. Note that in python, a single
    backslash is written as '\\', and double backslash as '\\\\'.

    If m.group(1) exists, then use the integer in m.group(1) to return a
    unicode character.

    Args:
      m: match object

    Returns:
      String to replace matched object with.
    """
    # Check if the matched strings are '\u' or '\\'.
    if m.group(1) is None:
      return u"_" if m.group(0) == u"\\u" else u"\\"

    # If m.group(1) exists, try and return unicode character.
    try:
      return six.unichr(int(m.group(1)))
    except (ValueError, OverflowError) as _:
      return _UNDEFINED_UNICODE

  # Use match function to replace escaped substrings in the token.
  return _UNESCAPE_REGEX.sub(match, token)
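
# Illustrative escape/unescape round trip (mirrors test_escape_token and
# test_unescape_token in tokenizer_test.py, not part of the original file):
# escaping the token "abc_\4" with alphabet {a, b, c, _, \, u, ;} yields
# "abc\u\\\52;_": "_" becomes "\u", "\" becomes "\\", the out-of-alphabet
# character "4" (code point 52) becomes "\52;", and the trailing "_" marks the
# token boundary. _unescape_token() applied to "abc\u\\\52;" restores "abc_\4".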


def _count_tokens(files,
                  file_byte_limit=1e6,
                  correct_strip=True,
                  master_char_set=None):
  """Return token counts of words in the files.

  Samples file_byte_limit bytes from each file, and counts the words that appear
  in the samples. The samples are semi-evenly distributed across the file.

  Args:
    files: List of filepaths
    file_byte_limit: Max number of bytes that will be read from each file.
    correct_strip: Whether to convert text to unicode before strip. This affects
      vocabulary generation for PY2. Setting correct_strip to False in PY2
      reproduces the previous common public result; setting it to True lets PY2
      and PY3 produce a consistent vocabulary.
    master_char_set: the char set.

  Returns:
    Dictionary mapping tokens to the number of times they appear in the sampled
    lines from the files.
  """
  if master_char_set is None:
    master_char_set = _ALPHANUMERIC_CHAR_SET

  token_counts = collections.defaultdict(int)

  for filepath in files:
    with tf.io.gfile.GFile(filepath, mode="r") as reader:
      file_byte_budget = file_byte_limit
      counter = 0
      lines_to_skip = int(reader.size() / (file_byte_budget * 2))
      for line in reader:
        if counter < lines_to_skip:
          counter += 1
        else:
          if file_byte_budget < 0:
            break
          if correct_strip:
            line = native_to_unicode(line)
          line = line.strip()
          file_byte_budget -= len(line)
          counter = 0

          # Add words to token counts
          for token in _split_string_to_tokens(
              native_to_unicode(line), master_char_set):
            token_counts[token] += 1
  return token_counts


def _list_to_index_dict(lst):
  """Create dictionary mapping list items to their indices in the list."""
  return {item: n for n, item in enumerate(lst)}


def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
  """Splits a token into subtokens defined in the subtoken dict."""
  ret = []
  start = 0
  token_len = len(token)
  while start < token_len:
    # Find the longest subtoken, so iterate backwards.
    for end in xrange(min(token_len, start + max_subtoken_length), start, -1):
      subtoken = token[start:end]
      if subtoken in subtoken_dict:
        ret.append(subtoken)
        start = end
        break
    else:  # Did not break
      # If there is no possible encoding of the escaped token then one of the
      # characters in the token is not in the alphabet. This should be
      # impossible and would be indicative of a bug.
      raise ValueError("Was unable to split token \"%s\" into subtokens." %
                       token)
  return ret
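
# Illustrative greedy split (mirrors test_split_token_to_subtokens in
# tokenizer_test.py, not part of the original file): with
# subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3} and max_subtoken_length = 2,
# _split_token_to_subtokens("abc", subtoken_dict, 2) tries the longest match at
# each position first and returns ["ab", "c"].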


def _generate_subtokens_with_target_vocab_size(token_counts,
                                               alphabet,
                                               target_size,
                                               threshold,
                                               min_count=None,
                                               reserved_tokens=None):
  """Generate subtoken vocabulary close to the target size."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  if min_count is not None:
    logging.info("Using min_count=%d to generate vocab with target size %d",
                 min_count, target_size)
    return _generate_subtokens(
        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)

  def bisect(min_val, max_val):
    """Recursive function to binary search for subtoken vocabulary."""
    cur_count = (min_val + max_val) // 2
    logging.info("Binary search: trying min_count=%d (%d %d)", cur_count,
                 min_val, max_val)
    subtoken_list = _generate_subtokens(
        token_counts, alphabet, cur_count, reserved_tokens=reserved_tokens)

    val = len(subtoken_list)
    logging.info("Binary search: min_count=%d resulted in %d tokens", cur_count,
                 val)

    within_threshold = abs(val - target_size) < threshold
    if within_threshold or min_val >= max_val or cur_count < 2:
      return subtoken_list
    if val > target_size:
      other_subtoken_list = bisect(cur_count + 1, max_val)
    else:
      other_subtoken_list = bisect(min_val, cur_count - 1)

    # Return vocabulary dictionary with the closest number of tokens.
    other_val = len(other_subtoken_list)
    if abs(other_val - target_size) < abs(val - target_size):
      return other_subtoken_list
    return subtoken_list

  logging.info("Finding best min_count to get target size of %d", target_size)
  return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)


def _generate_alphabet_dict(iterable, reserved_tokens=None):
  """Create set of characters that appear in any element in the iterable."""
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS
  alphabet = {c for token in iterable for c in token}
  alphabet |= {c for token in reserved_tokens for c in token}
  alphabet |= _ESCAPE_CHARS  # Add escape characters to alphabet set.
  return alphabet


def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
                             max_subtoken_length):
  """Count number of times subtokens appear, and generate new subtokens.

  Args:
    token_counts: dict mapping tokens to the number of times they appear in the
      original files.
    alphabet: list of allowed characters. Used to escape the tokens, which
      guarantees that all tokens can be split into subtokens.
    subtoken_dict: dict mapping subtokens to ids.
    max_subtoken_length: maximum length of subtoken in subtoken_dict.

  Returns:
    A defaultdict mapping subtokens to the number of times they appear in the
    tokens. The dict may contain new subtokens.
  """
  subtoken_counts = collections.defaultdict(int)
  for token, count in six.iteritems(token_counts):
    token = _escape_token(token, alphabet)
    subtokens = _split_token_to_subtokens(token, subtoken_dict,
                                          max_subtoken_length)

    # Generate new subtokens by taking substrings from token.
    start = 0
    for subtoken in subtokens:
      for end in xrange(start + 1, len(token) + 1):
        new_subtoken = token[start:end]
        subtoken_counts[new_subtoken] += count
      start += len(subtoken)

  return subtoken_counts


def _filter_and_bucket_subtokens(subtoken_counts, min_count):
  """Return a bucketed list of subtokens that are filtered by count.

  Args:
    subtoken_counts: defaultdict mapping subtokens to their counts
    min_count: int count used to filter subtokens

  Returns:
    List of subtoken sets, where subtokens in set i have the same length=i.
  """
  # Create list of buckets, where subtokens in bucket i have length i.
  subtoken_buckets = []
  for subtoken, count in six.iteritems(subtoken_counts):
    if count < min_count:  # Filter out subtokens that don't appear enough
      continue
    while len(subtoken_buckets) <= len(subtoken):
      subtoken_buckets.append(set())
    subtoken_buckets[len(subtoken)].add(subtoken)
  return subtoken_buckets


def _gen_new_subtoken_list(subtoken_counts,
                           min_count,
                           alphabet,
                           reserved_tokens=None):
  """Generate candidate subtokens ordered by count, and new max subtoken length.

  Add subtokens to the candidate list in order of length (longest subtokens
  first). When a subtoken is added, the counts of each of its prefixes are
  decreased. Prefixes that don't appear much outside the subtoken are not added
  to the candidate list.

  For example:
    subtoken being added to candidate list: 'translate'
    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
    min_count: 5

  When 'translate' is added, subtoken_counts is updated to:
    {'translate':0, 't':30, 'tr':6, 'tra': 2, ...}

  The subtoken 'tra' will not be added to the candidate list, because it appears
  twice (less than min_count) outside of 'translate'.

  Args:
    subtoken_counts: defaultdict mapping str subtokens to int counts
    min_count: int minimum count requirement for subtokens
    alphabet: set of characters. Each character is added to the subtoken list to
      guarantee that all tokens can be encoded.
    reserved_tokens: list of tokens that will be added to the beginning of the
      returned subtoken list.

  Returns:
    List of candidate subtokens in decreasing count order, and maximum subtoken
    length
  """
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  # Create a list of (count, subtoken) for each candidate subtoken.
  subtoken_candidates = []

  # Use bucketed list to iterate through subtokens in order of length.
  # subtoken_buckets[i] = set(subtokens), where each subtoken has length i.
  subtoken_buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
  max_subtoken_length = len(subtoken_buckets) - 1

  # Go through the list in reverse order to consider longer subtokens first.
  for subtoken_len in xrange(max_subtoken_length, 0, -1):
    for subtoken in subtoken_buckets[subtoken_len]:
      count = subtoken_counts[subtoken]

      # Possible if this subtoken is a prefix of another token.
      if count < min_count:
        continue

      # Ignore alphabet/reserved tokens, which will be added manually later.
      if subtoken not in alphabet and subtoken not in reserved_tokens:
        subtoken_candidates.append((count, subtoken))

      # Decrement count of the subtoken's prefixes (if a longer subtoken is
      # added, its prefixes lose priority to be added).
      for end in xrange(1, subtoken_len):
        subtoken_counts[subtoken[:end]] -= count

  # Add alphabet subtokens (guarantees that all strings are encodable).
  subtoken_candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)

  # Order subtoken candidates by decreasing count.
  subtoken_list = [t for _, t in sorted(subtoken_candidates, reverse=True)]

  # Add reserved tokens to beginning of the list.
  subtoken_list = reserved_tokens + subtoken_list
  return subtoken_list, max_subtoken_length


def _generate_subtokens(token_counts,
                        alphabet,
                        min_count,
                        num_iterations=4,
                        reserved_tokens=None):
  """Create a list of subtokens in decreasing order of frequency.

  Args:
    token_counts: dict mapping str tokens -> int count
    alphabet: set of characters
    min_count: int minimum number of times a subtoken must appear before it is
      added to the vocabulary.
    num_iterations: int number of iterations to generate new tokens.
    reserved_tokens: list of tokens that will be added to the beginning to the
      returned subtoken list.

  Returns:
    Sorted list of subtokens (most frequent first)
  """
  if reserved_tokens is None:
    reserved_tokens = RESERVED_TOKENS

  # Use alphabet set to create initial list of subtokens
  subtoken_list = reserved_tokens + list(alphabet)
  max_subtoken_length = 1

  # On each iteration, segment all words using the subtokens defined in
  # subtoken_dict, count how often the resulting subtokens appear, and update
  # the dictionary with subtokens w/ high enough counts.
  for i in xrange(num_iterations):
    logging.info("\tGenerating subtokens: iteration %d", i)
    # Generate new subtoken->id dictionary using the new subtoken list.
    subtoken_dict = _list_to_index_dict(subtoken_list)

    # Create dict mapping subtoken->count, with additional subtokens created
    # from substrings taken from the tokens.
    subtoken_counts = _count_and_gen_subtokens(token_counts, alphabet,
                                               subtoken_dict,
                                               max_subtoken_length)

    # Generate new list of subtokens sorted by subtoken count.
    subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

    logging.info("\tVocab size: %d", len(subtoken_list))
  return subtoken_list
official/nlp/transformer/utils/tokenizer_test.py
deleted
100644 → 0
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Subtokenizer and string helper methods."""
import collections
import tempfile

import tensorflow as tf

from official.nlp.transformer.utils import tokenizer


class SubtokenizerTest(tf.test.TestCase):

  def _init_subtokenizer(self, vocab_list):
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.io.gfile.GFile(temp_file.name, "w") as w:
      for subtoken in vocab_list:
        w.write("'%s'" % subtoken)
        w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])

  def test_encode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    s = "testing 123"
    encoded_list = subtokenizer.encode(s)
    self.assertEqual([1, 2, 0], encoded_list)

  def test_decode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    decoded_str = subtokenizer.decode(encoded_list)
    self.assertEqual("testing 123", decoded_str)

  def test_subtoken_ids_to_tokens(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
    self.assertEqual([u"testing", u"123"], token_list)


class StringHelperTest(tf.test.TestCase):

  def test_split_string_to_tokens(self):
    text = "test? testing 123."

    tokens = tokenizer._split_string_to_tokens(
        text, tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)

  def test_join_tokens_to_string(self):
    tokens = ["test", "? ", "testing", "123", "."]

    s = tokenizer._join_tokens_to_string(tokens,
                                         tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual("test? testing 123.", s)

  def test_escape_token(self):
    token = u"abc_\\4"
    alphabet = set("abc_\\u;")

    escaped_token = tokenizer._escape_token(token, alphabet)
    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)

  def test_unescape_token(self):
    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"

    unescaped_token = tokenizer._unescape_token(escaped_token)
    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4", unescaped_token)

  def test_list_to_index_dict(self):
    lst = ["test", "strings"]

    d = tokenizer._list_to_index_dict(lst)
    self.assertDictEqual({"test": 0, "strings": 1}, d)

  def test_split_token_to_subtokens(self):
    token = "abc"
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
    max_subtoken_length = 2

    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
                                                    max_subtoken_length)
    self.assertEqual(["ab", "c"], subtokens)

  def test_generate_alphabet_dict(self):
    s = ["testing", "123"]
    reserved_tokens = ["???"]

    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
    self.assertIn("?", alphabet)
    self.assertIn("t", alphabet)
    self.assertIn("e", alphabet)
    self.assertIn("s", alphabet)
    self.assertIn("i", alphabet)
    self.assertIn("n", alphabet)
    self.assertIn("g", alphabet)
    self.assertIn("1", alphabet)
    self.assertIn("2", alphabet)
    self.assertIn("3", alphabet)

  def test_count_and_gen_subtokens(self):
    token_counts = {"abc": 5}
    alphabet = set("abc_")
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
    max_subtoken_length = 2

    subtoken_counts = tokenizer._count_and_gen_subtokens(
        token_counts, alphabet, subtoken_dict, max_subtoken_length)

    self.assertIsInstance(subtoken_counts, collections.defaultdict)
    self.assertDictEqual(
        {
            "a": 5,
            "b": 5,
            "c": 5,
            "_": 5,
            "ab": 5,
            "bc": 5,
            "c_": 5,
            "abc": 5,
            "bc_": 5,
            "abc_": 5
        }, subtoken_counts)

  def test_filter_and_bucket_subtokens(self):
    subtoken_counts = collections.defaultdict(int, {
        "a": 2,
        "b": 4,
        "c": 1,
        "ab": 6,
        "ac": 3,
        "abbc": 5
    })
    min_count = 3

    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
        subtoken_counts, min_count)

    self.assertEqual(len(subtoken_buckets[0]), 0)
    self.assertEqual(set("b"), subtoken_buckets[1])
    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
    self.assertEqual(len(subtoken_buckets[3]), 0)
    self.assertEqual(set(["abbc"]), subtoken_buckets[4])

  def test_gen_new_subtoken_list(self):
    subtoken_counts = collections.defaultdict(int, {
        "translate": 10,
        "t": 40,
        "tr": 16,
        "tra": 12
    })
    min_count = 5
    alphabet = set("translate")
    reserved_tokens = ["reserved", "tokens"]

    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

    # Check that "tra" isn't in the list (its count should be decremented to 2,
    # so it should not be added to the candidate list).
    self.assertNotIn("tra", subtoken_list)
    self.assertIn("tr", subtoken_list)
    self.assertIn("t", subtoken_list)

    self.assertEqual(len("translate"), max_token_length)

  def test_generate_subtokens(self):
    token_counts = {"ab": 1, "bc": 3, "abc": 5}
    alphabet = set("abc_")
    min_count = 100
    num_iterations = 1
    reserved_tokens = ["reserved", "tokens"]

    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
                                               min_count, num_iterations,
                                               reserved_tokens)

    # Check that reserved tokens are at the front of the list
    self.assertEqual(vocab_list[:2], reserved_tokens)

    # Check that each character in alphabet is in the vocab list
    for c in alphabet:
      self.assertIn(c, vocab_list)


if __name__ == "__main__":
  tf.test.main()