Commit 3b158095 authored by Ilya Mironov's avatar Ilya Mironov
Browse files

Merge branch 'master' of https://github.com/ilyamironov/models

parents a90db800 be659c2f
This diff is collapsed.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test beam search helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import beam_search
class BeamSearchHelperTests(tf.test.TestCase):
def test_expand_to_beam_size(self):
x = tf.ones([7, 4, 2, 5])
x = beam_search._expand_to_beam_size(x, 3)
with self.test_session() as sess:
shape = sess.run(tf.shape(x))
self.assertAllEqual([7, 3, 4, 2, 5], shape)
def test_shape_list(self):
y = tf.constant(4.0)
x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5])
shape = beam_search._shape_list(x)
self.assertIsInstance(shape[0], int)
self.assertIsInstance(shape[1], tf.Tensor)
self.assertIsInstance(shape[2], int)
self.assertIsInstance(shape[3], int)
def test_get_shape_keep_last_dim(self):
y = tf.constant(4.0)
x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5])
shape = beam_search._get_shape_keep_last_dim(x)
self.assertAllEqual([None, None, None, 5],
shape.as_list())
def test_flatten_beam_dim(self):
x = tf.ones([7, 4, 2, 5])
x = beam_search._flatten_beam_dim(x)
with self.test_session() as sess:
shape = sess.run(tf.shape(x))
self.assertAllEqual([28, 2, 5], shape)
def test_unflatten_beam_dim(self):
x = tf.ones([28, 2, 5])
x = beam_search._unflatten_beam_dim(x, 7, 4)
with self.test_session() as sess:
shape = sess.run(tf.shape(x))
self.assertAllEqual([7, 4, 2, 5], shape)
def test_gather_beams(self):
x = tf.reshape(tf.range(24), [2, 3, 4])
# x looks like: [[[ 0 1 2 3]
# [ 4 5 6 7]
# [ 8 9 10 11]]
#
# [[12 13 14 15]
# [16 17 18 19]
# [20 21 22 23]]]
y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
with self.test_session() as sess:
y = sess.run(y)
self.assertAllEqual([[[4, 5, 6, 7],
[8, 9, 10, 11]],
[[12, 13, 14, 15],
[20, 21, 22, 23]]],
y)
def test_gather_topk_beams(self):
x = tf.reshape(tf.range(24), [2, 3, 4])
x_scores = [[0, 1, 1], [1, 0, 1]]
y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
with self.test_session() as sess:
y = sess.run(y)
self.assertAllEqual([[[4, 5, 6, 7],
[8, 9, 10, 11]],
[[12, 13, 14, 15],
[20, 21, 22, 23]]],
y)
if __name__ == "__main__":
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of embedding layer with shared weights."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import model_utils
class EmbeddingSharedWeights(tf.layers.Layer):
"""Calculates input embeddings and pre-softmax linear with shared weights."""
def __init__(self, vocab_size, hidden_size):
super(EmbeddingSharedWeights, self).__init__()
self.vocab_size = vocab_size
self.hidden_size = hidden_size
def build(self, _):
with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
# Create and initialize weights. The random normal initializer was chosen
# randomly, and works well.
self.shared_weights = tf.get_variable(
"weights", [self.vocab_size, self.hidden_size],
initializer=tf.random_normal_initializer(
0., self.hidden_size ** -0.5))
self.built = True
def call(self, x):
"""Get token embeddings of x.
Args:
x: An int64 tensor with shape [batch_size, length]
Returns:
embeddings: float32 tensor with shape [batch_size, length, embedding_size]
padding: float32 tensor with shape [batch_size, length] indicating the
locations of the padding tokens in x.
"""
with tf.name_scope("embedding"):
embeddings = tf.gather(self.shared_weights, x)
# Scale embedding by the sqrt of the hidden size
embeddings *= self.hidden_size ** 0.5
# Create binary array of size [batch_size, length]
# where 1 = padding, 0 = not padding
padding = model_utils.get_padding(x)
# Set all padding embedding values to 0
embeddings *= tf.expand_dims(1 - padding, -1)
return embeddings
def linear(self, x):
"""Computes logits by running x through a linear layer.
Args:
x: A float32 tensor with shape [batch_size, length, hidden_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
with tf.name_scope("presoftmax_linear"):
batch_size = tf.shape(x)[0]
length = tf.shape(x)[1]
x = tf.reshape(x, [-1, self.hidden_size])
logits = tf.matmul(x, self.shared_weights, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size])
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of fully connected network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class FeedFowardNetwork(tf.layers.Layer):
"""Fully connected feedforward network."""
def __init__(self, hidden_size, filter_size, relu_dropout, train):
super(FeedFowardNetwork, self).__init__()
self.hidden_size = hidden_size
self.filter_size = filter_size
self.relu_dropout = relu_dropout
self.train = train
self.filter_dense_layer = tf.layers.Dense(
filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
self.output_dense_layer = tf.layers.Dense(
hidden_size, use_bias=True, name="output_layer")
def call(self, x, padding=None):
"""Return outputs of the feedforward network.
Args:
x: tensor with shape [batch_size, length, hidden_size]
padding: (optional) If set, the padding values are temporarily removed
from x. The padding values are placed back in the output tensor in the
same locations. shape [batch_size, length]
Returns:
Output of the feedforward network.
tensor with shape [batch_size, length, hidden_size]
"""
# Retrieve dynamically known shapes
batch_size = tf.shape(x)[0]
length = tf.shape(x)[1]
if padding is not None:
with tf.name_scope("remove_padding"):
# Flatten padding to [batch_size*length]
pad_mask = tf.reshape(padding, [-1])
nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
# Reshape x to [batch_size*length, hidden_size] to remove padding
x = tf.reshape(x, [-1, self.hidden_size])
x = tf.gather_nd(x, indices=nonpad_ids)
# Reshape x from 2 dimensions to 3 dimensions.
x.set_shape([None, self.hidden_size])
x = tf.expand_dims(x, axis=0)
output = self.filter_dense_layer(x)
if self.train:
output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
output = self.output_dense_layer(output)
if padding is not None:
with tf.name_scope("re_add_padding"):
output = tf.squeeze(output, axis=0)
output = tf.scatter_nd(
indices=nonpad_ids,
updates=output,
shape=[batch_size * length, self.hidden_size]
)
output = tf.reshape(output, [batch_size, length, self.hidden_size])
return output
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines Transformer model parameters."""
class TransformerBaseParams(object):
"""Parameters for the base Transformer model."""
# Input params
batch_size = 2048 # Maximum number of tokens per batch of examples.
max_length = 256 # Maximum number of tokens per example.
# Model params
initializer_gain = 1.0 # Used in trainable variable initialization.
vocab_size = 33708 # Number of tokens defined in the vocabulary file.
hidden_size = 512 # Model dimension in the hidden layers.
num_hidden_layers = 6 # Number of layers in the encoder and decoder stacks.
num_heads = 8 # Number of heads to use in multi-headed attention.
filter_size = 2048 # Inner layer dimensionality in the feedforward network.
# Dropout values (only used when training)
layer_postprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# Training params
label_smoothing = 0.1
learning_rate = 2.0
learning_rate_decay_rate = 1.0
learning_rate_warmup_steps = 16000
# Optimizer params
optimizer_adam_beta1 = 0.9
optimizer_adam_beta2 = 0.997
optimizer_adam_epsilon = 1e-09
# Default prediction params
extra_decode_length = 50
beam_size = 4
alpha = 0.6 # used to calculate length normalization in beam search
class TransformerBigParams(TransformerBaseParams):
"""Parameters for the big Transformer model."""
batch_size = 4096
hidden_size = 1024
filter_size = 4096
num_heads = 16
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer model helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
_NEG_INF = -1e9
def get_position_encoding(
length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
"""Return positional encoding.
Calculates the position encoding as a mix of sine and cosine functions with
geometrically increasing wavelengths.
Defined and formulized in Attention is All You Need, section 3.5.
Args:
length: Sequence length.
hidden_size: Size of the
min_timescale: Minimum scale that will be applied at each position
max_timescale: Maximum scale that will be applied at each position
Returns:
Tensor with shape [length, hidden_size]
"""
position = tf.to_float(tf.range(length))
num_timescales = hidden_size // 2
log_timescale_increment = (
math.log(float(max_timescale) / float(min_timescale)) /
(tf.to_float(num_timescales) - 1))
inv_timescales = min_timescale * tf.exp(
tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
return signal
def get_decoder_self_attention_bias(length):
"""Calculate bias for decoder that maintains model's autoregressive property.
Creates a tensor that masks out locations that correspond to illegal
connections, so prediction at position i cannot draw information from future
positions.
Args:
length: int length of sequences in batch.
Returns:
float tensor of shape [1, 1, length, length]
"""
with tf.name_scope("decoder_self_attention_bias"):
valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
decoder_bias = _NEG_INF * (1.0 - valid_locs)
return decoder_bias
def get_padding(x, padding_value=0):
"""Return float tensor representing the padding values in x.
Args:
x: int tensor with any shape
padding_value: int value that
Returns:
flaot tensor with same shape as x containing values 0 or 1.
0 -> non-padding, 1 -> padding
"""
with tf.name_scope("padding"):
return tf.to_float(tf.equal(x, padding_value))
def get_padding_bias(x):
"""Calculate bias tensor from padding values in tensor.
Bias tensor that is added to the pre-softmax multi-headed attention logits,
which has shape [batch_size, num_heads, length, length]. The tensor is zero at
non-padding locations, and -1e9 (negative infinity) at padding locations.
Args:
x: int tensor with shape [batch_size, length]
Returns:
Attention bias tensor of shape [batch_size, 1, 1, length].
"""
with tf.name_scope("attention_bias"):
padding = get_padding(x)
attention_bias = padding * _NEG_INF
attention_bias = tf.expand_dims(
tf.expand_dims(attention_bias, axis=1), axis=1)
return attention_bias
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Transformer model helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import model_utils
NEG_INF = -1e9
class ModelUtilsTest(tf.test.TestCase):
def test_get_padding(self):
x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
padding = model_utils.get_padding(x, padding_value=0)
with self.test_session() as sess:
padding = sess.run(padding)
self.assertAllEqual([[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]],
padding)
def test_get_padding_bias(self):
x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
bias = model_utils.get_padding_bias(x)
bias_shape = tf.shape(bias)
flattened_bias = tf.reshape(bias, [3, 5])
with self.test_session() as sess:
flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))
self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
[0, 0, NEG_INF, NEG_INF, NEG_INF],
[NEG_INF, 0, 0, NEG_INF, 0]],
flattened_bias)
self.assertAllEqual([3, 1, 1, 5], bias_shape)
def test_get_decoder_self_attention_bias(self):
length = 5
bias = model_utils.get_decoder_self_attention_bias(length)
with self.test_session() as sess:
bias = sess.run(bias)
self.assertAllEqual([[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
[0, 0, NEG_INF, NEG_INF, NEG_INF],
[0, 0, 0, NEG_INF, NEG_INF],
[0, 0, 0, 0, NEG_INF],
[0, 0, 0, 0, 0]]]],
bias)
if __name__ == "__main__":
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines the Transformer model, and its encoder and decoder stacks.
Model paper: https://arxiv.org/pdf/1706.03762.pdf
Transformer model code source: https://github.com/tensorflow/tensor2tensor
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import attention_layer
from official.transformer.model import beam_search
from official.transformer.model import embedding_layer
from official.transformer.model import ffn_layer
from official.transformer.model import model_utils
from official.transformer.utils.tokenizer import EOS_ID
_NEG_INF = -1e9
class Transformer(object):
"""Transformer model for sequence to sequence data.
Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
The Transformer model consists of an encoder and decoder. The input is an int
sequence (or a batch of sequences). The encoder produces a continous
representation, and the decoder uses the encoder output to generate
probabilities for the output sequence.
"""
def __init__(self, params, train):
"""Initialize layers to build Transformer model.
Args:
params: hyperparameter object defining layer sizes, dropout values, etc.
train: boolean indicating whether the model is in training mode. Used to
determine if dropout layers should be added.
"""
self.train = train
self.params = params
self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
params.vocab_size, params.hidden_size)
self.encoder_stack = EncoderStack(params, train)
self.decoder_stack = DecoderStack(params, train)
def __call__(self, inputs, targets=None):
"""Calculate target logits or inferred target sequences.
Args:
inputs: int tensor with shape [batch_size, input_length].
targets: None or int tensor with shape [batch_size, target_length].
Returns:
If targets is defined, then return logits for each word in the target
sequence. float tensor with shape [batch_size, target_length, vocab_size]
If target is none, then generate output sequence one token at a time.
returns a dictionary {
output: [batch_size, decoded length]
score: [batch_size, float]}
"""
# Variance scaling is used here because it seems to work in many problems.
# Other reasonable initializers may also work just as well.
initializer = tf.variance_scaling_initializer(
self.params.initializer_gain, mode="fan_avg", distribution="uniform")
with tf.variable_scope("Transformer", initializer=initializer):
# Calculate attention bias for encoder self-attention and decoder
# multi-headed attention layers.
attention_bias = model_utils.get_padding_bias(inputs)
# Run the inputs through the encoder layer to map the symbol
# representations to continuous representations.
encoder_outputs = self.encode(inputs, attention_bias)
# Generate output sequence if targets is None, or return logits if target
# sequence is known.
if targets is None:
return self.predict(encoder_outputs, attention_bias)
else:
logits = self.decode(targets, encoder_outputs, attention_bias)
return logits
def encode(self, inputs, attention_bias):
"""Generate continuous representation for inputs.
Args:
inputs: int tensor with shape [batch_size, input_length].
attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
Returns:
float tensor with shape [batch_size, input_length, hidden_size]
"""
with tf.name_scope("encode"):
# Prepare inputs to the layer stack by adding positional encodings and
# applying dropout.
embedded_inputs = self.embedding_softmax_layer(inputs)
inputs_padding = model_utils.get_padding(inputs)
with tf.name_scope("add_pos_encoding"):
length = tf.shape(embedded_inputs)[1]
pos_encoding = model_utils.get_position_encoding(
length, self.params.hidden_size)
encoder_inputs = embedded_inputs + pos_encoding
if self.train:
encoder_inputs = tf.nn.dropout(
encoder_inputs, 1 - self.params.layer_postprocess_dropout)
return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def decode(self, targets, encoder_outputs, attention_bias):
"""Generate logits for each value in the target sequence.
Args:
targets: target values for the output sequence.
int tensor with shape [batch_size, target_length]
encoder_outputs: continuous representation of input sequence.
float tensor with shape [batch_size, input_length, hidden_size]
attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
Returns:
float32 tensor with shape [batch_size, target_length, vocab_size]
"""
with tf.name_scope("decode"):
# Prepare inputs to decoder layers by shifting targets, adding positional
# encoding and applying dropout.
decoder_inputs = self.embedding_softmax_layer(targets)
with tf.name_scope("shift_targets"):
# Shift targets to the right, and remove the last element
decoder_inputs = tf.pad(
decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
with tf.name_scope("add_pos_encoding"):
length = tf.shape(decoder_inputs)[1]
decoder_inputs += model_utils.get_position_encoding(
length, self.params.hidden_size)
if self.train:
decoder_inputs = tf.nn.dropout(
decoder_inputs, 1 - self.params.layer_postprocess_dropout)
# Run values
decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
length)
outputs = self.decoder_stack(
decoder_inputs, encoder_outputs, decoder_self_attention_bias,
attention_bias)
logits = self.embedding_softmax_layer.linear(outputs)
return logits
def _get_symbols_to_logits_fn(self, max_decode_length):
"""Returns a decoding function that calculates logits of the next tokens."""
timing_signal = model_utils.get_position_encoding(
max_decode_length + 1, self.params.hidden_size)
decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
max_decode_length)
def symbols_to_logits_fn(ids, i, cache):
"""Generate logits for next potential IDs.
Args:
ids: Current decoded sequences.
int tensor with shape [batch_size * beam_size, i + 1]
i: Loop index
cache: dictionary of values storing the encoder output, encoder-decoder
attention bias, and previous decoder attention values.
Returns:
Tuple of
(logits with shape [batch_size * beam_size, vocab_size],
updated cache values)
"""
# Set decoder input to the last generated IDs
decoder_input = ids[:, -1:]
# Preprocess decoder input by getting embeddings and adding timing signal.
decoder_input = self.embedding_softmax_layer(decoder_input)
decoder_input += timing_signal[i:i + 1]
self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
decoder_outputs = self.decoder_stack(
decoder_input, cache.get("encoder_outputs"), self_attention_bias,
cache.get("encoder_decoder_attention_bias"), cache)
logits = self.embedding_softmax_layer.linear(decoder_outputs)
logits = tf.squeeze(logits, axis=[1])
return logits, cache
return symbols_to_logits_fn
def predict(self, encoder_outputs, encoder_decoder_attention_bias):
"""Return predicted sequence."""
batch_size = tf.shape(encoder_outputs)[0]
input_length = tf.shape(encoder_outputs)[1]
max_decode_length = input_length + self.params.extra_decode_length
symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
# Create initial set of IDs that will be passed into symbols_to_logits_fn.
initial_ids = tf.zeros([batch_size], dtype=tf.int32)
# Create cache storing decoder attention values for each layer.
cache = {
"layer_%d" % layer: {
"k": tf.zeros([batch_size, 0, self.params.hidden_size]),
"v": tf.zeros([batch_size, 0, self.params.hidden_size]),
} for layer in range(self.params.num_hidden_layers)}
# Add encoder output and attention bias to the cache.
cache["encoder_outputs"] = encoder_outputs
cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
# Use beam search to find the top beam_size sequences and scores.
decoded_ids, scores = beam_search.sequence_beam_search(
symbols_to_logits_fn=symbols_to_logits_fn,
initial_ids=initial_ids,
initial_cache=cache,
vocab_size=self.params.vocab_size,
beam_size=self.params.beam_size,
alpha=self.params.alpha,
max_decode_length=max_decode_length,
eos_id=EOS_ID)
# Get the top sequence for each batch element
top_decoded_ids = decoded_ids[:, 0, 1:]
top_scores = scores[:, 0]
return {"outputs": top_decoded_ids, "scores": top_scores}
class LayerNormalization(tf.layers.Layer):
"""Applies layer normalization."""
def __init__(self, hidden_size):
super(LayerNormalization, self).__init__()
self.hidden_size = hidden_size
def build(self, _):
self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
initializer=tf.ones_initializer())
self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
initializer=tf.zeros_initializer())
self.built = True
def call(self, x, epsilon=1e-6):
mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
return norm_x * self.scale + self.bias
class PrePostProcessingWrapper(object):
"""Wrapper class that applies layer pre-processing and post-processing."""
def __init__(self, layer, params, train):
self.layer = layer
self.postprocess_dropout = params.layer_postprocess_dropout
self.train = train
# Create normalization layer
self.layer_norm = LayerNormalization(params.hidden_size)
def __call__(self, x, *args, **kwargs):
# Preprocessing: apply layer normalization
y = self.layer_norm(x)
# Get layer output
y = self.layer(y, *args, **kwargs)
# Postprocessing: apply dropout and residual connection
if self.train:
y = tf.nn.dropout(y, 1 - self.postprocess_dropout)
return x + y
class EncoderStack(tf.layers.Layer):
"""Transformer encoder stack.
The encoder stack is made up of N identical layers. Each layer is composed
of the sublayers:
1. Self-attention layer
2. Feedforward network (which is 2 fully-connected layers)
"""
def __init__(self, params, train):
super(EncoderStack, self).__init__()
self.layers = []
for _ in range(params.num_hidden_layers):
# Create sublayers for each layer.
self_attention_layer = attention_layer.SelfAttention(
params.hidden_size, params.num_heads, params.attention_dropout, train)
feed_forward_network = ffn_layer.FeedFowardNetwork(
params.hidden_size, params.filter_size, params.relu_dropout, train)
self.layers.append([
PrePostProcessingWrapper(self_attention_layer, params, train),
PrePostProcessingWrapper(feed_forward_network, params, train)])
# Create final layer normalization layer.
self.output_normalization = LayerNormalization(params.hidden_size)
def call(self, encoder_inputs, attention_bias, inputs_padding):
"""Return the output of the encoder layer stacks.
Args:
encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
attention_bias: bias for the encoder self-attention layer.
[batch_size, 1, 1, input_length]
inputs_padding: P
Returns:
Output of encoder layer stack.
float32 tensor with shape [batch_size, input_length, hidden_size]
"""
for n, layer in enumerate(self.layers):
# Run inputs through the sublayers.
self_attention_layer = layer[0]
feed_forward_network = layer[1]
with tf.variable_scope("layer_%d" % n):
with tf.variable_scope("self_attention"):
encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
with tf.variable_scope("ffn"):
encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
return self.output_normalization(encoder_inputs)
class DecoderStack(tf.layers.Layer):
"""Transformer decoder stack.
Like the encoder stack, the decoder stack is made up of N identical layers.
Each layer is composed of the sublayers:
1. Self-attention layer
2. Multi-headed attention layer combining encoder outputs with results from
the previous self-attention layer.
3. Feedforward network (2 fully-connected layers)
"""
def __init__(self, params, train):
super(DecoderStack, self).__init__()
self.layers = []
for _ in range(params.num_hidden_layers):
self_attention_layer = attention_layer.SelfAttention(
params.hidden_size, params.num_heads, params.attention_dropout, train)
enc_dec_attention_layer = attention_layer.Attention(
params.hidden_size, params.num_heads, params.attention_dropout, train)
feed_forward_network = ffn_layer.FeedFowardNetwork(
params.hidden_size, params.filter_size, params.relu_dropout, train)
self.layers.append([
PrePostProcessingWrapper(self_attention_layer, params, train),
PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
PrePostProcessingWrapper(feed_forward_network, params, train)])
self.output_normalization = LayerNormalization(params.hidden_size)
def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
attention_bias, cache=None):
"""Return the output of the decoder layer stacks.
Args:
decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
decoder_self_attention_bias: bias for decoder self-attention layer.
[1, 1, target_len, target_length]
attention_bias: bias for encoder-decoder attention layer.
[batch_size, 1, 1, input_length]
cache: (Used for fast decoding) A nested dictionary storing previous
decoder self-attention values. The items are:
{layer_n: {"k": tensor with shape [batch_size, i, key_channels],
"v": tensor with shape [batch_size, i, value_channels]},
...}
Returns:
Output of decoder layer stack.
float32 tensor with shape [batch_size, target_length, hidden_size]
"""
for n, layer in enumerate(self.layers):
self_attention_layer = layer[0]
enc_dec_attention_layer = layer[1]
feed_forward_network = layer[2]
# Run inputs through the sublayers.
layer_name = "layer_%d" % n
layer_cache = cache[layer_name] if cache is not None else None
with tf.variable_scope(layer_name):
with tf.variable_scope("self_attention"):
decoder_inputs = self_attention_layer(
decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
with tf.variable_scope("encdec_attention"):
decoder_inputs = enc_dec_attention_layer(
decoder_inputs, encoder_outputs, attention_bias)
with tf.variable_scope("ffn"):
decoder_inputs = feed_forward_network(decoder_inputs)
return self.output_normalization(decoder_inputs)
This diff is collapsed.
This diff is collapsed.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Creates an estimator to train the Transformer model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import tempfile
# pylint: disable=g-bad-import-order
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer import compute_bleu
from official.transformer import translate
from official.transformer.data_download import VOCAB_FILE
from official.transformer.model import model_params
from official.transformer.model import transformer
from official.transformer.utils import dataset
from official.transformer.utils import metrics
from official.transformer.utils import tokenizer
DEFAULT_TRAIN_EPOCHS = 10
BLEU_DIR = "bleu"
INF = int(1e9)
def model_fn(features, labels, mode, params):
"""Defines how to train, evaluate and predict from the transformer model."""
with tf.variable_scope("model"):
inputs, targets = features, labels
# Create model and get output logits.
model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
output = model(inputs, targets)
# When in prediction mode, the labels/targets is None. The model output
# is the prediction
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(
tf.estimator.ModeKeys.PREDICT,
predictions=output)
logits = output
# Calculate model loss.
xentropy, weights = metrics.padded_cross_entropy_loss(
logits, targets, params.label_smoothing, params.vocab_size)
loss = tf.reduce_sum(xentropy * weights) / tf.reduce_sum(weights)
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, predictions={"predictions": logits},
eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
else:
train_op = get_train_op(loss, params)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
"""Calculate learning rate with linear warmup and rsqrt decay."""
with tf.name_scope("learning_rate"):
warmup_steps = tf.to_float(learning_rate_warmup_steps)
step = tf.to_float(tf.train.get_or_create_global_step())
learning_rate *= (hidden_size ** -0.5)
# Apply linear warmup
learning_rate *= tf.minimum(1.0, step / warmup_steps)
# Apply rsqrt decay
learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
# Save learning rate value to TensorBoard summary.
tf.summary.scalar("learning_rate", learning_rate)
return learning_rate
def get_train_op(loss, params):
"""Generate training operation that updates variables based on loss."""
with tf.variable_scope("get_train_op"):
learning_rate = get_learning_rate(
params.learning_rate, params.hidden_size,
params.learning_rate_warmup_steps)
# Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
# than the TF core Adam optimizer.
optimizer = tf.contrib.opt.LazyAdamOptimizer(
learning_rate,
beta1=params.optimizer_adam_beta1,
beta2=params.optimizer_adam_beta2,
epsilon=params.optimizer_adam_epsilon)
# Calculate and apply gradients using LazyAdamOptimizer.
global_step = tf.train.get_global_step()
tvars = tf.trainable_variables()
gradients = optimizer.compute_gradients(
loss, tvars, colocate_gradients_with_ops=True)
train_op = optimizer.apply_gradients(
gradients, global_step=global_step, name="train")
# Save gradient norm to Tensorboard
tf.summary.scalar("global_norm/gradient_norm",
tf.global_norm(list(zip(*gradients))[0]))
return train_op
def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
"""Translate file and report the cased and uncased bleu scores."""
# Create temporary file to store translation.
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp_filename = tmp.name
translate.translate_file(
estimator, subtokenizer, bleu_source, output_file=tmp_filename,
print_all_translations=False)
# Compute uncased and cased bleu scores.
uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
os.remove(tmp_filename)
return uncased_score, cased_score
def get_global_step(estimator):
"""Return estimator's last checkpoint."""
return int(estimator.latest_checkpoint().split("-")[-1])
def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
"""Calculate and record the BLEU score."""
subtokenizer = tokenizer.Subtokenizer(
os.path.join(FLAGS.data_dir, FLAGS.vocab_file))
uncased_score, cased_score = translate_and_compute_bleu(
estimator, subtokenizer, bleu_source, bleu_ref)
print("Bleu score (uncased):", uncased_score)
print("Bleu score (cased):", cased_score)
summary = tf.Summary(value=[
tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
])
bleu_writer.add_summary(summary, get_global_step(estimator))
bleu_writer.flush()
return uncased_score, cased_score
def train_schedule(
estimator, train_eval_iterations, single_iteration_train_steps=None,
single_iteration_train_epochs=None, bleu_source=None, bleu_ref=None,
bleu_threshold=None):
"""Train and evaluate model, and optionally compute model's BLEU score.
**Step vs. Epoch vs. Iteration**
Steps and epochs are canonical terms used in TensorFlow and general machine
learning. They are used to describe running a single process (train/eval):
- Step refers to running the process through a single or batch of examples.
- Epoch refers to running the process through an entire dataset.
E.g. training a dataset with 100 examples. The dataset is
divided into 20 batches with 5 examples per batch. A single training step
trains the model on one batch. After 20 training steps, the model will have
trained on every batch in the dataset, or, in other words, one epoch.
Meanwhile, iteration is used in this implementation to describe running
multiple processes (training and eval).
- A single iteration:
1. trains the model for a specific number of steps or epochs.
2. evaluates the model.
3. (if source and ref files are provided) compute BLEU score.
This function runs through multiple train+eval+bleu iterations.
Args:
estimator: tf.Estimator containing model to train.
train_eval_iterations: Number of times to repeat the train+eval iteration.
single_iteration_train_steps: Number of steps to train in one iteration.
single_iteration_train_epochs: Number of epochs to train in one iteration.
bleu_source: File containing text to be translated for BLEU calculation.
bleu_ref: File containing reference translations for BLEU calculation.
bleu_threshold: minimum BLEU score before training is stopped.
Raises:
ValueError: if both or none of single_iteration_train_steps and
single_iteration_train_epochs were defined.
"""
# Ensure that exactly one of single_iteration_train_steps and
# single_iteration_train_epochs is defined.
if single_iteration_train_steps is None:
if single_iteration_train_epochs is None:
raise ValueError(
"Exactly one of single_iteration_train_steps or "
"single_iteration_train_epochs must be defined. Both were none.")
else:
if single_iteration_train_epochs is not None:
raise ValueError(
"Exactly one of single_iteration_train_steps or "
"single_iteration_train_epochs must be defined. Both were defined.")
evaluate_bleu = bleu_source is not None and bleu_ref is not None
# Print out training schedule
print("Training schedule:")
if single_iteration_train_epochs is not None:
print("\t1. Train for %d epochs." % single_iteration_train_epochs)
else:
print("\t1. Train for %d steps." % single_iteration_train_steps)
print("\t2. Evaluate model.")
if evaluate_bleu:
print("\t3. Compute BLEU score.")
if bleu_threshold is not None:
print("Repeat above steps until the BLEU score reaches", bleu_threshold)
if not evaluate_bleu or bleu_threshold is None:
print("Repeat above steps %d times." % train_eval_iterations)
if evaluate_bleu:
# Set summary writer to log bleu score.
bleu_writer = tf.summary.FileWriter(
os.path.join(estimator.model_dir, BLEU_DIR))
if bleu_threshold is not None:
# Change loop stopping condition if bleu_threshold is defined.
train_eval_iterations = INF
# Loop training/evaluation/bleu cycles
for i in xrange(train_eval_iterations):
print("Starting iteration", i + 1)
# Train the model for single_iteration_train_steps or until the input fn
# runs out of examples (if single_iteration_train_steps is None).
estimator.train(dataset.train_input_fn, steps=single_iteration_train_steps)
eval_results = estimator.evaluate(dataset.eval_input_fn)
print("Evaluation results (iter %d/%d):" % (i + 1, train_eval_iterations),
eval_results)
if evaluate_bleu:
uncased_score, _ = evaluate_and_log_bleu(
estimator, bleu_writer, bleu_source, bleu_ref)
if bleu_threshold is not None and uncased_score > bleu_threshold:
bleu_writer.close()
break
def main(_):
# Set logging level to INFO to display training progress (logged by the
# estimator)
tf.logging.set_verbosity(tf.logging.INFO)
if FLAGS.params == "base":
params = model_params.TransformerBaseParams
elif FLAGS.params == "big":
params = model_params.TransformerBigParams
else:
raise ValueError("Invalid parameter set defined: %s."
"Expected 'base' or 'big.'" % FLAGS.params)
# Determine training schedule based on flags.
if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
raise ValueError("Both --train_steps and --train_epochs were set. Only one "
"may be defined.")
if FLAGS.train_steps is not None:
train_eval_iterations = FLAGS.train_steps // FLAGS.steps_between_eval
single_iteration_train_steps = FLAGS.steps_between_eval
single_iteration_train_epochs = None
else:
if FLAGS.train_epochs is None:
FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
train_eval_iterations = FLAGS.train_epochs // FLAGS.epochs_between_eval
single_iteration_train_steps = None
single_iteration_train_epochs = FLAGS.epochs_between_eval
# Make sure that the BLEU source and ref files if set
if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
if not tf.gfile.Exists(FLAGS.bleu_source):
raise ValueError("BLEU source file %s does not exist" % FLAGS.bleu_source)
if not tf.gfile.Exists(FLAGS.bleu_ref):
raise ValueError("BLEU source file %s does not exist" % FLAGS.bleu_ref)
# Add flag-defined parameters to params object
params.data_dir = FLAGS.data_dir
params.num_cpu_cores = FLAGS.num_cpu_cores
params.epochs_between_eval = FLAGS.epochs_between_eval
params.repeat_dataset = single_iteration_train_epochs
estimator = tf.estimator.Estimator(
model_fn=model_fn, model_dir=FLAGS.model_dir, params=params)
train_schedule(
estimator, train_eval_iterations, single_iteration_train_steps,
single_iteration_train_epochs, FLAGS.bleu_source, FLAGS.bleu_ref,
FLAGS.bleu_threshold)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_dir", "-dd", type=str, default="/tmp/translate_ende",
help="[default: %(default)s] Directory containing training and "
"evaluation data, and vocab file used for encoding.",
metavar="<DD>")
parser.add_argument(
"--vocab_file", "-vf", type=str, default=VOCAB_FILE,
help="[default: %(default)s] Name of vocabulary file.",
metavar="<vf>")
parser.add_argument(
"--model_dir", "-md", type=str, default="/tmp/transformer_model",
help="[default: %(default)s] Directory to save Transformer model "
"training checkpoints",
metavar="<MD>")
parser.add_argument(
"--params", "-p", type=str, default="big", choices=["base", "big"],
help="[default: %(default)s] Parameter set to use when creating and "
"training the model.",
metavar="<P>")
parser.add_argument(
"--num_cpu_cores", "-nc", type=int, default=4,
help="[default: %(default)s] Number of CPU cores to use in the input "
"pipeline.",
metavar="<NC>")
# Flags for training with epochs. (default)
parser.add_argument(
"--train_epochs", "-te", type=int, default=None,
help="The number of epochs used to train. If both --train_epochs and "
"--train_steps are not set, the model will train for %d epochs." %
DEFAULT_TRAIN_EPOCHS,
metavar="<TE>")
parser.add_argument(
"--epochs_between_eval", "-ebe", type=int, default=1,
help="[default: %(default)s] The number of training epochs to run "
"between evaluations.",
metavar="<TE>")
# Flags for training with steps (may be used for debugging)
parser.add_argument(
"--train_steps", "-ts", type=int, default=None,
help="Total number of training steps. If both --train_epochs and "
"--train_steps are not set, the model will train for %d epochs." %
DEFAULT_TRAIN_EPOCHS,
metavar="<TS>")
parser.add_argument(
"--steps_between_eval", "-sbe", type=int, default=1000,
help="[default: %(default)s] Number of training steps to run between "
"evaluations.",
metavar="<SBE>")
# BLEU score computation
parser.add_argument(
"--bleu_source", "-bs", type=str, default=None,
help="Path to source file containing text translate when calculating the "
"official BLEU score. Both --bleu_source and --bleu_ref must be "
"set. The BLEU score will be calculated during model evaluation.",
metavar="<BS>")
parser.add_argument(
"--bleu_ref", "-br", type=str, default=None,
help="Path to file containing the reference translation for calculating "
"the official BLEU score. Both --bleu_source and --bleu_ref must be "
"set. The BLEU score will be calculated during model evaluation.",
metavar="<BR>")
parser.add_argument(
"--bleu_threshold", "-bt", type=float, default=None,
help="Stop training when the uncased BLEU score reaches this value. "
"Setting this overrides the total number of steps or epochs set by "
"--train_steps or --train_epochs.",
metavar="<BT>")
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Translate text or files using trained transformer model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
# pylint: disable=g-bad-import-order
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer.data_download import VOCAB_FILE
from official.transformer.model import model_params
from official.transformer.utils import tokenizer
_DECODE_BATCH_SIZE = 32
_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6
def _get_sorted_inputs(filename):
"""Read and sort lines from the file sorted by decreasing length.
Args:
filename: String name of file to read inputs from.
Returns:
Sorted list of inputs, and dictionary mapping original index->sorted index
of each element.
"""
with tf.gfile.Open(filename) as f:
records = f.read().split("\n")
inputs = [record.strip() for record in records]
if not inputs[-1]:
inputs.pop()
input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
sorted_inputs = []
sorted_keys = {}
for i, (index, _) in enumerate(sorted_input_lens):
sorted_inputs.append(inputs[index])
sorted_keys[index] = i
return sorted_inputs, sorted_keys
def _encode_and_add_eos(line, subtokenizer):
"""Encode line with subtokenizer, and add EOS id to the end."""
return subtokenizer.encode(line) + [tokenizer.EOS_ID]
def _trim_and_decode(ids, subtokenizer):
"""Trim EOS and PAD tokens from ids, and decode to return a string."""
try:
index = list(ids).index(tokenizer.EOS_ID)
return subtokenizer.decode(ids[:index])
except ValueError: # No EOS found in sequence
return subtokenizer.decode(ids)
def translate_file(
estimator, subtokenizer, input_file, output_file=None,
print_all_translations=True):
"""Translate lines in file, and save to output file if specified.
Args:
estimator: tf.Estimator used to generate the translations.
subtokenizer: Subtokenizer object for encoding and decoding source and
translated lines.
input_file: file containing lines to translate
output_file: file that stores the generated translations.
print_all_translations: If true, all translations are printed to stdout.
Raises:
ValueError: if output file is invalid.
"""
batch_size = _DECODE_BATCH_SIZE
# Read and sort inputs by length. Keep dictionary (original index-->new index
# in sorted list) to write translations in the original order.
sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1
def input_generator():
"""Yield encoded strings from sorted_inputs."""
for i, line in enumerate(sorted_inputs):
if i % batch_size == 0:
batch_num = (i // batch_size) + 1
print("Decoding batch %d out of %d." % (batch_num, num_decode_batches))
yield _encode_and_add_eos(line, subtokenizer)
def input_fn():
"""Created batched dataset of encoded inputs."""
ds = tf.data.Dataset.from_generator(
input_generator, tf.int64, tf.TensorShape([None]))
ds = ds.padded_batch(batch_size, [None])
return ds
translations = []
for i, prediction in enumerate(estimator.predict(input_fn)):
translation = _trim_and_decode(prediction["outputs"], subtokenizer)
translations.append(translation)
if print_all_translations:
print("Translating:")
print("\tInput: %s" % sorted_inputs[i])
print("\tOutput: %s\n" % translation)
print("=" * 100)
# Write translations in the order they appeared in the original file.
if output_file is not None:
if tf.gfile.IsDirectory(output_file):
raise ValueError("File output is a directory, will not save outputs to "
"file.")
tf.logging.info("Writing to file %s" % output_file)
with tf.gfile.Open(output_file, "w") as f:
for index in xrange(len(sorted_keys)):
f.write("%s\n" % translations[sorted_keys[index]])
def translate_text(estimator, subtokenizer, txt):
"""Translate a single string."""
encoded_txt = _encode_and_add_eos(txt, subtokenizer)
def input_fn():
ds = tf.data.Dataset.from_tensors(encoded_txt)
ds = ds.batch(_DECODE_BATCH_SIZE)
return ds
predictions = estimator.predict(input_fn)
translation = next(predictions)["outputs"]
translation = _trim_and_decode(translation, subtokenizer)
print("Translation of \"%s\": \"%s\"" % (txt, translation))
def main(unused_argv):
from official.transformer import transformer_main
tf.logging.set_verbosity(tf.logging.INFO)
if FLAGS.text is None and FLAGS.file is None:
tf.logging.warn("Nothing to translate. Make sure to call this script using "
"flags --text or --file.")
return
subtokenizer = tokenizer.Subtokenizer(
os.path.join(FLAGS.data_dir, FLAGS.vocab_file))
if FLAGS.params == "base":
params = model_params.TransformerBaseParams
elif FLAGS.params == "big":
params = model_params.TransformerBigParams
else:
raise ValueError("Invalid parameter set defined: %s."
"Expected 'base' or 'big.'" % FLAGS.params)
# Set up estimator and params
params.beam_size = _BEAM_SIZE
params.alpha = _ALPHA
params.extra_decode_length = _EXTRA_DECODE_LENGTH
params.batch_size = _DECODE_BATCH_SIZE
estimator = tf.estimator.Estimator(
model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
params=params)
if FLAGS.text is not None:
tf.logging.info("Translating text: %s" % FLAGS.text)
translate_text(estimator, subtokenizer, FLAGS.text)
if FLAGS.file is not None:
input_file = os.path.abspath(FLAGS.file)
tf.logging.info("Translating file: %s" % input_file)
if not tf.gfile.Exists(FLAGS.file):
raise ValueError("File does not exist: %s" % input_file)
output_file = None
if FLAGS.file_out is not None:
output_file = os.path.abspath(FLAGS.file_out)
tf.logging.info("File output specified: %s" % output_file)
translate_file(estimator, subtokenizer, input_file, output_file)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Model arguments
parser.add_argument(
"--data_dir", "-dd", type=str, default="/tmp/data/translate_ende",
help="[default: %(default)s] Directory where vocab file is stored.",
metavar="<DD>")
parser.add_argument(
"--vocab_file", "-vf", type=str, default=VOCAB_FILE,
help="[default: %(default)s] Name of vocabulary file.",
metavar="<vf>")
parser.add_argument(
"--model_dir", "-md", type=str, default="/tmp/transformer_model",
help="[default: %(default)s] Directory containing Transformer model "
"checkpoints.",
metavar="<MD>")
parser.add_argument(
"--params", "-p", type=str, default="big", choices=["base", "big"],
help="[default: %(default)s] Parameter used for trained model.",
metavar="<P>")
# Flags for specifying text/file to be translated.
parser.add_argument(
"--text", "-t", type=str, default=None,
help="[default: %(default)s] Text to translate. Output will be printed "
"to console.",
metavar="<T>")
parser.add_argument(
"--file", "-f", type=str, default=None,
help="[default: %(default)s] File containing text to translate. "
"Translation will be printed to console and, if --file_out is "
"provided, saved to an output file.",
metavar="<F>")
parser.add_argument(
"--file_out", "-fo", type=str, default=None,
help="[default: %(default)s] If --file flag is specified, save "
"translation to this file.",
metavar="<FO>")
FLAGS, unparsed = parser.parse_known_args()
main(sys.argv)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Subtokenizer and string helper methods."""
import collections
import tempfile
import unittest
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.utils import tokenizer
class SubtokenizerTest(unittest.TestCase):
def _init_subtokenizer(self, vocab_list):
temp_file = tempfile.NamedTemporaryFile(delete=False)
with tf.gfile.Open(temp_file.name, 'w') as w:
for subtoken in vocab_list:
w.write("'%s'" % subtoken)
w.write("\n")
return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
def test_encode(self):
vocab_list = ["123_", "test", "ing_"]
subtokenizer = self._init_subtokenizer(vocab_list)
s = "testing 123"
encoded_list = subtokenizer.encode(s)
self.assertEqual([1, 2, 0], encoded_list)
def test_decode(self):
vocab_list = ["123_", "test", "ing_"]
subtokenizer = self._init_subtokenizer(vocab_list)
encoded_list = [1, 2, 0] # testing 123
decoded_str = subtokenizer.decode(encoded_list)
self.assertEqual("testing 123", decoded_str)
def test_subtoken_ids_to_tokens(self):
vocab_list = ["123_", "test", "ing_"]
subtokenizer = self._init_subtokenizer(vocab_list)
encoded_list = [1, 2, 0] # testing 123
token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
self.assertEqual([u"testing", u"123"], token_list)
class StringHelperTest(unittest.TestCase):
def test_split_string_to_tokens(self):
text = "test? testing 123."
tokens = tokenizer._split_string_to_tokens(text)
self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
def test_join_tokens_to_string(self):
tokens = ["test", "? ", "testing", "123", "."]
s = tokenizer._join_tokens_to_string(tokens)
self.assertEqual("test? testing 123.", s)
def test_escape_token(self):
token = u"abc_\\4"
alphabet = set("abc_\\u;")
escaped_token = tokenizer._escape_token(token, alphabet)
self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
def test_unescape_token(self):
escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
unescaped_token = tokenizer._unescape_token(escaped_token)
self.assertEqual(
"Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
def test_list_to_index_dict(self):
lst = ["test", "strings"]
d = tokenizer._list_to_index_dict(lst)
self.assertDictEqual({"test": 0, "strings": 1}, d)
def test_split_token_to_subtokens(self):
token = "abc"
subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
max_subtoken_length = 2
subtokens = tokenizer._split_token_to_subtokens(
token, subtoken_dict, max_subtoken_length)
self.assertEqual(["ab", "c"], subtokens)
def test_generate_alphabet_dict(self):
s = ["testing", "123"]
reserved_tokens = ["???"]
alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
self.assertIn("?", alphabet)
self.assertIn("t", alphabet)
self.assertIn("e", alphabet)
self.assertIn("s", alphabet)
self.assertIn("i", alphabet)
self.assertIn("n", alphabet)
self.assertIn("g", alphabet)
self.assertIn("1", alphabet)
self.assertIn("2", alphabet)
self.assertIn("3", alphabet)
def test_count_and_gen_subtokens(self):
token_counts = {"abc": 5}
alphabet = set("abc_")
subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
max_subtoken_length = 2
subtoken_counts = tokenizer._count_and_gen_subtokens(
token_counts, alphabet, subtoken_dict, max_subtoken_length)
self.assertIsInstance(subtoken_counts, collections.defaultdict)
self.assertDictEqual(
{"a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
"abc": 5, "bc_": 5, "abc_": 5}, subtoken_counts)
def test_filter_and_bucket_subtokens(self):
subtoken_counts = collections.defaultdict(
int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
min_count = 3
subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
subtoken_counts, min_count)
self.assertEqual(len(subtoken_buckets[0]), 0)
self.assertEqual(set("b"), subtoken_buckets[1])
self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
self.assertEqual(len(subtoken_buckets[3]), 0)
self.assertEqual(set(["abbc"]), subtoken_buckets[4])
def test_gen_new_subtoken_list(self):
subtoken_counts = collections.defaultdict(
int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
min_count = 5
alphabet = set("translate")
reserved_tokens = ["reserved", "tokens"]
subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
subtoken_counts, min_count, alphabet, reserved_tokens)
# Check that "tra" isn"t in the list (its count should be decremented to 2,
# so it should not be added to the canddiate list).
self.assertNotIn("tra", subtoken_list)
self.assertIn("tr", subtoken_list)
self.assertIn("t", subtoken_list)
self.assertEqual(len("translate"), max_token_length)
def test_generate_subtokens(self):
token_counts = {"ab": 1, "bc": 3, "abc": 5}
alphabet = set("abc_")
min_count = 100
num_iterations = 1
reserved_tokens = ["reserved", "tokens"]
vocab_list = tokenizer._generate_subtokens(
token_counts, alphabet, min_count, num_iterations, reserved_tokens)
# Check that reserved tokens are at the front of the list
self.assertEqual(vocab_list[:2], reserved_tokens)
# Check that each character in alphabet is in the vocab list
for c in alphabet:
self.assertIn(c, vocab_list)
if __name__ == "__main__":
unittest.main()
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment