Commit 9dafea91 authored by sunxx1's avatar sunxx1
Browse files

Merge branch 'qianyj_tf' into 'main'

update tf code

See merge request dcutoolkit/deeplearing/dlexamples_new!35
parents 92a2ca36 a4146470
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download and preprocess WMT17 ende training and evaluation datasets."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import tarfile
# pylint: disable=g-bad-import-order
import six
from six.moves import urllib
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
# Data sources for training/evaluating the transformer translation model.
# If any of the training sources are changed, then either:
#   1) use the flag `--search` to find the best min count or
#   2) update the _TRAIN_DATA_MIN_COUNT constant.
# min_count is the minimum number of times a token must appear in the data
# before it is added to the vocabulary. "Best min count" refers to the value
# that generates a vocabulary set that is closest in size to _TARGET_VOCAB_SIZE.
#
# Each entry gives the "url" of a compressed archive and the names of the
# "input" (.en) and "target" (.de) files that are looked up once the archive
# has been extracted.
_TRAIN_DATA_SOURCES = [
    {
        "url": "http://data.statmt.org/wmt17/translation-task/"
               "training-parallel-nc-v12.tgz",
        "input": "news-commentary-v12.de-en.en",
        "target": "news-commentary-v12.de-en.de",
    },
    {
        "url": "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
        "input": "commoncrawl.de-en.en",
        "target": "commoncrawl.de-en.de",
    },
    {
        "url": "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
        "input": "europarl-v7.de-en.en",
        "target": "europarl-v7.de-en.de",
    },
]

# Use pre-defined minimum count to generate subtoken vocabulary.
# This value is passed as min_count unless the `--search` flag is set.
_TRAIN_DATA_MIN_COUNT = 6

# Same entry layout as _TRAIN_DATA_SOURCES, for the evaluation set.
_EVAL_DATA_SOURCES = [
    {
        "url": "http://data.statmt.org/wmt17/translation-task/dev.tgz",
        "input": "newstest2013.en",
        "target": "newstest2013.de",
    }
]

# Vocabulary constants
_TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
_TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
# The vocabulary file name embeds the target size, i.e. "vocab.ende.32768".
VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE

# Strings to include in the generated files.
_PREFIX = "wmt32k"
_TRAIN_TAG = "train"
_EVAL_TAG = "dev"  # Following WMT and Tensor2Tensor conventions, in which the
                   # evaluation datasets are tagged as "dev" for development.

# Number of files to split train and evaluation data
_TRAIN_SHARDS = 100
_EVAL_SHARDS = 1
def find_file(path, filename, max_depth=5):
  """Search `path` and its subdirectories for a file named `filename`.

  Args:
    path: string directory at which the search starts.
    filename: base name of the file to look for.
    max_depth: once a visited directory's path relative to `path` contains
      more than this many separators, its subdirectories are not searched.

  Returns:
    Full path of the first match found, or None if there is no match.
  """
  # Number of leading characters to drop to get a path relative to `path`.
  prefix_len = len(path) + 1
  for current_dir, subdirs, names in os.walk(path):
    if filename in names:
      return os.path.join(current_dir, filename)
    relative_dir = current_dir[prefix_len:]
    if relative_dir.count(os.sep) > max_depth:
      # Emptying the list in place stops os.walk from descending further.
      subdirs[:] = []
  return None
###############################################################################
# Download and extraction functions
###############################################################################
def get_raw_files(raw_dir, data_source):
  """Collect the raw input/target files for every source, fetching as needed.

  Args:
    raw_dir: string directory to store raw files
    data_source: sequence of dictionaries, each with
      {"url": url of compressed dataset containing input and target files
       "input": file with data in input language
       "target": file with data in target language}

  Returns:
    dictionary with
      {"inputs": list of files containing data in input language
       "targets": list of files containing corresponding data in target language
      }
    The two lists are index-aligned: entry i of each comes from source i.
  """
  inputs, targets = [], []
  for source in data_source:
    input_file, target_file = download_and_extract(
        raw_dir, source["url"], source["input"], source["target"])
    inputs.append(input_file)
    targets.append(target_file)
  return {"inputs": inputs, "targets": targets}
def download_report_hook(count, block_size, total_size):
  """Report hook for download progress.

  Prints the progress on a single console line that is overwritten on every
  call (the text starts and ends with a carriage return).

  Args:
    count: current block number
    block_size: block size
    total_size: total size. May be zero or negative when the server does not
      report a size; the number of bytes transferred is printed in that case.
  """
  transferred = count * block_size
  if total_size > 0:
    # The final block usually overshoots total_size, so cap the value at 100.
    percent = min(100, int(transferred * 100 / total_size))
    print("\r%d%%" % percent + " completed", end="\r")
  else:
    # Unknown size: a percentage cannot be computed (and would divide by zero).
    print("\r%d bytes" % transferred + " completed", end="\r")
def download_from_url(path, url):
  """Fetch the file at `url` into `path` unless it is already there.

  Args:
    path: string directory where file will be downloaded
    url: string url

  Returns:
    Full path to downloaded file
  """
  basename = url.rsplit("/", 1)[-1]
  found_file = find_file(path, basename, max_depth=0)
  if found_file is not None:
    tf.logging.info("Already downloaded: %s (at %s)." % (url, found_file))
    return found_file

  destination = os.path.join(path, basename)
  tf.logging.info("Downloading from %s to %s." % (url, destination))
  # Download under a temporary name so that an interrupted transfer is never
  # mistaken for a finished file on the next run.
  partial_path, _ = urllib.request.urlretrieve(
      url, destination + ".incomplete", reporthook=download_report_hook)
  # Print newline to clear the carriage return from the download progress.
  print()
  tf.gfile.Rename(partial_path, destination)
  return destination
def download_and_extract(path, url, input_filename, target_filename):
  """Extract files from downloaded compressed archive file.

  If both requested files are already present under `path`, nothing is
  downloaded or extracted.

  Args:
    path: string directory where the files will be downloaded
    url: url containing the compressed input and target files
    input_filename: name of file containing data in source language
    target_filename: name of file containing data in target language

  Returns:
    Full paths to extracted input and target files.

  Raises:
    OSError: if the download/extraction fails, or if the archive contains a
      member whose name would place it outside of `path`.
  """
  # Check if extracted files already exist in path
  input_file = find_file(path, input_filename)
  target_file = find_file(path, target_filename)
  if input_file and target_file:
    tf.logging.info("Already downloaded and extracted %s." % url)
    return input_file, target_file

  # Download archive file if it doesn't already exist.
  compressed_file = download_from_url(path, url)

  # Extract compressed files
  tf.logging.info("Extracting %s." % compressed_file)
  extract_root = os.path.realpath(path)
  # A trailing separator keeps "/data/raw-evil" from matching "/data/raw".
  extract_prefix = os.path.join(extract_root, "")
  with tarfile.open(compressed_file, "r:gz") as corpus_tar:
    # The archive comes from the network, so reject member names (absolute
    # paths or ".." components) that would be written outside of `path`.
    # Only member names are checked here, not link targets.
    for member in corpus_tar.getmembers():
      destination = os.path.realpath(os.path.join(extract_root, member.name))
      if (destination != extract_root and
          not destination.startswith(extract_prefix)):
        raise OSError(
            "Refusing to extract %s: member %s would be written outside of %s"
            % (compressed_file, member.name, path))
    corpus_tar.extractall(path)

  # Return filepaths of the requested files.
  input_file = find_file(path, input_filename)
  target_file = find_file(path, target_filename)

  if input_file and target_file:
    return input_file, target_file

  raise OSError("Download/extraction failed for url %s to path %s" %
                (url, path))
def txt_line_iterator(path):
  """Yield every line of the file at `path` with surrounding whitespace removed."""
  with tf.gfile.Open(path) as reader:
    for raw_line in reader:
      yield raw_line.strip()
def compile_files(raw_dir, raw_files, tag):
  """Concatenate the raw files into one file per language.

  Args:
    raw_dir: Directory containing downloaded raw files.
    raw_files: Dict containing filenames of input and target data.
      {"inputs": list of files containing data in input language
       "targets": list of files containing corresponding data in target language
      }
    tag: String to append to the compiled filename.

  Returns:
    Full path of compiled input and target files.
  """
  tf.logging.info("Compiling files with tag %s." % tag)
  basename = "%s-%s" % (_PREFIX, tag)
  input_compiled_file = os.path.join(raw_dir, basename + ".lang1")
  target_compiled_file = os.path.join(raw_dir, basename + ".lang2")

  with tf.gfile.Open(input_compiled_file, mode="w") as input_writer, \
      tf.gfile.Open(target_compiled_file, mode="w") as target_writer:
    # Inputs and targets are index-aligned, so each pair is appended in step.
    for idx, input_file in enumerate(raw_files["inputs"]):
      target_file = raw_files["targets"][idx]
      tf.logging.info("Reading files %s and %s." % (input_file, target_file))
      write_file(input_writer, input_file)
      write_file(target_writer, target_file)
  return input_compiled_file, target_compiled_file
def write_file(writer, filename):
  """Copy each stripped line of `filename` to `writer`, one per output line."""
  for text in txt_line_iterator(filename):
    writer.write(text)
    writer.write("\n")
###############################################################################
# Data preprocessing
###############################################################################
def encode_and_save_files(
    subtokenizer, data_dir, raw_files, tag, total_shards):
  """Save data from files as encoded Examples in TFrecord format.

  Does nothing if every shard file already exists. Otherwise the shards are
  written under temporary ".incomplete" names and renamed only after all of
  them have been written and closed.

  Args:
    subtokenizer: Subtokenizer object that will be used to encode the strings.
    data_dir: The directory in which to write the examples
    raw_files: A tuple of (input, target) data files. Each line in the input and
      the corresponding line in target file will be saved in a tf.Example.
    tag: String that will be added onto the file names.
    total_shards: Number of files to divide the data into.

  Returns:
    List of all files produced.
  """
  # Create a file for each shard.
  filepaths = [shard_filename(data_dir, tag, n + 1, total_shards)
               for n in range(total_shards)]

  if all_exist(filepaths):
    tf.logging.info("Files with tag %s already exist." % tag)
    return filepaths

  tf.logging.info("Saving files with tag %s." % tag)
  input_file = raw_files[0]
  target_file = raw_files[1]

  tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
  writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
  # Tracked separately from the loop index so empty input reports 0, not 1.
  num_examples = 0
  try:
    line_pairs = zip(
        txt_line_iterator(input_file), txt_line_iterator(target_file))
    for counter, (input_line, target_line) in enumerate(line_pairs):
      if counter > 0 and counter % 100000 == 0:
        tf.logging.info("\tSaving case %d." % counter)
      example = dict_to_example(
          {"inputs": subtokenizer.encode(input_line, add_eos=True),
           "targets": subtokenizer.encode(target_line, add_eos=True)})
      # Write examples to each shard in round robin order.
      writers[counter % total_shards].write(example.SerializeToString())
      num_examples = counter + 1
  finally:
    # Close the writers even when encoding fails, so no handles are leaked.
    for writer in writers:
      writer.close()

  # Reached only on success; a failure above leaves the ".incomplete" files.
  for tmp_name, final_name in zip(tmp_filepaths, filepaths):
    tf.gfile.Rename(tmp_name, final_name)

  tf.logging.info("Saved %d Examples", num_examples)
  return filepaths
def shard_filename(path, tag, shard_num, total_shards):
  """Build the path of one data shard; both numbers are zero-padded to 5 digits."""
  basename = "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards)
  return os.path.join(path, basename)
def shuffle_records(fname):
  """Randomly reorder the records stored in a single TFRecord file.

  The file is renamed out of the way, all of its records are read into memory
  and shuffled, and the result is written back under the original name before
  the renamed copy is removed.

  Args:
    fname: path of the record file to shuffle in place.
  """
  tf.logging.info("Shuffling records in file %s" % fname)

  # Rename file prior to shuffling
  unshuffled_fname = fname + ".unshuffled"
  tf.gfile.Rename(fname, unshuffled_fname)

  records = []
  for num_read, record in enumerate(
      tf.compat.v1.io.tf_record_iterator(unshuffled_fname), 1):
    records.append(record)
    if num_read % 100000 == 0:
      tf.logging.info("\tRead: %d", num_read)

  random.shuffle(records)

  # Write shuffled records to original file name
  with tf.python_io.TFRecordWriter(fname) as writer:
    for position, record in enumerate(records):
      writer.write(record)
      if position > 0 and position % 100000 == 0:
        tf.logging.info("\tWriting record: %d" % position)

  tf.gfile.Remove(unshuffled_fname)
def dict_to_example(dictionary):
  """Build a tf.Example holding one int64-list feature per dictionary entry."""
  feature_map = {
      key: tf.train.Feature(int64_list=tf.train.Int64List(value=ids))
      for key, ids in six.iteritems(dictionary)
  }
  return tf.train.Example(features=tf.train.Features(feature=feature_map))
def all_exist(filepaths):
  """Return True when every path in `filepaths` exists (True for an empty list)."""
  return all(tf.gfile.Exists(fname) for fname in filepaths)
def make_dir(path):
  """Create the directory `path` if nothing exists there yet."""
  if tf.gfile.Exists(path):
    return
  tf.logging.info("Creating directory %s" % path)
  tf.gfile.MakeDirs(path)
def main(unused_argv):
  """Download, compile, encode and shuffle the Transformer train/eval data.

  Reads the raw and output directories and the `search` option from the
  module-level FLAGS object.

  Args:
    unused_argv: positional command-line arguments; ignored.
  """
  raw_dir = FLAGS.raw_dir
  data_dir = FLAGS.data_dir
  make_dir(raw_dir)
  make_dir(data_dir)

  # Get paths of download/extracted training and evaluation files.
  tf.logging.info("Step 1/4: Downloading data from source")
  train_files = get_raw_files(raw_dir, _TRAIN_DATA_SOURCES)
  eval_files = get_raw_files(raw_dir, _EVAL_DATA_SOURCES)

  # Create subtokenizer based on the training files.
  tf.logging.info("Step 2/4: Creating subtokenizer and building vocabulary")
  vocab_sources = train_files["inputs"] + train_files["targets"]
  vocab_file = os.path.join(data_dir, VOCAB_FILE)
  # min_count=None is passed when --search is set, instead of the fixed value.
  if FLAGS.search:
    min_count = None
  else:
    min_count = _TRAIN_DATA_MIN_COUNT
  subtokenizer = tokenizer.Subtokenizer.init_from_files(
      vocab_file, vocab_sources, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD,
      min_count=min_count)

  tf.logging.info("Step 3/4: Compiling training and evaluation data")
  compiled_train_files = compile_files(raw_dir, train_files, _TRAIN_TAG)
  compiled_eval_files = compile_files(raw_dir, eval_files, _EVAL_TAG)

  # Tokenize and save data as Examples in the TFRecord format.
  tf.logging.info("Step 4/4: Preprocessing and saving data")
  train_shards = encode_and_save_files(
      subtokenizer, data_dir, compiled_train_files, _TRAIN_TAG, _TRAIN_SHARDS)
  encode_and_save_files(
      subtokenizer, data_dir, compiled_eval_files, _EVAL_TAG, _EVAL_SHARDS)

  # Only the training shards are shuffled.
  for shard_path in train_shards:
    shuffle_records(shard_path)
def define_data_download_flags():
  """Add flags specifying data download arguments.

  Registers three flags:
    data_dir (short name dd): directory where the dataset is saved.
    raw_dir (short name rd): directory where the raw data is downloaded and
      extracted.
    search: boolean option controlling the vocabulary search.
  """
  flags.DEFINE_string(
      name="data_dir", short_name="dd", default="/tmp/translate_ende",
      help=flags_core.help_wrap(
          "Directory for where the translate_ende_wmt32k dataset is saved."))
  flags.DEFINE_string(
      name="raw_dir", short_name="rd", default="/tmp/translate_ende_raw",
      help=flags_core.help_wrap(
          "Path where the raw data will be downloaded and extracted."))
  # The trailing space after "size" matters: the two literals are joined
  # without any separator.
  flags.DEFINE_bool(
      name="search", default=False,
      help=flags_core.help_wrap(
          "If set, use binary search to find the vocabulary set with size "
          "closest to the target size (%d)." % _TARGET_VOCAB_SIZE))
# Script entry point: configure logging, register the flags, then hand control
# to absl, which calls main().
if __name__ == "__main__":
  # Make the INFO-level progress messages logged by this module visible.
  tf.logging.set_verbosity(tf.logging.INFO)
  define_data_download_flags()
  # main() reads FLAGS as a module global; it is only bound on this script
  # path, after the flags have been defined.
  FLAGS = flags.FLAGS
  absl_app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of multiheaded attention and self-attention layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Attention(tf.layers.Layer):
  """Multi-headed attention layer."""

  def __init__(self, hidden_size, num_heads, attention_dropout, train):
    """Create the projection layers used by the attention computation.

    Args:
      hidden_size: size of the last dimension of the inputs and the output;
        must be a multiple of num_heads.
      num_heads: number of attention heads.
      attention_dropout: dropout rate applied to the attention weights.
      train: when true, dropout is applied to the attention weights.

    Raises:
      ValueError: if hidden_size is not evenly divisible by num_heads.
    """
    if hidden_size % num_heads:
      raise ValueError("Hidden size must be evenly divisible by the number of "
                       "heads.")

    super(Attention, self).__init__()
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.attention_dropout = attention_dropout
    self.train = train

    # Layers for linearly projecting the queries, keys, and values.
    self.q_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="q")
    self.k_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="k")
    self.v_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="v")
    # Projection applied after the heads have been recombined.
    self.output_dense_layer = tf.layers.Dense(
        hidden_size, use_bias=False, name="output_transform")

  def split_heads(self, x):
    """Split x into different heads, and transpose the resulting value.

    The tensor is transposed to ensure the inner dimensions hold the correct
    values during the matrix multiplication.

    Args:
      x: A tensor with shape [batch_size, length, hidden_size]

    Returns:
      A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
    """
    with tf.name_scope("split_heads"):
      batch_size = tf.shape(x)[0]
      length = tf.shape(x)[1]

      # Size of the last dimension once it is divided between the heads.
      head_depth = self.hidden_size // self.num_heads

      # [batch, length, hidden] --> [batch, length, num_heads, depth]
      per_head = tf.reshape(
          x, [batch_size, length, self.num_heads, head_depth])

      # --> [batch, num_heads, length, depth]
      return tf.transpose(per_head, [0, 2, 1, 3])

  def combine_heads(self, x):
    """Combine tensor that has been split.

    Args:
      x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]

    Returns:
      A tensor with shape [batch_size, length, hidden_size]
    """
    with tf.name_scope("combine_heads"):
      batch_size = tf.shape(x)[0]
      length = tf.shape(x)[2]
      # --> [batch, length, num_heads, depth]
      heads_last = tf.transpose(x, [0, 2, 1, 3])
      return tf.reshape(heads_last, [batch_size, length, self.hidden_size])

  def call(self, x, y, bias, cache=None):
    """Apply attention mechanism to x and y.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]
      y: a tensor with shape [batch_size, length_y, hidden_size]
      bias: attention bias that will be added to the result of the dot product.
      cache: (Used during prediction) dictionary with tensors containing results
        of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length. The dictionary is updated in
        place with the extended keys and values.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """
    # Queries come from x, keys and values from y, each through its own learned
    # projection. Multi-head attention uses multiple queries, keys, and values
    # rather than regular attention (which uses a single q, k, v).
    query = self.q_dense_layer(x)
    key = self.k_dense_layer(y)
    value = self.v_dense_layer(y)

    if cache is not None:
      # Prepend the cached keys and values, then store the extended tensors.
      key = tf.concat([cache["k"], key], axis=1)
      value = tf.concat([cache["v"], value], axis=1)
      cache["k"] = key
      cache["v"] = value

    # Split q, k, v into heads.
    query = self.split_heads(query)
    key = self.split_heads(key)
    value = self.split_heads(value)

    # Scale q to prevent the dot product between q and k from growing too large.
    head_depth = self.hidden_size // self.num_heads
    query = query * (head_depth ** -0.5)

    # Calculate dot product attention
    logits = tf.matmul(query, key, transpose_b=True) + bias
    weights = tf.nn.softmax(logits, name="attention_weights")
    if self.train:
      weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
    per_head_output = tf.matmul(weights, value)

    # Recombine heads --> [batch_size, length, hidden_size]
    combined_output = self.combine_heads(per_head_output)

    # Run the combined outputs through another linear projection layer.
    return self.output_dense_layer(combined_output)
class SelfAttention(Attention):
  """Multiheaded self-attention layer."""

  def call(self, x, bias, cache=None):
    """Apply attention with x serving as both the query and the memory input.

    Args:
      x: input tensor, passed to the parent layer as both x and y.
      bias: attention bias forwarded to the parent layer.
      cache: optional cache dictionary forwarded to the parent layer.

    Returns:
      The output of the parent attention layer.
    """
    return super(SelfAttention, self).call(x, x, bias, cache=cache)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Beam search to find the translated sequence with the highest probability.
Source implementation from Tensor2Tensor:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py
"""
import tensorflow as tf
from tensorflow.python.util import nest
# Default value for INF: a large finite stand-in for infinity. It is negated
# and added to scores / log probabilities to push entries out of the top-k.
INF = 1. * 1e7
class _StateKeys(object):
  """Keys to dictionary storing the state of the beam search loop."""

  # Variable storing the loop index.
  CUR_INDEX = "CUR_INDEX"

  # Top sequences that are alive for each batch item. Alive sequences are ones
  # that have not generated an EOS token. Sequences that reach EOS are marked as
  # finished and moved to the FINISHED_SEQ tensor.
  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
  ALIVE_SEQ = "ALIVE_SEQ"
  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
  # Dictionary of cached values for each alive sequence. The cache stores
  # the encoder output, attention bias, and the decoder attention output from
  # the previous iteration.
  ALIVE_CACHE = "ALIVE_CACHE"

  # Top finished sequences for each batch item.
  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
  # shorter than CUR_INDEX + 1 are padded with 0s.
  FINISHED_SEQ = "FINISHED_SEQ"
  # Scores for each finished sequence. Score = log probability / length norm
  # Shape [batch_size, beam_size]
  FINISHED_SCORES = "FINISHED_SCORES"
  # Flags indicating which sequences in the finished sequences are finished.
  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
  FINISHED_FLAGS = "FINISHED_FLAGS"
class SequenceBeamSearch(object):
"""Implementation of beam search loop."""
def __init__(self, symbols_to_logits_fn, vocab_size, batch_size,
beam_size, alpha, max_decode_length, eos_id):
self.symbols_to_logits_fn = symbols_to_logits_fn
self.vocab_size = vocab_size
self.batch_size = batch_size
self.beam_size = beam_size
self.alpha = alpha
self.max_decode_length = max_decode_length
self.eos_id = eos_id
def search(self, initial_ids, initial_cache):
"""Beam search for sequences with highest scores."""
state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
finished_state = tf.while_loop(
self._continue_search, self._search_step, loop_vars=[state],
shape_invariants=[state_shapes], parallel_iterations=1, back_prop=False)
finished_state = finished_state[0]
alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
# Account for corner case where there are no finished sequences for a
# particular batch item. In that case, return alive sequences for that batch
# item.
finished_seq = tf.where(
tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
finished_scores = tf.where(
tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
return finished_seq, finished_scores
def _create_initial_state(self, initial_ids, initial_cache):
"""Return initial state dictionary and its shape invariants.
Args:
initial_ids: initial ids to pass into the symbols_to_logits_fn.
int tensor with shape [batch_size, 1]
initial_cache: dictionary storing values to be passed into the
symbols_to_logits_fn.
Returns:
state and shape invariant dictionaries with keys from _StateKeys
"""
# Current loop index (starts at 0)
cur_index = tf.constant(0)
# Create alive sequence with shape [batch_size, beam_size, 1]
alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
alive_seq = tf.expand_dims(alive_seq, axis=2)
# Create tensor for storing initial log probabilities.
# Assume initial_ids are prob 1.0
initial_log_probs = tf.constant(
[[0.] + [-float("inf")] * (self.beam_size - 1)])
alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1])
# Expand all values stored in the dictionary to the beam size, so that each
# beam has a separate cache.
alive_cache = nest.map_structure(
lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
# Initialize tensor storing finished sequences with filler values.
finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
# Set scores of the initial finished seqs to negative infinity.
finished_scores = tf.ones([self.batch_size, self.beam_size]) * -INF
# Initialize finished flags with all False values.
finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool)
# Create state dictionary
state = {
_StateKeys.CUR_INDEX: cur_index,
_StateKeys.ALIVE_SEQ: alive_seq,
_StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
_StateKeys.ALIVE_CACHE: alive_cache,
_StateKeys.FINISHED_SEQ: finished_seq,
_StateKeys.FINISHED_SCORES: finished_scores,
_StateKeys.FINISHED_FLAGS: finished_flags
}
# Create state invariants for each value in the state dictionary. Each
# dimension must be a constant or None. A None dimension means either:
# 1) the dimension's value is a tensor that remains the same but may
# depend on the input sequence to the model (e.g. batch size).
# 2) the dimension may have different values on different iterations.
state_shape_invariants = {
_StateKeys.CUR_INDEX: tf.TensorShape([]),
_StateKeys.ALIVE_SEQ: tf.TensorShape([None, self.beam_size, None]),
_StateKeys.ALIVE_LOG_PROBS: tf.TensorShape([None, self.beam_size]),
_StateKeys.ALIVE_CACHE: nest.map_structure(
_get_shape_keep_last_dim, alive_cache),
_StateKeys.FINISHED_SEQ: tf.TensorShape([None, self.beam_size, None]),
_StateKeys.FINISHED_SCORES: tf.TensorShape([None, self.beam_size]),
_StateKeys.FINISHED_FLAGS: tf.TensorShape([None, self.beam_size])
}
return state, state_shape_invariants
def _continue_search(self, state):
"""Return whether to continue the search loop.
The loops should terminate when
1) when decode length has been reached, or
2) when the worst score in the finished sequences is better than the best
score in the alive sequences (i.e. the finished sequences are provably
unchanging)
Args:
state: A dictionary with the current loop state.
Returns:
Bool tensor with value True if loop should continue, False if loop should
terminate.
"""
i = state[_StateKeys.CUR_INDEX]
alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
finished_scores = state[_StateKeys.FINISHED_SCORES]
finished_flags = state[_StateKeys.FINISHED_FLAGS]
not_at_max_decode_length = tf.less(i, self.max_decode_length)
# Calculate largest length penalty (the larger penalty, the better score).
max_length_norm = _length_normalization(self.alpha, self.max_decode_length)
# Get the best possible scores from alive sequences.
best_alive_scores = alive_log_probs[:, 0] / max_length_norm
# Compute worst score in finished sequences for each batch element
finished_scores *= tf.to_float(finished_flags) # set filler scores to zero
lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
# If there are no finished sequences in a batch element, then set the lowest
# finished score to -INF for that element.
finished_batches = tf.reduce_any(finished_flags, 1)
lowest_finished_scores += (1. - tf.to_float(finished_batches)) * -INF
worst_finished_score_better_than_best_alive_score = tf.reduce_all(
tf.greater(lowest_finished_scores, best_alive_scores)
)
return tf.logical_and(
not_at_max_decode_length,
tf.logical_not(worst_finished_score_better_than_best_alive_score)
)
def _search_step(self, state):
"""Beam search loop body.
Grow alive sequences by a single ID. Sequences that have reached the EOS
token are marked as finished. The alive and finished sequences with the
highest log probabilities and scores are returned.
A sequence's finished score is calculating by dividing the log probability
by the length normalization factor. Without length normalization, the
search is more likely to return shorter sequences.
Args:
state: A dictionary with the current loop state.
Returns:
new state dictionary.
"""
# Grow alive sequences by one token.
new_seq, new_log_probs, new_cache = self._grow_alive_seq(state)
# Collect top beam_size alive sequences
alive_state = self._get_new_alive_state(new_seq, new_log_probs, new_cache)
# Combine newly finished sequences with existing finished sequences, and
# collect the top k scoring sequences.
finished_state = self._get_new_finished_state(state, new_seq, new_log_probs)
# Increment loop index and create new state dictionary
new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
new_state.update(alive_state)
new_state.update(finished_state)
return [new_state]
def _grow_alive_seq(self, state):
"""Grow alive sequences by one token, and collect top 2*beam_size sequences.
2*beam_size sequences are collected because some sequences may have reached
the EOS token. 2*beam_size ensures that at least beam_size sequences are
still alive.
Args:
state: A dictionary with the current loop state.
Returns:
Tuple of
(Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
Scores of returned sequences [batch_size, 2 * beam_size],
New alive cache, for each of the 2 * beam_size sequences)
"""
i = state[_StateKeys.CUR_INDEX]
alive_seq = state[_StateKeys.ALIVE_SEQ]
alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
alive_cache = state[_StateKeys.ALIVE_CACHE]
beams_to_keep = 2 * self.beam_size
# Get logits for the next candidate IDs for the alive sequences. Get the new
# cache values at the same time.
flat_ids = _flatten_beam_dim(alive_seq) # [batch_size * beam_size]
flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache)
flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache)
# Unflatten logits to shape [batch_size, beam_size, vocab_size]
logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size)
new_cache = nest.map_structure(
lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size),
flat_cache)
# Convert logits to normalized log probs
candidate_log_probs = _log_prob_from_logits(logits)
# Calculate new log probabilities if each of the alive sequences were
# extended # by the the candidate IDs.
# Shape [batch_size, beam_size, vocab_size]
log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
# Each batch item has beam_size * vocab_size candidate sequences. For each
# batch item, get the k candidates with the highest log probabilities.
flat_log_probs = tf.reshape(log_probs,
[-1, self.beam_size * self.vocab_size])
topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep)
# Extract the alive sequences that generate the highest log probabilities
# after being extended.
topk_beam_indices = topk_indices // self.vocab_size
topk_seq, new_cache = _gather_beams(
[alive_seq, new_cache], topk_beam_indices, self.batch_size,
beams_to_keep)
# Append the most probable IDs to the topk sequences
topk_ids = topk_indices % self.vocab_size
topk_ids = tf.expand_dims(topk_ids, axis=2)
topk_seq = tf.concat([topk_seq, topk_ids], axis=2)
return topk_seq, topk_log_probs, new_cache
def _get_new_alive_state(self, new_seq, new_log_probs, new_cache):
"""Gather the top k sequences that are still alive.
Args:
new_seq: New sequences generated by growing the current alive sequences
int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
new_log_probs: Log probabilities of new sequences
float32 tensor with shape [batch_size, beam_size]
new_cache: Dict of cached values for each sequence.
Returns:
Dictionary with alive keys from _StateKeys:
{Top beam_size sequences that are still alive (don't end with eos_id)
Log probabilities of top alive sequences
Dict cache storing decoder states for top alive sequences}
"""
# To prevent finished sequences from being considered, set log probs to -INF
new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id)
new_log_probs += tf.to_float(new_finished_flags) * -INF
top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
[new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size,
self.beam_size)
return {
_StateKeys.ALIVE_SEQ: top_alive_seq,
_StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
_StateKeys.ALIVE_CACHE: top_alive_cache
}
def _get_new_finished_state(self, state, new_seq, new_log_probs):
  """Merge newly finished candidates into the finished set and keep the best.

  Args:
    state: Dictionary holding the current loop state. The CUR_INDEX,
      FINISHED_SEQ, FINISHED_SCORES and FINISHED_FLAGS entries are read.
    new_seq: Candidate sequences produced by extending the alive sequences.
      int32 tensor with shape [batch_size, num_candidates, i + 1]
      (num_candidates is presumably 2 * beam_size -- confirm with the caller).
    new_log_probs: Log probabilities of the candidates in new_seq.
      float32 tensor with shape [batch_size, num_candidates]

  Returns:
    Dictionary keyed by the finished entries of _StateKeys:
      {FINISHED_SEQ: top beam_size sequences ranked by score,
       FINISHED_SCORES: length-normalized scores of those sequences,
       FINISHED_FLAGS: finished flags gathered for those sequences}
  """
  step = state[_StateKeys.CUR_INDEX]
  prev_seq = state[_StateKeys.FINISHED_SEQ]
  prev_scores = state[_StateKeys.FINISHED_SCORES]
  prev_flags = state[_StateKeys.FINISHED_FLAGS]

  # Pad the stored sequences with one column of zero ids so they are as long
  # as the candidates: [batch_size, beam_size, i + 1].
  zero_column = tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)
  prev_seq = tf.concat([prev_seq, zero_column], axis=2)

  # Score each candidate by its length-normalized log probability.
  candidate_scores = new_log_probs / _length_normalization(
      self.alpha, step + 1)

  # Candidates that have not produced EOS are still alive; push their scores
  # down by INF so they never rank among the finished sequences.
  candidate_flags = tf.equal(new_seq[:, :, -1], self.eos_id)
  candidate_scores += (1. - tf.to_float(candidate_flags)) * -INF

  # Pool old and new entries along the beam axis, then keep the best ones.
  pooled_seq = tf.concat([prev_seq, new_seq], axis=1)
  pooled_scores = tf.concat([prev_scores, candidate_scores], axis=1)
  pooled_flags = tf.concat([prev_flags, candidate_flags], axis=1)
  best_seq, best_scores, best_flags = _gather_topk_beams(
      [pooled_seq, pooled_scores, pooled_flags], pooled_scores,
      self.batch_size, self.beam_size)

  finished_state = {}
  finished_state[_StateKeys.FINISHED_SEQ] = best_seq
  finished_state[_StateKeys.FINISHED_SCORES] = best_scores
  finished_state[_StateKeys.FINISHED_FLAGS] = best_flags
  return finished_state
def sequence_beam_search(
    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size,
    alpha, max_decode_length, eos_id):
  """Run beam search to find the most probable sequences of subtoken ids.

  Args:
    symbols_to_logits_fn: Callable invoked as fn(ids, index, cache) with
        ids -> [batch_size * beam_size, index]
        index -> [] (scalar)
        cache -> nested dictionary of tensors [batch_size * beam_size, ...]
      It must return a (logits, new cache) pair with
        logits -> [batch * beam_size, vocab_size]
        new cache -> same shape/structure as the cache that was passed in
    initial_ids: Starting ids for each batch item.
      int32 tensor with shape [batch_size]
    initial_cache: dict containing the starting decoder variables information
    vocab_size: int size of the vocabulary
    beam_size: int number of beams
    alpha: float defining the strength of length normalization
    max_decode_length: maximum length of a decoded sequence
    eos_id: int id of the eos token, used to tell when a sequence has finished

  Returns:
    Top decoded sequences [batch_size, beam_size, max_decode_length]
    sequence scores [batch_size, beam_size]
  """
  # The batch size is read dynamically from the leading dimension of the ids.
  dynamic_batch_size = tf.shape(initial_ids)[0]
  searcher = SequenceBeamSearch(
      symbols_to_logits_fn, vocab_size, dynamic_batch_size, beam_size, alpha,
      max_decode_length, eos_id)
  return searcher.search(initial_ids, initial_cache)
def _log_prob_from_logits(logits):
  """Convert logits to log probabilities with a log-softmax over axis 2.

  Args:
    logits: float tensor of rank 3 or higher; axis 2 is normalized
      (presumably the vocabulary axis of [batch, beam, vocab] -- confirm
      with the caller).

  Returns:
    Tensor with the same shape as logits holding
    logits - logsumexp(logits, axis=2).
  """
  # `keepdims` replaces the deprecated `keep_dims` spelling; keeping the
  # reduced axis lets the normalizer broadcast against `logits`.
  return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True)
def _length_normalization(alpha, length):
  """Return the length penalty ((5 + length) / 6) ** alpha."""
  penalty_base = (5. + tf.to_float(length)) / 6.
  return tf.pow(penalty_base, alpha)
def _expand_to_beam_size(tensor, beam_size):
  """Insert a beam axis after the batch axis and repeat it beam_size times.

  Args:
    tensor: tensor to tile [batch_size, ...]
    beam_size: Number of copies to make along the new beam axis.

  Returns:
    Tiled tensor [batch_size, beam_size, ...]
  """
  with_beam_axis = tf.expand_dims(tensor, axis=1)
  # Repeat only along the freshly inserted axis 1; leave every other axis as is.
  multiples = [
      beam_size if axis == 1 else 1
      for axis in range(with_beam_axis.shape.ndims)]
  return tf.tile(with_beam_axis, multiples)
def _shape_list(tensor):
  """Return the tensor's shape as a list that contains no None entries.

  Dimensions that are known statically are returned as Python ints; every
  unknown dimension is replaced by the matching scalar slice of
  tf.shape(tensor).
  """
  static_dims = tensor.get_shape().as_list()
  runtime_dims = tf.shape(tensor)
  return [
      runtime_dims[axis] if dim is None else dim
      for axis, dim in enumerate(static_dims)]
def _get_shape_keep_last_dim(tensor):
  """Return a TensorShape that keeps only a statically known last dimension.

  Every dimension except the last is set to None. The last dimension is kept
  when it is known statically and set to None when it is only available as a
  tensor.
  """
  dims = _shape_list(tensor)
  last_dim = dims[-1]
  if isinstance(last_dim, tf.Tensor):
    last_dim = None
  return tf.TensorShape([None] * (len(dims) - 1) + [last_dim])
def _flatten_beam_dim(tensor):
  """Merge the first two dimensions of a tensor into a single dimension.

  Args:
    tensor: Tensor to reshape of shape [A, B, ...]

  Returns:
    Reshaped tensor of shape [A*B, ...]
  """
  dims = _shape_list(tensor)
  # Collapse the leading (batch, beam) pair and keep the remaining dims.
  merged_shape = [dims[0] * dims[1]] + dims[2:]
  return tf.reshape(tensor, merged_shape)
def _unflatten_beam_dim(tensor, batch_size, beam_size):
  """Split the first dimension back into [batch_size, beam_size].

  Args:
    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
    batch_size: Tensor, original batch size.
    beam_size: int, original beam size.

  Returns:
    Reshaped tensor of shape [batch_size, beam_size, ...]
  """
  trailing_dims = _shape_list(tensor)[1:]
  return tf.reshape(tensor, [batch_size, beam_size] + trailing_dims)
def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
  """Pick the selected beams out of every tensor in a nested structure.

  Each tensor in nested holds a batch of beams, where a beam is one search
  state (beam search explores several states in parallel). For every batch
  item, the beams named by beam_indices are gathered from each tensor.

  Args:
    nested: Nested structure (tensor, list, tuple or dict) containing tensors
      with shape [batch_size, beam_size, ...].
    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
      value must lie in [0, beam_size); values need not be unique.
    batch_size: int size of batch
    new_beam_size: int number of beams to pull from the nested tensors.

  Returns:
    Nested structure containing tensors with shape
      [batch_size, new_beam_size, ...]
  """
  # Build the batch coordinate for gather_nd: a tensor that looks like
  # [[0, 0, 0, 0], [1, 1, 1, 1], ...].
  flat_batch_ids = tf.range(batch_size * new_beam_size) // new_beam_size
  batch_ids = tf.reshape(flat_batch_ids, [batch_size, new_beam_size])

  # Pair every batch coordinate with its beam coordinate. The result has shape
  # [batch_size, new_beam_size, 2]; the last axis holds the (i, j) indices.
  gather_coords = tf.stack([batch_ids, beam_indices], axis=2)

  def _gather_one(state):
    return tf.gather_nd(state, gather_coords)

  return nest.map_structure(_gather_one, nested)
def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
  """Gather the beam_size highest-scoring beams from a nested structure."""
  # Only the positions of the best entries are needed, not their values.
  best_positions = tf.nn.top_k(score_or_log_prob, k=beam_size)[1]
  return _gather_beams(nested, best_positions, batch_size, beam_size)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test beam search helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import beam_search
class BeamSearchHelperTests(tf.test.TestCase):
  """Unit tests for the private helper functions of beam_search."""

  def test_expand_to_beam_size(self):
    # A beam axis of size 3 is inserted right after the batch axis.
    tiled = beam_search._expand_to_beam_size(tf.ones([7, 4, 2, 5]), 3)
    with self.test_session() as sess:
      tiled_shape = sess.run(tf.shape(tiled))
    self.assertAllEqual([7, 3, 4, 2, 5], tiled_shape)

  def test_shape_list(self):
    # Axis 1 depends on a placeholder, so only that entry should be a tensor.
    dynamic_dim = tf.placeholder(dtype=tf.int32, shape=[])
    tensor = tf.ones([7, dynamic_dim, 2, 5])
    dims = beam_search._shape_list(tensor)
    expected_types = (int, tf.Tensor, int, int)
    for axis, expected_type in enumerate(expected_types):
      self.assertIsInstance(dims[axis], expected_type)

  def test_get_shape_keep_last_dim(self):
    four = tf.constant(4.0)
    tensor = tf.ones([7, tf.to_int32(tf.sqrt(four)), 2, 5])
    kept = beam_search._get_shape_keep_last_dim(tensor)
    self.assertAllEqual([None, None, None, 5], kept.as_list())

  def test_flatten_beam_dim(self):
    flat = beam_search._flatten_beam_dim(tf.ones([7, 4, 2, 5]))
    with self.test_session() as sess:
      flat_shape = sess.run(tf.shape(flat))
    self.assertAllEqual([28, 2, 5], flat_shape)

  def test_unflatten_beam_dim(self):
    unflat = beam_search._unflatten_beam_dim(tf.ones([28, 2, 5]), 7, 4)
    with self.test_session() as sess:
      unflat_shape = sess.run(tf.shape(unflat))
    self.assertAllEqual([7, 4, 2, 5], unflat_shape)

  def test_gather_beams(self):
    beams = tf.reshape(tf.range(24), [2, 3, 4])
    # beams looks like: [[[ 0  1  2  3]
    #                     [ 4  5  6  7]
    #                     [ 8  9 10 11]]
    #
    #                    [[12 13 14 15]
    #                     [16 17 18 19]
    #                     [20 21 22 23]]]
    gathered_op = beam_search._gather_beams(beams, [[1, 2], [0, 2]], 2, 2)
    with self.test_session() as sess:
      gathered = sess.run(gathered_op)
    expected = [[[4, 5, 6, 7],
                 [8, 9, 10, 11]],
                [[12, 13, 14, 15],
                 [20, 21, 22, 23]]]
    self.assertAllEqual(expected, gathered)

  def test_gather_topk_beams(self):
    beams = tf.reshape(tf.range(24), [2, 3, 4])
    # The two best beams are (1, 2) for the first item and (0, 2) for the
    # second item.
    beam_scores = [[0, 1, 1], [1, 0, 1]]
    gathered_op = beam_search._gather_topk_beams(beams, beam_scores, 2, 2)
    with self.test_session() as sess:
      gathered = sess.run(gathered_op)
    expected = [[[4, 5, 6, 7],
                 [8, 9, 10, 11]],
                [[12, 13, 14, 15],
                 [20, 21, 22, 23]]]
    self.assertAllEqual(expected, gathered)
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of embedding layer with shared weights."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import model_utils
from official.utils.accelerator import tpu as tpu_utils
class EmbeddingSharedWeights(tf.layers.Layer):
  """Embedding lookup and pre-softmax projection backed by one weight matrix."""

  def __init__(self, vocab_size, hidden_size, method="gather"):
    """Store the characteristic parameters of the embedding layer.

    Args:
      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
      method: Strategy for performing the embedding lookup. "gather" uses
        tf.gather, which performs well on CPUs and GPUs but very poorly on
        TPUs. "matmul" one-hot encodes the indices and expresses the lookup as
        a sparse matrix multiplication. That formulation does extra work, but
        matrix multiplication is fast enough on TPUs to make "matmul"
        considerably faster than "gather" there.

    Raises:
      ValueError: if method is neither "gather" nor "matmul".
    """
    super(EmbeddingSharedWeights, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size

    supported_methods = ("gather", "matmul")
    if method not in supported_methods:
      raise ValueError("method {} must be 'gather' or 'matmul'".format(method))
    self.method = method

  def build(self, _):
    """Create the [vocab_size, hidden_size] weight matrix shared by both uses."""
    weights_shape = [self.vocab_size, self.hidden_size]
    # The random normal initializer was chosen arbitrarily and works well.
    weights_init = tf.random_normal_initializer(
        mean=0., stddev=self.hidden_size ** -0.5)
    with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
      self.shared_weights = tf.get_variable(
          "weights", weights_shape, initializer=weights_init)

    self.built = True

  def call(self, x):
    """Look up the embeddings of the token ids in x.

    Args:
      x: An int tensor with shape [batch_size, length]. Id 0 is treated as
        padding.

    Returns:
      float32 tensor with shape [batch_size, length, embedding_size]. The
      embeddings are scaled by sqrt(hidden_size); with the "gather" method the
      positions where x is 0 are zeroed explicitly.
    """
    with tf.name_scope("embedding"):
      # 1.0 where x holds a real token, 0.0 where it holds padding.
      nonpad_mask = tf.to_float(tf.not_equal(x, 0))

      if self.method != "gather":  # matmul
        # embedding_matmul already zeros out masked positions, so no extra
        # multiplication by the mask is needed on this path.
        embedded = tpu_utils.embedding_matmul(
            embedding_table=self.shared_weights,
            values=tf.cast(x, dtype=tf.int32),
            mask=nonpad_mask
        )
      else:
        embedded = tf.gather(self.shared_weights, x)
        embedded = embedded * tf.expand_dims(nonpad_mask, -1)

      # Scale the embeddings by the square root of the hidden size.
      scale = self.hidden_size ** 0.5
      return embedded * scale

  def linear(self, x):
    """Project x onto the vocabulary with the transposed embedding matrix.

    Args:
      x: A float32 tensor with shape [batch_size, length, hidden_size]

    Returns:
      float32 tensor with shape [batch_size, length, vocab_size].
    """
    with tf.name_scope("presoftmax_linear"):
      batch_size = tf.shape(x)[0]
      length = tf.shape(x)[1]

      # Flatten to 2-D for the matmul, then restore the leading dimensions.
      flat_inputs = tf.reshape(x, [-1, self.hidden_size])
      flat_logits = tf.matmul(
          flat_inputs, self.shared_weights, transpose_b=True)
      return tf.reshape(flat_logits, [batch_size, length, self.vocab_size])
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of fully connected network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class FeedFowardNetwork(tf.layers.Layer):
  """Two-layer fully connected feedforward network (ReLU, then linear)."""

  def __init__(self, hidden_size, filter_size, relu_dropout, train, allow_pad):
    """Create the two dense layers and store the configuration.

    Args:
      hidden_size: Output width of the network.
      filter_size: Width of the inner ReLU layer.
      relu_dropout: Dropout rate applied after the ReLU layer in training.
      train: Whether the network runs in training mode (enables dropout).
      allow_pad: Whether padding passed to call() may be stripped out.
    """
    super(FeedFowardNetwork, self).__init__()
    self.hidden_size = hidden_size
    self.filter_size = filter_size
    self.relu_dropout = relu_dropout
    self.train = train
    self.allow_pad = allow_pad

    self.filter_dense_layer = tf.layers.Dense(
        filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
    self.output_dense_layer = tf.layers.Dense(
        hidden_size, use_bias=True, name="output_layer")

  def call(self, x, padding=None):
    """Return the outputs of the feedforward network.

    Args:
      x: tensor with shape [batch_size, length, hidden_size]
      padding: (optional) tensor with shape [batch_size, length]. If set, and
        self.allow_pad is true, the padded positions are removed from x before
        the dense layers run and restored as zeros in the output.

    Returns:
      Output of the feedforward network.
      tensor with shape [batch_size, length, hidden_size]
    """
    strip_padding = bool(self.allow_pad) and padding is not None

    # Dynamic sizes, needed to rebuild the padded output further down.
    batch_size = tf.shape(x)[0]
    length = tf.shape(x)[1]

    if strip_padding:
      with tf.name_scope("remove_padding"):
        # Flatten padding to [batch_size*length] and locate the real tokens.
        flat_padding = tf.reshape(padding, [-1])
        nonpad_ids = tf.to_int32(tf.where(flat_padding < 1e-9))

        # Flatten x to [batch_size*length, hidden_size] and drop padded rows.
        x = tf.reshape(x, [-1, self.hidden_size])
        x = tf.gather_nd(x, indices=nonpad_ids)

        # Restore the static width, then go back to 3 dimensions.
        x.set_shape([None, self.hidden_size])
        x = tf.expand_dims(x, axis=0)

    hidden = self.filter_dense_layer(x)
    if self.train:
      hidden = tf.nn.dropout(hidden, 1.0 - self.relu_dropout)
    output = self.output_dense_layer(hidden)

    if strip_padding:
      with tf.name_scope("re_add_padding"):
        # Scatter the rows back to their original positions; padded positions
        # are left as zeros.
        output = tf.squeeze(output, axis=0)
        output = tf.scatter_nd(
            indices=nonpad_ids,
            updates=output,
            shape=[batch_size * length, self.hidden_size]
        )
        output = tf.reshape(output, [batch_size, length, self.hidden_size])
    return output
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines Transformer model parameters."""
from collections import defaultdict
# Hyperparameters of the base Transformer. Looking up a key that was never set
# yields None instead of raising KeyError.
BASE_PARAMS = defaultdict(
    lambda: None,
    {
        # Input params
        "default_batch_size": 2048,  # Max tokens per batch of examples.
        "default_batch_size_tpu": 32768,
        "max_length": 256,  # Maximum number of tokens per example.

        # Model params
        "initializer_gain": 1.0,  # Used in trainable variable initialization.
        "vocab_size": 33708,  # Number of tokens in the vocabulary file.
        "hidden_size": 512,  # Model dimension in the hidden layers.
        "num_hidden_layers": 6,  # Layers in the encoder and decoder stacks.
        "num_heads": 8,  # Number of heads in multi-headed attention.
        "filter_size": 2048,  # Inner layer dimension of the feedforward net.

        # Dropout values (only used when training)
        "layer_postprocess_dropout": 0.1,
        "attention_dropout": 0.1,
        "relu_dropout": 0.1,

        # Training params
        "label_smoothing": 0.1,
        "learning_rate": 2.0,
        "learning_rate_decay_rate": 1.0,
        "learning_rate_warmup_steps": 16000,

        # Optimizer params
        "optimizer_adam_beta1": 0.9,
        "optimizer_adam_beta2": 0.997,
        "optimizer_adam_epsilon": 1e-09,

        # Default prediction params
        "extra_decode_length": 50,
        "beam_size": 4,
        "alpha": 0.6,  # Strength of length normalization in beam search.

        # TPU specific parameters
        "use_tpu": False,
        "static_batch": False,
        "allow_ffn_pad": True,
    },
)

# Larger model: wider hidden and filter layers and more attention heads.
BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update({
    "default_batch_size": 4096,
    # The TPU batch size is smaller than in BASE_PARAMS due to memory limits.
    "default_batch_size_tpu": 16384,
    "hidden_size": 1024,
    "filter_size": 4096,
    "num_heads": 16,
})

# Parameters for running the model on multiple GPUs. These should not change
# the params that modify the model shape (such as hidden_size or num_heads).
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update({
    "learning_rate_warmup_steps": 8000,
})

BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update({
    "layer_postprocess_dropout": 0.3,
    "learning_rate_warmup_steps": 8000,
})

# Small configuration used for testing the model.
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update({
    "default_batch_size": 1024,
    "default_batch_size_tpu": 1024,
    "hidden_size": 32,
    "num_heads": 4,
    "filter_size": 256,
})
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer model helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
# Large negative value used as the additive attention bias at masked positions.
_NEG_INF = -1e9
def get_position_encoding(
    length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
  """Return positional encoding.

  Calculates the position encoding as a mix of sine and cosine functions with
  geometrically increasing wavelengths.
  Defined and formulated in Attention is All You Need, section 3.5.

  Args:
    length: Sequence length.
    hidden_size: Size of the encoding's last dimension. Half of the channels
      hold sines and the other half cosines.
    min_timescale: Minimum scale that will be applied at each position
    max_timescale: Maximum scale that will be applied at each position

  Returns:
    Tensor with shape [length, hidden_size]. For an odd hidden_size the last
    dimension is 2 * (hidden_size // 2).
  """
  position = tf.to_float(tf.range(length))
  num_timescales = hidden_size // 2
  # With a single timescale, `num_timescales - 1` is zero. Clamp the
  # denominator to 1 so the increment stays finite instead of turning the
  # whole encoding into inf/NaN. Larger sizes are unaffected.
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      tf.maximum(tf.to_float(num_timescales) - 1, 1.0))
  inv_timescales = min_timescale * tf.exp(
      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
  # Outer product of positions and inverse timescales:
  # [length, num_timescales].
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  return signal
def get_decoder_self_attention_bias(length):
  """Build the bias that keeps decoder self-attention autoregressive.

  The bias is 0 where position i may attend to position j (j <= i) and
  _NEG_INF where it may not, so a prediction at position i cannot draw
  information from later positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    # Lower-triangular matrix of ones marks the allowed connections.
    lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    allowed = tf.reshape(lower_triangle, [1, 1, length, length])
    return _NEG_INF * (1.0 - allowed)
def get_padding(x, padding_value=0):
  """Return a float tensor marking the padding positions of x.

  Args:
    x: int tensor with any shape
    padding_value: int value that marks a padding position in x.

  Returns:
    float tensor with the same shape as x containing values 0 or 1.
      0 -> non-padding, 1 -> padding
  """
  with tf.name_scope("padding"):
    is_padding = tf.equal(x, padding_value)
    return tf.to_float(is_padding)
def get_padding_bias(x):
  """Compute the attention bias that masks the padding positions of x.

  The bias is added to the pre-softmax multi-headed attention logits, which
  have shape [batch_size, num_heads, length, length]. It is zero at
  non-padding locations and -1e9 (negative infinity) at padding locations.

  Args:
    x: int tensor with shape [batch_size, length]

  Returns:
    Attention bias tensor of shape [batch_size, 1, 1, length].
  """
  with tf.name_scope("attention_bias"):
    flat_bias = get_padding(x) * _NEG_INF
    # Insert two singleton axes after the batch axis so the bias broadcasts
    # over the heads and query positions.
    with_one_axis = tf.expand_dims(flat_bias, axis=1)
    return tf.expand_dims(with_one_axis, axis=1)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Transformer model helper methods."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import model_utils
# Expected bias value at masked positions in the assertions of this module.
NEG_INF = -1e9
class ModelUtilsTest(tf.test.TestCase):
  """Unit tests for the padding and attention bias helpers in model_utils."""

  def test_get_padding(self):
    token_ids = tf.constant(
        [[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    padding_op = model_utils.get_padding(token_ids, padding_value=0)
    with self.test_session() as sess:
      actual = sess.run(padding_op)

    # 1 marks every position that holds the padding value 0.
    expected = [[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]]
    self.assertAllEqual(expected, actual)

  def test_get_padding_bias(self):
    token_ids = tf.constant(
        [[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias_op = model_utils.get_padding_bias(token_ids)
    shape_op = tf.shape(bias_op)
    # Drop the two singleton axes so the values are easy to compare.
    flat_bias_op = tf.reshape(bias_op, [3, 5])
    with self.test_session() as sess:
      flat_bias, bias_shape = sess.run((flat_bias_op, shape_op))

    expected_bias = [[0, NEG_INF, NEG_INF, NEG_INF, 0],
                     [0, 0, NEG_INF, NEG_INF, NEG_INF],
                     [NEG_INF, 0, 0, NEG_INF, 0]]
    self.assertAllEqual(expected_bias, flat_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)

  def test_get_decoder_self_attention_bias(self):
    seq_length = 5
    bias_op = model_utils.get_decoder_self_attention_bias(seq_length)
    with self.test_session() as sess:
      actual = sess.run(bias_op)

    # Each position may attend only to itself and to earlier positions.
    expected = [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
                  [0, 0, NEG_INF, NEG_INF, NEG_INF],
                  [0, 0, 0, NEG_INF, NEG_INF],
                  [0, 0, 0, 0, NEG_INF],
                  [0, 0, 0, 0, 0]]]]
    self.assertAllEqual(expected, actual)
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines the Transformer model, and its encoder and decoder stacks.
Model paper: https://arxiv.org/pdf/1706.03762.pdf
Transformer model code source: https://github.com/tensorflow/tensor2tensor
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.transformer.model import attention_layer
from official.transformer.model import beam_search
from official.transformer.model import embedding_layer
from official.transformer.model import ffn_layer
from official.transformer.model import model_utils
from official.transformer.utils.tokenizer import EOS_ID
# Large negative constant standing in for negative infinity.
# NOTE(review): not referenced in the visible part of this module -- confirm
# it is still needed.
_NEG_INF = -1e9
class Transformer(object):
"""Transformer model for sequence to sequence data.
Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
The Transformer model consists of an encoder and decoder. The input is an int
sequence (or a batch of sequences). The encoder produces a continuous
representation, and the decoder uses the encoder output to generate
probabilities for the output sequence.
"""
def __init__(self, params, train):
  """Create the layers that make up the Transformer model.

  Args:
    params: hyperparameter object defining layer sizes, dropout values, etc.
    train: boolean indicating whether the model is in training mode. Used to
      determine if dropout layers should be added.
  """
  self.train = train
  self.params = params

  # The embedding lookup is done with "matmul" when params["tpu"] is truthy
  # and with "gather" otherwise.
  if params["tpu"]:
    lookup_method = "matmul"
  else:
    lookup_method = "gather"
  self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
      params["vocab_size"], params["hidden_size"], method=lookup_method)

  self.encoder_stack = EncoderStack(params, train)
  self.decoder_stack = DecoderStack(params, train)
def __call__(self, inputs, targets=None):
  """Compute target logits, or decode a target sequence when none is given.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is given, the logits for each word in the target sequence:
    float tensor with shape [batch_size, target_length, vocab_size].
    If targets is None, the result of self.predict(), which generates the
    output sequence one token at a time.
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  weight_init = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=weight_init):
    # Bias that masks padding; shared by the encoder self-attention and the
    # decoder's attention over the encoder outputs.
    attention_bias = model_utils.get_padding_bias(inputs)

    # Map the input symbols to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)

    # A known target sequence yields logits; otherwise decode a new sequence.
    if targets is not None:
      return self.decode(targets, encoder_outputs, attention_bias)
    return self.predict(encoder_outputs, attention_bias)
def encode(self, inputs, attention_bias):
  """Run the encoder stack over the embedded inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  hidden_size = self.params["hidden_size"]
  with tf.name_scope("encode"):
    # Embed the tokens, then add positional encodings and apply dropout.
    embedded = self.embedding_softmax_layer(inputs)
    padding_mask = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      input_length = tf.shape(embedded)[1]
      encoder_inputs = embedded + model_utils.get_position_encoding(
          input_length, hidden_size)

    if self.train:
      keep_prob = 1 - self.params["layer_postprocess_dropout"]
      encoder_inputs = tf.nn.dropout(encoder_inputs, keep_prob)

    return self.encoder_stack(encoder_inputs, attention_bias, padding_mask)
def decode(self, targets, encoder_outputs, attention_bias):
  """Compute the logits for every position of the target sequence.

  Args:
    targets: target values for the output sequence.
      int tensor with shape [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence.
      float tensor with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  hidden_size = self.params["hidden_size"]
  with tf.name_scope("decode"):
    # Embed the targets; they are shifted, position-encoded and passed
    # through dropout before reaching the decoder layers.
    embedded = self.embedding_softmax_layer(targets)

    with tf.name_scope("shift_targets"):
      # Shift one step to the right: prepend a zero vector along the length
      # axis and drop the last position.
      padded = tf.pad(embedded, [[0, 0], [1, 0], [0, 0]])
      decoder_inputs = padded[:, :-1, :]

    with tf.name_scope("add_pos_encoding"):
      target_length = tf.shape(decoder_inputs)[1]
      decoder_inputs = decoder_inputs + model_utils.get_position_encoding(
          target_length, hidden_size)

    if self.train:
      keep_prob = 1 - self.params["layer_postprocess_dropout"]
      decoder_inputs = tf.nn.dropout(decoder_inputs, keep_prob)

    # Run the decoder stack and project its outputs onto the vocabulary.
    self_attention_bias = model_utils.get_decoder_self_attention_bias(
        target_length)
    decoder_outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, self_attention_bias, attention_bias)
    return self.embedding_softmax_layer.linear(decoder_outputs)
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Build the step function that yields the logits of the next tokens."""
  # Both tensors are created once here and sliced at every decoding step.
  position_encoding = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  full_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences.
        int tensor with shape [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Only the most recently generated id is fed to the decoder.
    last_ids = ids[:, -1:]

    # Embed that id and add the encoding of the current position.
    step_input = self.embedding_softmax_layer(last_ids)
    step_input += position_encoding[i:i + 1]

    # Row i of the causal bias, cut to the i + 1 positions decoded so far.
    step_bias = full_self_attention_bias[:, :, i:i + 1, :i + 1]
    step_outputs = self.decoder_stack(
        step_input, cache.get("encoder_outputs"), step_bias,
        cache.get("encoder_decoder_attention_bias"), cache)

    # Project to the vocabulary and drop the length-1 sequence axis.
    step_logits = self.embedding_softmax_layer.linear(step_outputs)
    step_logits = tf.squeeze(step_logits, axis=[1])
    return step_logits, cache

  return symbols_to_logits_fn
def predict(self, encoder_outputs, encoder_decoder_attention_bias):
  """Decode the best sequence for every batch element with beam search.

  Args:
    encoder_outputs: encoder representation of the inputs; its first two
      dimensions are read as batch size and input length.
    encoder_decoder_attention_bias: bias stored in the decoding cache for
      the encoder-decoder attention.

  Returns:
    Dict with "outputs" (ids of the top beam, first position dropped) and
    "scores" (score of the top beam).
  """
  hidden_size = self.params["hidden_size"]
  batch_size = tf.shape(encoder_outputs)[0]
  source_length = tf.shape(encoder_outputs)[1]
  decode_limit = source_length + self.params["extra_decode_length"]
  next_logits_fn = self._get_symbols_to_logits_fn(decode_limit)

  # Every sequence starts from id 0.
  start_ids = tf.zeros([batch_size], dtype=tf.int32)

  # Per-layer cache of decoder self-attention keys/values, initially with a
  # zero-length time axis.
  decoding_cache = {}
  for layer_index in range(self.params["num_hidden_layers"]):
    decoding_cache["layer_%d" % layer_index] = {
        "k": tf.zeros([batch_size, 0, hidden_size]),
        "v": tf.zeros([batch_size, 0, hidden_size]),
    }
  # The encoder results travel with the cache so the step function can
  # read them back.
  decoding_cache["encoder_outputs"] = encoder_outputs
  decoding_cache["encoder_decoder_attention_bias"] = (
      encoder_decoder_attention_bias)

  decoded_ids, scores = beam_search.sequence_beam_search(
      symbols_to_logits_fn=next_logits_fn,
      initial_ids=start_ids,
      initial_cache=decoding_cache,
      vocab_size=self.params["vocab_size"],
      beam_size=self.params["beam_size"],
      alpha=self.params["alpha"],
      max_decode_length=decode_limit,
      eos_id=EOS_ID)

  # Beam index 0 is the top sequence; position 0 holds the initial id.
  return {"outputs": decoded_ids[:, 0, 1:], "scores": scores[:, 0]}
class LayerNormalization(tf.layers.Layer):
  """Normalizes the last axis and applies a learned scale and bias."""

  def __init__(self, hidden_size):
    """Stores the size of the axis that the scale and bias span."""
    super(LayerNormalization, self).__init__()
    self.hidden_size = hidden_size

  def build(self, _):
    """Creates the scale (initialized to ones) and bias (zeros) variables."""
    self.scale = tf.get_variable(
        "layer_norm_scale",
        [self.hidden_size],
        initializer=tf.ones_initializer())
    self.bias = tf.get_variable(
        "layer_norm_bias",
        [self.hidden_size],
        initializer=tf.zeros_initializer())
    self.built = True

  def call(self, x, epsilon=1e-6):
    """Returns x normalized over its last axis, then scaled and shifted."""
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    squared_deviation = tf.square(x - mean)
    variance = tf.reduce_mean(squared_deviation, axis=[-1], keepdims=True)
    centered = x - mean
    # epsilon keeps the reciprocal square root finite for zero variance.
    inverse_stddev = tf.rsqrt(variance + epsilon)
    return centered * inverse_stddev * self.scale + self.bias
class PrePostProcessingWrapper(object):
  """Wraps a layer with layer-norm before it and dropout + residual after."""

  def __init__(self, layer, params, train):
    """Stores the wrapped layer and builds its normalization layer.

    Args:
      layer: callable applied to the normalized input.
      params: dict providing "layer_postprocess_dropout" and "hidden_size".
      train: whether dropout is applied to the layer output.
    """
    self.layer = layer
    self.train = train
    self.postprocess_dropout = params["layer_postprocess_dropout"]
    # Normalization applied to the input before the wrapped layer runs.
    self.layer_norm = LayerNormalization(params["hidden_size"])

  def __call__(self, x, *args, **kwargs):
    """Returns x plus the (optionally dropped-out) output of the layer."""
    normalized = self.layer_norm(x)
    sublayer_output = self.layer(normalized, *args, **kwargs)
    if not self.train:
      return x + sublayer_output
    # tf.nn.dropout takes the keep probability, hence 1 - rate.
    return x + tf.nn.dropout(sublayer_output, 1 - self.postprocess_dropout)
class EncoderStack(tf.layers.Layer):
  """Stack of identical Transformer encoder layers.

  Every layer consists of two wrapped sublayers, applied in this order:
    1. Self-attention
    2. Feedforward network
  A final layer normalization is applied to the output of the last layer.
  """

  def __init__(self, params, train):
    """Builds params["num_hidden_layers"] encoder layers.

    Args:
      params: dict of hyperparameters read by the sublayers.
      train: whether the sublayers run in training mode.
    """
    super(EncoderStack, self).__init__()
    self.layers = []
    hidden_size = params["hidden_size"]
    for _ in range(params["num_hidden_layers"]):
      # Raw sublayers first, then their pre/post-processing wrappers.
      attention = attention_layer.SelfAttention(
          hidden_size, params["num_heads"],
          params["attention_dropout"], train)
      feed_forward = ffn_layer.FeedFowardNetwork(
          hidden_size, params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])
      wrapped_attention = PrePostProcessingWrapper(attention, params, train)
      wrapped_feed_forward = PrePostProcessingWrapper(
          feed_forward, params, train)
      self.layers.append([wrapped_attention, wrapped_feed_forward])

    # Normalization applied once, after the last layer.
    self.output_normalization = LayerNormalization(hidden_size)

  def call(self, encoder_inputs, attention_bias, inputs_padding):
    """Runs the inputs through every encoder layer.

    Args:
      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: bias for the encoder self-attention layer.
        [batch_size, 1, 1, input_length]
      inputs_padding: value forwarded unchanged to each feedforward sublayer.

    Returns:
      Normalized output of the last encoder layer.
      float32 tensor with shape [batch_size, input_length, hidden_size]
    """
    hidden = encoder_inputs
    for layer_index, sublayers in enumerate(self.layers):
      attention_sublayer, ffn_sublayer = sublayers[0], sublayers[1]
      with tf.variable_scope("layer_%d" % layer_index):
        with tf.variable_scope("self_attention"):
          hidden = attention_sublayer(hidden, attention_bias)
        with tf.variable_scope("ffn"):
          hidden = ffn_sublayer(hidden, inputs_padding)
    return self.output_normalization(hidden)
class DecoderStack(tf.layers.Layer):
  """Stack of identical Transformer decoder layers.

  Every layer consists of three wrapped sublayers, applied in this order:
    1. Self-attention
    2. Attention over the encoder outputs
    3. Feedforward network
  A final layer normalization is applied to the output of the last layer.
  """

  def __init__(self, params, train):
    """Builds params["num_hidden_layers"] decoder layers.

    Args:
      params: dict of hyperparameters read by the sublayers.
      train: whether the sublayers run in training mode.
    """
    super(DecoderStack, self).__init__()
    self.layers = []
    hidden_size = params["hidden_size"]
    for _ in range(params["num_hidden_layers"]):
      # Raw sublayers first, then their pre/post-processing wrappers.
      self_attention = attention_layer.SelfAttention(
          hidden_size, params["num_heads"],
          params["attention_dropout"], train)
      encdec_attention = attention_layer.Attention(
          hidden_size, params["num_heads"],
          params["attention_dropout"], train)
      feed_forward = ffn_layer.FeedFowardNetwork(
          hidden_size, params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])
      wrapped_self_attention = PrePostProcessingWrapper(
          self_attention, params, train)
      wrapped_encdec_attention = PrePostProcessingWrapper(
          encdec_attention, params, train)
      wrapped_feed_forward = PrePostProcessingWrapper(
          feed_forward, params, train)
      self.layers.append([
          wrapped_self_attention,
          wrapped_encdec_attention,
          wrapped_feed_forward])

    # Normalization applied once, after the last layer.
    self.output_normalization = LayerNormalization(hidden_size)

  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
           attention_bias, cache=None):
    """Runs the inputs through every decoder layer.

    Args:
      decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
      encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
      decoder_self_attention_bias: bias for decoder self-attention layer.
        [1, 1, target_len, target_length]
      attention_bias: bias for encoder-decoder attention layer.
        [batch_size, 1, 1, input_length]
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
          {layer_n: {"k": tensor with shape [batch_size, i, key_channels],
                     "v": tensor with shape [batch_size, i, value_channels]},
           ...}

    Returns:
      Normalized output of the last decoder layer.
      float32 tensor with shape [batch_size, target_length, hidden_size]
    """
    hidden = decoder_inputs
    for layer_index, sublayers in enumerate(self.layers):
      self_attention_sublayer = sublayers[0]
      encdec_attention_sublayer = sublayers[1]
      ffn_sublayer = sublayers[2]

      scope_name = "layer_%d" % layer_index
      # Only the self-attention sublayer receives this layer's cache entry.
      layer_cache = None
      if cache is not None:
        layer_cache = cache[scope_name]

      with tf.variable_scope(scope_name):
        with tf.variable_scope("self_attention"):
          hidden = self_attention_sublayer(
              hidden, decoder_self_attention_bias, cache=layer_cache)
        with tf.variable_scope("encdec_attention"):
          hidden = encdec_attention_sublayer(
              hidden, encoder_outputs, attention_bias)
        with tf.variable_scope("ffn"):
          hidden = ffn_sublayer(hidden)
    return self.output_normalization(hidden)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train and evaluate the Transformer model.
See README for description of setting the training schedule and evaluating the
BLEU score.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tempfile
# pylint: disable=g-bad-import-order
from six.moves import xrange # pylint: disable=redefined-builtin
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer import compute_bleu
from official.transformer import translate
from official.transformer.model import model_params
from official.transformer.model import transformer
from official.transformer.utils import dataset
from official.transformer.utils import metrics
from official.transformer.utils import schedule
from official.transformer.utils import tokenizer
from official.utils.accelerator import tpu as tpu_util
from official.utils.export import export
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
# Maps each accepted value of the --param_set flag to its parameter set from
# model_params.
PARAMS_MAP = {
    "tiny": model_params.TINY_PARAMS,
    "base": model_params.BASE_PARAMS,
    "big": model_params.BIG_PARAMS,
}
# Passed to schedule.Manager as `default_train_epochs`.
DEFAULT_TRAIN_EPOCHS = 10
# Iteration count used by run_loop when training runs until a BLEU threshold
# is reached (effectively "no limit").
INF = int(1e9)
# Subdirectory of the estimator's model_dir that receives BLEU summaries.
BLEU_DIR = "bleu"
# Dictionary containing tensors that are logged by the logging hooks. Each item
# maps a string to the tensor name.
TENSORS_TO_LOG = {
    "learning_rate": "model/get_train_op/learning_rate/learning_rate",
    "cross_entropy_loss": "model/cross_entropy"}
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model.

  Args:
    features: model inputs, passed to the Transformer as `inputs`.
    labels: model targets, passed to the Transformer as `targets`.
    mode: a tf.estimator.ModeKeys value selecting train, eval or predict.
    params: dict of run parameters; this function reads "use_tpu",
      "label_smoothing", "vocab_size" and "model_dir" and forwards the dict
      to the model and the training-op builder.

  Returns:
    A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec when
    params["use_tpu"] is set.

  Raises:
    NotImplementedError: if prediction is requested with params["use_tpu"].
  """
  # Everything is created under the "model" scope; the names in
  # TENSORS_TO_LOG rely on this prefix.
  with tf.variable_scope("model"):
    inputs, targets = features, labels
    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)
    # When in prediction mode, the labels/targets is None. The model output
    # is the prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })
    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits is
    # not a problem when computing xentropy, because padded_cross_entropy_loss
    # resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])
    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in the
    # targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    # Weighted mean: total cross entropy divided by the total weight.
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")
    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)
      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        # On TPU the scalar metrics are reported through a host call.
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/")
        )
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def record_scalars(metric_dict):
  """Emits one tf.contrib.summary scalar per entry of `metric_dict`.

  Args:
    metric_dict: dict mapping a summary name to the tensor to record.
  """
  for summary_name, summary_tensor in metric_dict.items():
    tf.contrib.summary.scalar(name=summary_name, tensor=summary_tensor)
def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
  """Calculate learning rate with linear warmup and rsqrt decay.

  The returned value is
    learning_rate * hidden_size ** -0.5
      * min(1, step / warmup_steps)
      * rsqrt(max(step, warmup_steps))
  where `step` is the global step.

  Args:
    learning_rate: base learning rate that the schedule scales.
    hidden_size: model hidden size; the rate is scaled by hidden_size ** -0.5.
    learning_rate_warmup_steps: number of steps of linear warmup.

  Returns:
    The scheduled learning rate tensor.
  """
  with tf.name_scope("learning_rate"):
    warmup_steps = tf.to_float(learning_rate_warmup_steps)
    step = tf.to_float(tf.train.get_or_create_global_step())
    learning_rate *= (hidden_size ** -0.5)
    # Apply linear warmup
    learning_rate *= tf.minimum(1.0, step / warmup_steps)
    # Apply rsqrt decay
    # (the max keeps this factor constant until warmup_steps is reached).
    learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
    # Create a named tensor that will be logged using the logging hook.
    # The full name includes variable and names scope. In this case, the name
    # is model/get_train_op/learning_rate/learning_rate
    tf.identity(learning_rate, "learning_rate")
    return learning_rate
def get_train_op_and_metrics(loss, params):
  """Generate training op and metrics to save in TensorBoard.

  Args:
    loss: scalar loss tensor to minimize.
    params: dict of run parameters; reads the learning-rate settings, the
      Adam hyperparameters, "use_tpu" and "tpu".

  Returns:
    Tuple (train_op, train_metrics) where train_metrics maps metric names to
    tensors: always "learning_rate", plus "global_norm/gradient_norm" when
    not running on TPU.
  """
  with tf.variable_scope("get_train_op"):
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
    # than the TF core Adam optimizer.
    optimizer = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params["optimizer_adam_beta1"],
        beta2=params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])
    # Wrap the optimizer for cross-shard use unless the TPU is the local
    # placeholder value.
    if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    # Run any registered update ops together with the minimize op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)
    train_metrics = {"learning_rate": learning_rate}
    if not params["use_tpu"]:
      # gradient norm is not included as a summary when running on TPU, as
      # it can cause instability between the TPU and the host controller.
      # `gradients` is a list of (gradient, variable) pairs; zip(*...)[0]
      # selects the gradients.
      gradient_norm = tf.global_norm(list(zip(*gradients))[0])
      train_metrics["global_norm/gradient_norm"] = gradient_norm
    return train_op, train_metrics
def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
  """Translate file and report the cased and uncased bleu scores.

  The translations are written to a temporary file, which is always removed
  before this function returns or raises.

  Args:
    estimator: tf.Estimator used to generate the translations.
    subtokenizer: Subtokenizer passed through to translate.translate_file.
    bleu_source: path of the file containing the text to translate.
    bleu_ref: path of the file containing the reference translations.

  Returns:
    Tuple (uncased_score, cased_score).
  """
  # Create temporary file to store translation. Only its name is needed, so
  # the handle is closed right away instead of being leaked (an open handle
  # can also prevent other writers from opening the file on some platforms).
  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp_filename = tmp.name
  tmp.close()

  try:
    translate.translate_file(
        estimator, subtokenizer, bleu_source, output_file=tmp_filename,
        print_all_translations=False)

    # Compute uncased and cased bleu scores.
    uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
    cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
  finally:
    # delete=False means cleanup is our job, including on failure.
    os.remove(tmp_filename)
  return uncased_score, cased_score
def get_global_step(estimator):
  """Return the global step encoded in the estimator's latest checkpoint path.

  The step is the integer that follows the last "-" in the path returned by
  `estimator.latest_checkpoint()`.
  """
  checkpoint_path = estimator.latest_checkpoint()
  _, _, step_suffix = checkpoint_path.rpartition("-")
  return int(step_suffix)
def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file):
  """Calculate and record the BLEU score.

  Args:
    estimator: tf.Estimator used to generate the translations.
    bleu_source: path of the file containing the text to translate.
    bleu_ref: path of the file containing the reference translations.
    vocab_file: path of the vocabulary file used to build the Subtokenizer.

  Returns:
    Tuple (uncased_score, cased_score).
  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, subtokenizer, bleu_source, bleu_ref)

  # The scores are logged with %f: formatting them with %d would silently
  # drop the fractional part of the score.
  tf.logging.info("Bleu score (uncased): %f", uncased_score)
  tf.logging.info("Bleu score (cased): %f", cased_score)
  return uncased_score, cased_score
def _validate_file(filepath):
  """Raise NotFoundError unless `filepath` exists according to tf.gfile."""
  if tf.gfile.Exists(filepath):
    return
  raise tf.errors.NotFoundError(None, None, "File %s not found." % filepath)
def run_loop(
    estimator, schedule_manager, train_hooks=None, benchmark_logger=None,
    bleu_source=None, bleu_ref=None, bleu_threshold=None, vocab_file=None):
  """Train and evaluate model, and optionally compute model's BLEU score.

  **Step vs. Epoch vs. Iteration**

  Steps and epochs are canonical terms used in TensorFlow and general machine
  learning. They are used to describe running a single process (train/eval):
    - Step refers to running the process through a single or batch of examples.
    - Epoch refers to running the process through an entire dataset.

  E.g. training a dataset with 100 examples. The dataset is
  divided into 20 batches with 5 examples per batch. A single training step
  trains the model on one batch. After 20 training steps, the model will have
  trained on every batch in the dataset, or, in other words, one epoch.

  Meanwhile, iteration is used in this implementation to describe running
  multiple processes (training and eval).
    - A single iteration:
      1. trains the model for a specific number of steps or epochs.
      2. evaluates the model.
      3. (if source and ref files are provided) compute BLEU score.

  This function runs through multiple train+eval+bleu iterations.

  Args:
    estimator: tf.Estimator containing model to train.
    schedule_manager: A schedule.Manager object to guide the run loop. Its
      `train_eval_iterations` attribute is overwritten with INF when BLEU is
      evaluated and `bleu_threshold` is set.
    train_hooks: List of hooks to pass to the estimator during training.
    benchmark_logger: a BenchmarkLogger object that logs evaluation data.
      May be None, in which case benchmark logging is skipped.
    bleu_source: File containing text to be translated for BLEU calculation.
    bleu_ref: File containing reference translations for BLEU calculation.
    bleu_threshold: minimum BLEU score before training is stopped.
    vocab_file: Path to vocab file that will be used to subtokenize bleu_source.

  Raises:
    ValueError: if BLEU evaluation is requested while training with a TPU.
    NotFoundError: if the vocab file or bleu files don't exist.
  """
  if bleu_source:
    _validate_file(bleu_source)
  if bleu_ref:
    _validate_file(bleu_ref)
  if vocab_file:
    _validate_file(vocab_file)

  evaluate_bleu = bleu_source is not None and bleu_ref is not None
  if evaluate_bleu and schedule_manager.use_tpu:
    raise ValueError("BLEU score can not be computed when training with a TPU, "
                     "as it requires estimator.predict which is not yet "
                     "supported.")

  # Print details of training schedule.
  tf.logging.info("Training schedule:")
  tf.logging.info(
      "\t1. Train for {}".format(schedule_manager.train_increment_str))
  tf.logging.info("\t2. Evaluate model.")
  if evaluate_bleu:
    tf.logging.info("\t3. Compute BLEU score.")
    if bleu_threshold is not None:
      tf.logging.info("Repeat above steps until the BLEU score reaches %f" %
                      bleu_threshold)
  if not evaluate_bleu or bleu_threshold is None:
    tf.logging.info("Repeat above steps %d times." %
                    schedule_manager.train_eval_iterations)

  bleu_writer = None
  if evaluate_bleu:
    # Create summary writer to log bleu score (values can be displayed in
    # Tensorboard).
    bleu_writer = tf.summary.FileWriter(
        os.path.join(estimator.model_dir, BLEU_DIR))
    if bleu_threshold is not None:
      # Change loop stopping condition if bleu_threshold is defined.
      schedule_manager.train_eval_iterations = INF

  try:
    # Loop training/evaluation/bleu cycles
    for i in xrange(schedule_manager.train_eval_iterations):
      tf.logging.info("Starting iteration %d" % (i + 1))

      # Train the model for single_iteration_train_steps or until the input fn
      # runs out of examples (if single_iteration_train_steps is None).
      estimator.train(
          dataset.train_input_fn,
          steps=schedule_manager.single_iteration_train_steps,
          hooks=train_hooks)

      eval_results = estimator.evaluate(
          input_fn=dataset.eval_input_fn,
          steps=schedule_manager.single_iteration_eval_steps)

      tf.logging.info("Evaluation results (iter %d/%d):" %
                      (i + 1, schedule_manager.train_eval_iterations))
      tf.logging.info(eval_results)
      # benchmark_logger is optional (defaults to None in the signature).
      if benchmark_logger is not None:
        benchmark_logger.log_evaluation_result(eval_results)

      # The results from estimator.evaluate() are measured on an approximate
      # translation, which utilize the target golden values provided. The
      # actual bleu score must be computed using the estimator.predict() path,
      # which outputs translations that are not based on golden values. The
      # translations are compared to reference file to get the actual bleu
      # score.
      if evaluate_bleu:
        uncased_score, cased_score = evaluate_and_log_bleu(
            estimator, bleu_source, bleu_ref, vocab_file)

        # Write actual bleu scores using summary writer and benchmark logger
        global_step = get_global_step(estimator)
        summary = tf.Summary(value=[
            tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
            tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
        ])
        bleu_writer.add_summary(summary, global_step)
        bleu_writer.flush()
        if benchmark_logger is not None:
          benchmark_logger.log_metric(
              "bleu_uncased", uncased_score, global_step=global_step)
          benchmark_logger.log_metric(
              "bleu_cased", cased_score, global_step=global_step)

        # Stop training if bleu stopping threshold is met.
        if model_helpers.past_stop_threshold(bleu_threshold, uncased_score):
          break
  finally:
    # Close the writer however the loop ends (threshold reached, iterations
    # exhausted, or an exception), so the event file is not left open.
    if bleu_writer is not None:
      bleu_writer.close()
def define_transformer_flags():
  """Add flags and flag validators for running transformer_main.

  Registers the shared base/performance/benchmark/device flags, the
  transformer-specific flags (--param_set, --static_batch, --train_steps,
  --steps_between_evals, --bleu_source, --bleu_ref, --vocab_file), overrides
  some shared defaults, and installs validators for flag combinations.
  """
  # Add common flags (data_dir, model_dir, train_epochs, etc.).
  flags_core.define_base()
  flags_core.define_performance(
      num_parallel_calls=True,
      inter_op=False,
      intra_op=False,
      synthetic_data=True,
      max_train_steps=False,
      dtype=False,
      all_reduce_alg=True
  )
  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  # Set flags from the flags_core module as "key flags" so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name="param_set", short_name="mp", default="big",
      # Pass a real list rather than a dict view.
      enum_values=list(PARAMS_MAP),
      help=flags_core.help_wrap(
          "Parameter set to use when creating and training the model. The "
          "parameters define the input shape (batch size and max length), "
          "model configuration (size of embedding, # of hidden layers, etc.), "
          "and various other settings. The big parameter set increases the "
          "default batch size, embedding/hidden size, and filter size. For a "
          "complete list of parameters, please see model/model_params.py."))

  flags.DEFINE_bool(
      name="static_batch", default=False,
      help=flags_core.help_wrap(
          "Whether the batches in the dataset should have static shapes. In "
          "general, this setting should be False. Dynamic shapes allow the "
          "inputs to be grouped so that the number of padding tokens is "
          "minimized, and helps model training. In cases where the input shape "
          "must be static (e.g. running on TPU), this setting will be ignored "
          "and static batching will always be used."))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name="train_steps", short_name="ts", default=None,
      help=flags_core.help_wrap("The number of steps used to train."))
  flags.DEFINE_integer(
      name="steps_between_evals", short_name="sbe", default=1000,
      help=flags_core.help_wrap(
          "The Number of training steps to run between evaluations. This is "
          "used if --train_steps is defined."))

  # BLEU score computation
  flags.DEFINE_string(
      name="bleu_source", short_name="bls", default=None,
      help=flags_core.help_wrap(
          "Path to source file containing text translate when calculating the "
          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
          "Use the flag --stop_threshold to stop the script based on the "
          "uncased BLEU score."))
  flags.DEFINE_string(
      name="bleu_ref", short_name="blr", default=None,
      # This help text used to be a verbatim copy of the --bleu_source text;
      # it now describes the reference file.
      help=flags_core.help_wrap(
          "Path to file containing the reference translations used when "
          "calculating the official BLEU score. Both --bleu_source and "
          "--bleu_ref must be set. Use the flag --stop_threshold to stop the "
          "script based on the uncased BLEU score."))
  flags.DEFINE_string(
      name="vocab_file", short_name="vf", default=None,
      help=flags_core.help_wrap(
          "Path to subtoken vocabulary file. If data_download.py was used to "
          "download and encode the training data, look in the data_dir to find "
          "the vocab file."))

  flags_core.set_defaults(data_dir="/tmp/translate_ende",
                          model_dir="/tmp/transformer_model",
                          batch_size=None,
                          train_epochs=None)

  # The validators below are registered by their decorators; the functions
  # themselves are never called by name.
  @flags.multi_flags_validator(
      ["train_epochs", "train_steps"],
      message="Both --train_steps and --train_epochs were set. Only one may be "
              "defined.")
  def _check_train_limits(flag_dict):
    # Valid when at most one of the two limits is given.
    return flag_dict["train_epochs"] is None or flag_dict["train_steps"] is None

  @flags.multi_flags_validator(
      ["bleu_source", "bleu_ref"],
      message="Both or neither --bleu_source and --bleu_ref must be defined.")
  def _check_bleu_files(flags_dict):
    # Valid when the two flags are both set or both unset.
    return (flags_dict["bleu_source"] is None) == (
        flags_dict["bleu_ref"] is None)

  @flags.multi_flags_validator(
      ["bleu_source", "bleu_ref", "vocab_file"],
      message="--vocab_file must be defined if --bleu_source and --bleu_ref "
              "are defined.")
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict["bleu_source"] and flags_dict["bleu_ref"]:
      return flags_dict["vocab_file"] is not None
    return True

  @flags.multi_flags_validator(
      ["export_dir", "vocab_file"],
      message="--vocab_file must be defined if --export_dir is set.")
  def _check_export_vocab_file(flags_dict):
    if flags_dict["export_dir"]:
      return flags_dict["vocab_file"] is not None
    return True

  flags_core.require_cloud_storage(["data_dir", "model_dir", "export_dir"])
def construct_estimator(flags_obj, params, schedule_manager):
  """Build the estimator used for training and evaluation.

  A plain tf.estimator.Estimator is returned unless params["use_tpu"] is
  set, in which case a TPUEstimator is built instead.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
  if params["use_tpu"]:
    cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu=flags_obj.tpu,
        zone=flags_obj.tpu_zone,
        project=flags_obj.tpu_gcp_project
    )
    tpu_run_config = tf.contrib.tpu.RunConfig(
        cluster=cluster_resolver,
        model_dir=flags_obj.model_dir,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=schedule_manager.single_iteration_train_steps,
            num_shards=flags_obj.num_tpu_shards))

    # TPUEstimator needs to populate batch_size itself due to sharding, so
    # that key is left out of the params it receives.
    tpu_params = dict(params)
    tpu_params.pop("batch_size", None)

    return tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
        train_batch_size=schedule_manager.batch_size,
        eval_batch_size=schedule_manager.batch_size,
        params=tpu_params,
        config=tpu_run_config)

  strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)
  run_config = tf.estimator.RunConfig(train_distribute=strategy)
  return tf.estimator.Estimator(
      model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
      config=run_config)
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Builds the run parameters from the selected parameter set and the flags,
  creates the training schedule, hooks and benchmark logger, runs the
  train/eval loop and, when --export_dir is set and no TPU is used, exports
  a SavedModel with the vocab file as an extra asset.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  import copy  # Local import: only needed to copy the parameter set below.

  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  # Work on a shallow copy so the run-specific values written below do not
  # leak into the shared module-level parameter sets. copy.copy keeps the
  # mapping's own type.
  params = copy.copy(params)

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  # Use the flags object this function was given rather than the global
  # flags.FLAGS, consistent with every other flag read in this function.
  model_helpers.apply_clean(flags_obj)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
def main(_):
  """Run the transformer inside the benchmark logging context."""
  parsed_flags = flags.FLAGS
  with logger.benchmark_context(parsed_flags):
    run_transformer(parsed_flags)
# Script entry point: enable INFO logging, register the flags, then let absl
# parse the command line and invoke main().
if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_transformer_flags()
  absl_app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Translate text or files using trained transformer model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# pylint: disable=g-bad-import-order
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
# Decoding settings used by this script.
# Number of encoded lines grouped into one batch when translating; also stored
# in params["batch_size"] by main().
_DECODE_BATCH_SIZE = 32
# Stored in params["extra_decode_length"] by main().
# NOTE(review): presumably the number of extra tokens the decoder may emit
# beyond the input length -- confirm against the model code.
_EXTRA_DECODE_LENGTH = 100
# Stored in params["beam_size"] by main().
_BEAM_SIZE = 4
# Stored in params["alpha"] by main().
# NOTE(review): presumably the beam-search length-normalization strength --
# confirm against the model code.
_ALPHA = 0.6
def _get_sorted_inputs(filename):
  """Read the lines of a file and order them from most to fewest words.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    A tuple `(sorted_inputs, sorted_keys)`. `sorted_inputs` is the list of
    stripped lines ordered by decreasing whitespace-separated word count.
    `sorted_keys` is a list in which `sorted_keys[original_index]` gives the
    position of that line inside `sorted_inputs`.
  """
  with tf.gfile.Open(filename) as f:
    lines = [raw_line.strip() for raw_line in f.read().split("\n")]
  # If the final record is empty after stripping (e.g. the file ends with a
  # newline), it is not a real input line.
  if not lines[-1]:
    lines.pop()

  # Sort the original indices by word count. The sort is stable, so lines with
  # the same word count keep their original relative order.
  order = sorted(
      range(len(lines)),
      key=lambda original_index: len(lines[original_index].split()),
      reverse=True)

  sorted_inputs = [lines[original_index] for original_index in order]
  sorted_keys = [0] * len(order)
  for sorted_index, original_index in enumerate(order):
    sorted_keys[original_index] = sorted_index
  return sorted_inputs, sorted_keys
def _encode_and_add_eos(line, subtokenizer):
  """Return the subtoken ids of `line` followed by the EOS id.

  Args:
    line: String to encode.
    subtokenizer: Object whose `encode` method turns a string into a list of
      ids.

  Returns:
    List of ids ending with `tokenizer.EOS_ID`.
  """
  token_ids = subtokenizer.encode(line)
  return token_ids + [tokenizer.EOS_ID]
def _trim_and_decode(ids, subtokenizer):
  """Decode `ids` into a string, dropping everything from the first EOS on.

  Args:
    ids: Sequence of token ids produced by the model.
    subtokenizer: Object whose `decode` method turns ids into a string.

  Returns:
    The decoded string. When no EOS id is present the whole sequence is
    decoded.
  """
  id_list = list(ids)
  try:
    # `index` raises ValueError when the sequence holds no EOS id.
    return subtokenizer.decode(ids[:id_list.index(tokenizer.EOS_ID)])
  except ValueError:  # No EOS found in sequence
    return subtokenizer.decode(ids)
def translate_file(
    estimator, subtokenizer, input_file, output_file=None,
    print_all_translations=True):
  """Translate lines in file, and save to output file if specified.

  Args:
    estimator: tf.Estimator used to generate the translations.
    subtokenizer: Subtokenizer object for encoding and decoding source and
      translated lines.
    input_file: file containing lines to translate
    output_file: file that stores the generated translations.
    print_all_translations: If true, all translations are logged at INFO level.

  Raises:
    ValueError: if output file is invalid (it is an existing directory).
  """
  # Fail fast: reject an unusable output path before spending time on
  # inference, instead of discovering it after every line has been decoded.
  if output_file is not None and tf.gfile.IsDirectory(output_file):
    raise ValueError("File output is a directory, will not save outputs to "
                     "file.")

  batch_size = _DECODE_BATCH_SIZE

  # Read and sort inputs by length. Keep the list that maps each original
  # index to its index in the sorted list, so that translations can be written
  # in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i, line in enumerate(sorted_inputs):
      if i % batch_size == 0:
        # Log progress once at the start of every batch.
        batch_num = (i // batch_size) + 1
        tf.logging.info("Decoding batch %d out of %d." %
                        (batch_num, num_decode_batches))
      yield _encode_and_add_eos(line, subtokenizer)

  def input_fn():
    """Created batched dataset of encoded inputs."""
    ds = tf.data.Dataset.from_generator(
        input_generator, tf.int64, tf.TensorShape([None]))
    # Pad every sequence in a batch to the length of the longest one.
    ds = ds.padded_batch(batch_size, [None])
    return ds

  # Predictions arrive in the same (sorted) order as the generated inputs.
  translations = []
  for i, prediction in enumerate(estimator.predict(input_fn)):
    translation = _trim_and_decode(prediction["outputs"], subtokenizer)
    translations.append(translation)

    if print_all_translations:
      tf.logging.info("Translating:\n\tInput: %s\n\tOutput: %s" %
                      (sorted_inputs[i], translation))

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    tf.logging.info("Writing to file %s" % output_file)
    with tf.gfile.Open(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])
def translate_text(estimator, subtokenizer, txt):
  """Translate one string and log the result.

  Args:
    estimator: Estimator whose `predict` yields dicts with an "outputs" entry.
    subtokenizer: Subtokenizer used to encode `txt` and decode the prediction.
    txt: String to translate.
  """
  encoded_txt = _encode_and_add_eos(txt, subtokenizer)

  def input_fn():
    """Return a dataset holding the single encoded input as one batch."""
    return tf.data.Dataset.from_tensors(encoded_txt).batch(_DECODE_BATCH_SIZE)

  # Only one input was supplied, so only the first prediction is consumed.
  first_prediction = next(estimator.predict(input_fn))
  translation = _trim_and_decode(first_prediction["outputs"], subtokenizer)
  tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, translation))
def main(unused_argv):
  """Translate the text and/or file named by the command-line flags.

  Args:
    unused_argv: Positional arguments supplied by the caller; ignored.

  Raises:
    ValueError: if the file given by --file does not exist.
  """
  # Imported inside the function rather than at module level.
  # NOTE(review): presumably to avoid a circular import -- confirm.
  from official.transformer import transformer_main

  # Read flags through absl's global registry. The module-level `FLAGS` alias
  # is only bound when this file is executed as a script, so relying on it
  # makes main() raise NameError when called from an importing module.
  flags_obj = flags.FLAGS

  tf.logging.set_verbosity(tf.logging.INFO)

  if flags_obj.text is None and flags_obj.file is None:
    tf.logging.warn("Nothing to translate. Make sure to call this script using "
                    "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

  # Set up estimator and params
  # NOTE(review): the assignments below modify the object stored in
  # PARAMS_MAP in place -- confirm that sharing it is intended.
  params = transformer_main.PARAMS_MAP[flags_obj.param_set]
  params["beam_size"] = _BEAM_SIZE
  params["alpha"] = _ALPHA
  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
  params["batch_size"] = _DECODE_BATCH_SIZE
  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn, model_dir=flags_obj.model_dir,
      params=params)

  if flags_obj.text is not None:
    tf.logging.info("Translating text: %s" % flags_obj.text)
    translate_text(estimator, subtokenizer, flags_obj.text)

  if flags_obj.file is not None:
    input_file = os.path.abspath(flags_obj.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(flags_obj.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if flags_obj.file_out is not None:
      output_file = os.path.abspath(flags_obj.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file)
def define_translate_flags():
  """Register the command-line flags read by the translation script."""
  wrap = flags_core.help_wrap

  # Model flags
  flags.DEFINE_string(
      name="model_dir",
      short_name="md",
      default="/tmp/transformer_model",
      help=wrap("Directory containing Transformer model checkpoints."))

  param_set_help = (
      "Parameter set to use when creating and training the model. The "
      "parameters define the input shape (batch size and max length), "
      "model configuration (size of embedding, # of hidden layers, etc.), "
      "and various other settings. The big parameter set increases the "
      "default batch size, embedding/hidden size, and filter size. For a "
      "complete list of parameters, please see model/model_params.py.")
  flags.DEFINE_enum(
      name="param_set",
      short_name="mp",
      default="big",
      enum_values=["base", "big"],
      help=wrap(param_set_help))

  vocab_file_help = (
      "Path to subtoken vocabulary file. If data_download.py was used to "
      "download and encode the training data, look in the data_dir to find "
      "the vocab file.")
  flags.DEFINE_string(
      name="vocab_file",
      short_name="vf",
      default=None,
      help=wrap(vocab_file_help))
  # The vocabulary has no usable default, so the flag must be supplied.
  flags.mark_flag_as_required("vocab_file")

  # Input and output flags
  flags.DEFINE_string(
      name="text",
      default=None,
      help=wrap("Text to translate. Output will be printed to console."))

  file_help = (
      "File containing text to translate. Translation will be printed to "
      "console and, if --file_out is provided, saved to an output file.")
  flags.DEFINE_string(
      name="file",
      default=None,
      help=wrap(file_help))

  flags.DEFINE_string(
      name="file_out",
      default=None,
      help=wrap("If --file flag is specified, save translation to this file."))
if __name__ == "__main__":
  # Script entry point: expose absl's global flag registry under the
  # module-level name FLAGS, register this script's flags, then let absl
  # call main().
  FLAGS = flags.FLAGS
  define_translate_flags()
  absl_app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Input pipeline for the transformer model to read, filter, and batch examples.
Two things to note in the pipeline:
1. Batching scheme
The examples encoded in the TFRecord files contain data in the format:
{"inputs": [variable length array of integers],
"targets": [variable length array of integers]}
Where integers in the arrays refer to tokens in the English and German vocab
file (named `vocab.ende.32768`).
Prior to batching, elements in the dataset are grouped by length (max between
"inputs" and "targets" length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{"inputs": [group_batch_size, padded_input_length],
"targets": [group_batch_size, padded_target_length]}
Lengths are padded to the longest "inputs" or "targets" sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
2. Shuffling
While training, the dataset is shuffled in two places in the code. The first
is the list of training files. Second, while reading records using
`parallel_interleave`, the `sloppy` argument is used to generate randomness
in the order of the examples.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import tensorflow as tf
from official.utils.misc import model_helpers
# Buffer size passed to tf.data.TFRecordDataset when reading records from a
# TFRecord file. Each training file is 7.2 MB, so 8 MB allows an entire file to
# be kept in memory.
_READ_RECORD_BUFFER = 8 * 1000 * 1000
# Example grouping constants. Defines length boundaries for each group; they
# are the default arguments of _create_min_max_boundaries(). _MIN_BOUNDARY is
# the first boundary, and each following boundary is the previous one
# multiplied by _BOUNDARY_SCALE (truncated to an int, and always at least 1
# larger than the previous boundary).
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY = 8
_BOUNDARY_SCALE = 1.1
def _load_records(filename):
  """Open `filename` as a TFRecord dataset of serialized tf.Examples."""
  records = tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
  return records
def _parse_example(serialized_example):
  """Parse a serialized tf.Example into dense inputs and targets tensors.

  Args:
    serialized_example: Serialized tf.Example holding variable-length int64
      features named "inputs" and "targets".

  Returns:
    Tuple `(inputs, targets)` of dense tensors.
  """
  feature_spec = {
      name: tf.VarLenFeature(tf.int64) for name in ("inputs", "targets")
  }
  features = tf.parse_single_example(serialized_example, feature_spec)
  # Variable-length features are parsed as sparse tensors; densify them.
  return (tf.sparse_tensor_to_dense(features["inputs"]),
          tf.sparse_tensor_to_dense(features["targets"]))
def _filter_max_length(example, max_length=256):
  """Tell whether both sequences of an example have at most max_length items.

  Args:
    example: Sequence whose first two items are the inputs and targets
      tensors.
    max_length: Largest allowed number of elements in either tensor.

  Returns:
    Boolean tensor that is true when neither tensor exceeds `max_length`.
  """
  inputs_fit = tf.size(example[0]) <= max_length
  targets_fit = tf.size(example[1]) <= max_length
  return tf.logical_and(inputs_fit, targets_fit)
def _get_example_length(example):
  """Return the larger of the inputs length and the targets length.

  Args:
    example: Sequence whose first two items are the inputs and targets
      tensors; the length is taken along their first dimension.
  """
  inputs_length = tf.shape(example[0])[0]
  targets_length = tf.shape(example[1])[0]
  return tf.maximum(inputs_length, targets_length)
def _create_min_max_boundaries(
    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
  """Create the lower and upper length bounds of every bucket.

  Bucket `i` holds the examples whose length satisfies
  `buckets_min[i] <= length < buckets_max[i]`.

  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
  returned values will be:
    buckets_min = [0, 4, 8, 16]
    buckets_max = [4, 8, 16, 25]

  Args:
    max_length: The maximum length of example in dataset.
    min_boundary: Minimum length in boundary.
    boundary_scale: Amount to scale consecutive boundaries in the list.

  Returns:
    Tuple `(buckets_min, buckets_max)` of equally long lists.
  """
  boundaries = []
  boundary = min_boundary
  while boundary < max_length:
    boundaries.append(boundary)
    # Scale the boundary, but always advance by at least 1 so the boundaries
    # are strictly increasing and the loop terminates.
    boundary = max(boundary + 1, int(boundary * boundary_scale))

  # The first bucket starts at 0; the last one ends just above max_length so
  # that examples of exactly max_length fall inside it.
  buckets_min = [0] + boundaries
  buckets_max = boundaries + [max_length + 1]
  return buckets_min, buckets_max
def _batch_examples(dataset, batch_size, max_length):
  """Group examples by similar lengths, and return batched dataset.

  Each batch of similar-length examples are padded to the same length, and may
  have different number of elements in each batch, such that:
    group_batch_size * padded_length <= batch_size.

  This decreases the number of padding tokens per batch, which improves the
  training speed.

  Args:
    dataset: Dataset of unbatched examples.
    batch_size: Max number of tokens per batch of examples.
    max_length: Max number of tokens in an example input or target sequence.

  Returns:
    Dataset of batched examples with similar lengths.
  """
  # Lower and upper length bound of every bucket. An example belongs to the
  # bucket `bucket_id` for which
  #   buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
  # Note that using both min and max lists improves the performance.
  buckets_min, buckets_max = _create_min_max_boundaries(max_length)

  # Number of examples per batch for each bucket, chosen so that
  #   per_bucket_sizes[bucket_id] * buckets_max[bucket_id] <= batch_size
  # Stored as a tensor because it is indexed with a tensor bucket id.
  per_bucket_sizes = tf.constant(
      [batch_size // upper_bound for upper_bound in buckets_max],
      dtype=tf.int64)

  def example_to_bucket_id(example_input, example_target):
    """Return int64 bucket id for this example, calculated based on length."""
    seq_length = _get_example_length((example_input, example_target))

    # TODO: investigate whether removing code branching improves performance.
    at_or_above_min = tf.less_equal(buckets_min, seq_length)
    below_max = tf.less(seq_length, buckets_max)
    matching_buckets = tf.where(tf.logical_and(at_or_above_min, below_max))
    return tf.reduce_min(matching_buckets)

  def window_size_fn(bucket_id):
    """Return number of examples to be grouped when given a bucket id."""
    return per_bucket_sizes[bucket_id]

  def batching_fn(bucket_id, grouped_dataset):
    """Batch and add padding to a dataset of elements with similar lengths."""
    # Pad so that all input sequences in the batch share one length and all
    # target sequences share one length. The two lengths can differ.
    return grouped_dataset.padded_batch(
        window_size_fn(bucket_id), ([None], [None]))

  group_by_length = tf.contrib.data.group_by_window(
      key_func=example_to_bucket_id,
      reduce_func=batching_fn,
      window_size=None,
      window_size_func=window_size_fn)
  return dataset.apply(group_by_length)
def _read_and_batch_from_files(
    file_pattern, batch_size, max_length, num_parallel_calls, shuffle, repeat,
    static_batch=False):
  """Create a batched dataset of (inputs, targets) pairs from TFRecord files.

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per batch of examples
    max_length: Maximum number of tokens per example
    num_parallel_calls: Number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.
    static_batch: Whether the batches in the dataset should have static shapes.
      If True, the input is batched so that every batch has the
      shape [batch_size // max_length, max_length]. If False, the input is
      grouped by length, and batched so that batches may have different
      shapes [N, M], where:
        N * M <= batch_size
        M <= max_length
      In general, this setting should be False. Dynamic shapes allow the inputs
      to be grouped so that the number of padding tokens is minimized, and helps
      model training. In cases where the input shape must be static
      (e.g. running on TPU), this setting should be set to True.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
  files = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)

  # Read several files at once and interleave their records. With `sloppy`
  # enabled (i.e. when shuffling) the order of the examples is
  # non-deterministic.
  interleave_files = tf.contrib.data.parallel_interleave(
      _load_records, sloppy=shuffle, cycle_length=num_parallel_calls)
  examples = files.apply(interleave_files)

  # Parse each serialized tf.Example into an (inputs, targets) pair.
  # TODO: Look into prefetch_input_elements for performance optimization.
  examples = examples.map(_parse_example,
                          num_parallel_calls=num_parallel_calls)

  def within_max_length(inputs, targets):
    """Keep only examples whose inputs and targets fit in max_length."""
    return _filter_max_length((inputs, targets), max_length)

  examples = examples.filter(within_max_length)

  if not static_batch:
    # Group and batch such that each batch has examples of similar length.
    batched = _batch_examples(examples, batch_size, max_length)
  else:
    # Fixed-shape batches: every sequence is padded to max_length, and an
    # incomplete final batch is dropped.
    static_batcher = tf.contrib.data.padded_batch_and_drop_remainder(
        batch_size // max_length, ([max_length], [max_length]))
    batched = examples.apply(static_batcher)

  # Prefetch the next element to improve speed of input pipeline.
  return batched.repeat(repeat).prefetch(
      buffer_size=tf.contrib.data.AUTOTUNE)
def _generate_synthetic_data(params):
  """Create synthetic data whose shape is derived from params["batch_size"].

  Inputs and labels are both square: each side is the integer part of the
  square root of the batch size, and every value is 1.
  """
  side = int(math.sqrt(params["batch_size"]))
  return model_helpers.generate_synthetic_data(
      input_shape=tf.TensorShape([side, side]),
      input_value=1,
      input_dtype=tf.int32,
      label_shape=tf.TensorShape([side, side]),
      label_value=1,
      label_dtype=tf.int32,
  )
def train_input_fn(params):
  """Return the batched training dataset described by `params`.

  Reads the files matching "*train*" in params["data_dir"], shuffled and
  repeated params["repeat_dataset"] times, unless
  params["use_synthetic_data"] is set, in which case synthetic data is
  returned instead.
  """
  train_files = os.path.join(params["data_dir"] or "", "*train*")
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  return _read_and_batch_from_files(
      file_pattern=train_files,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      num_parallel_calls=params["num_parallel_calls"],
      shuffle=True,
      repeat=params["repeat_dataset"],
      static_batch=params["static_batch"])
def eval_input_fn(params):
  """Return the batched evaluation dataset described by `params`.

  Reads the files matching "*dev*" in params["data_dir"] once, without
  shuffling, unless params["use_synthetic_data"] is set, in which case
  synthetic data is returned instead.
  """
  dev_files = os.path.join(params["data_dir"] or "", "*dev*")
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  return _read_and_batch_from_files(
      file_pattern=dev_files,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      num_parallel_calls=params["num_parallel_calls"],
      shuffle=False,
      repeat=1,
      static_batch=params["static_batch"])
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
def _pad_tensors_to_same_length(x, y):
  """Zero-pad x and y along the second dimension to a common length.

  Args:
    x: Rank-3 tensor (three padding pairs are applied to it).
    y: Rank-2 tensor (two padding pairs are applied to it).

  Returns:
    Tuple `(x, y)` whose second dimensions both equal the larger of the two
    original second dimensions.
  """
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]
    target_length = tf.maximum(x_length, y_length)

    # Only the end of the second dimension is padded.
    padded_x = tf.pad(x, [[0, 0], [0, target_length - x_length], [0, 0]])
    padded_y = tf.pad(y, [[0, 0], [0, target_length - y_length]])
    return padded_x, padded_y
def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate label-smoothed cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
      # The true class receives `confidence`; the remaining probability mass
      # is spread evenly over the other vocab_size - 1 classes.
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss. The 1e-20 keeps the log finite
      # when low_confidence is 0.
      on_term = confidence * tf.log(confidence)
      off_term = (tf.to_float(vocab_size - 1) * low_confidence *
                  tf.log(low_confidence + 1e-20))
      normalizing_constant = -(on_term + off_term)
      xentropy -= normalizing_constant

    # Label id 0 is treated as padding and receives zero weight.
    weights = tf.to_float(tf.not_equal(labels, 0))
    return xentropy * weights, weights
def _convert_to_eval_metric(metric_fn):
  """Wrap a per-batch metric fn so that it aggregates over all batches.

  The input metric_fn returns values for the current batch. The wrapper
  aggregates the return values collected over all of the batches evaluated.

  Args:
    metric_fn: function that returns scores and weights for the current batch's
      logits and predicted labels.

  Returns:
    function that aggregates the scores and weights from metric_fn.
  """

  def problem_metric_fn(*args):
    """Returns an aggregation of the metric_fn's returned values."""
    batch_scores, batch_weights = metric_fn(*args)
    # tf.metrics.mean accumulates the weighted mean across batches.
    return tf.metrics.mean(batch_scores, batch_weights)

  return problem_metric_fn
def get_eval_metrics(logits, labels, params):
  """Return dictionary of model evaluation metrics.

  Args:
    logits: Model output logits passed to every metric function.
    labels: Target labels passed to every metric function.
    params: Dict read for "vocab_size" and "use_tpu".

  Returns:
    Dict mapping "metrics/<name>" to the aggregated metric for that name.
  """

  def evaluate(metric_fn, *extra_args):
    """Build the aggregated eval metric for `metric_fn`."""
    return _convert_to_eval_metric(metric_fn)(logits, labels, *extra_args)

  metrics = {
      "accuracy": evaluate(padded_accuracy),
      "accuracy_top5": evaluate(padded_accuracy_top5),
      "accuracy_per_sequence": evaluate(padded_sequence_accuracy),
      "neg_log_perplexity": evaluate(
          padded_neg_log_perplexity, params["vocab_size"]),
  }

  if not params["use_tpu"]:
    # TPU does not support tf.py_func
    metrics["approx_bleu_score"] = evaluate(bleu_score)
    metrics["rouge_2_fscore"] = evaluate(rouge_2_fscore)
    metrics["rouge_L_fscore"] = evaluate(rouge_l_fscore)

  # Prefix each of the metric names with "metrics/". This allows the metric
  # graphs to display under the "metrics" category in TensorBoard.
  return {"metrics/%s" % name: metric
          for name, metric in six.iteritems(metrics)}
def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s.

  Returns:
    Tuple `(correct, weights)`: `correct` is 1.0 where the argmax of the
    logits equals the label, and `weights` is 1.0 where the label is non-zero.
  """
  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 is treated as padding and receives zero weight.
    weights = tf.to_float(tf.not_equal(labels, 0))
    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
    label_ids = tf.to_int32(labels)
    correct = tf.to_float(tf.equal(predicted_ids, label_ids))
    return correct, weights
def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s.

  Returns:
    Tuple `(hits, weights)`: `hits` counts, per position, how many of the
    top-k predicted ids equal the label, and `weights` is 1.0 where the label
    is non-zero.
  """
  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 is treated as padding and receives zero weight.
    weights = tf.to_float(tf.not_equal(labels, 0))

    # k cannot exceed the size of the last logits dimension.
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    # top_k returns (values, indices); only the indices are needed.
    top_ids = tf.to_int32(tf.nn.top_k(logits, k=effective_k)[1])

    label_ids = tf.expand_dims(tf.to_int32(labels), axis=-1)
    # Adding zeros broadcasts the labels to the shape of the top-k ids.
    label_ids = label_ids + tf.zeros_like(top_ids)

    hits = tf.to_float(tf.equal(top_ids, label_ids))
    return tf.reduce_sum(hits, axis=-1), weights
def padded_accuracy_top5(logits, labels):
  """Return the result of padded_accuracy_topk with k fixed to 5."""
  return padded_accuracy_topk(logits, labels, k=5)
def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0).

  Returns:
    Tuple `(correct_seq, weight)`: `correct_seq` is 1.0 for each sequence in
    which every position with a non-zero label was predicted correctly and 0.0
    otherwise; `weight` is the constant 1.0.
  """
  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 is treated as padding and receives zero weight.
    weights = tf.to_float(tf.not_equal(labels, 0))
    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
    label_ids = tf.to_int32(labels)

    # Mismatches at padding positions are zeroed out by the weights.
    mistakes = tf.to_float(tf.not_equal(predicted_ids, label_ids)) * weights
    # Reduce over every dimension except the batch dimension.
    non_batch_axes = list(range(1, len(predicted_ids.get_shape())))
    mistakes_per_seq = tf.reduce_sum(mistakes, axis=non_batch_axes)
    correct_seq = 1.0 - tf.minimum(1.0, mistakes_per_seq)
    return correct_seq, tf.constant(1.0)
def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing.

  Returns:
    Tuple `(neg_xentropy, weights)`: the negated unsmoothed padded cross
    entropy and its weights.
  """
  xentropy, weights = padded_cross_entropy_loss(
      logits, labels, smoothing=0, vocab_size=vocab_size)
  return -xentropy, weights
def bleu_score(logits, labels):
  """Approximate BLEU score computation between labels and predictions.

  An approximate BLEU scoring method since we do not glue word pieces or
  decode the ids and tokenize the output. By default, we use ngram order of 4
  and use brevity penalty. Also, this does not have beam search.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch-size, length_labels]

  Returns:
    Tuple `(bleu, weight)`: `bleu` is the float32 approximate BLEU score and
    `weight` is the constant 1.0.
  """
  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
  # TODO: Look into removing use of py_func
  bleu = tf.py_func(
      func=compute_bleu, inp=(labels, predicted_ids), Tout=tf.float32)
  return bleu, tf.constant(1.0)
def _get_ngrams_with_counter(segment, max_order):
"""Extracts all n-grams up to a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
methods.
Returns:
The Counter containing all n-grams upto max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in xrange(1, max_order + 1):
for i in xrange(0, len(segment) - order + 1):
ngram = tuple(segment[i:i + order])
ngram_counts[ngram] += 1
return ngram_counts
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
use_bp=True):
"""Computes BLEU score of translated segments against one or more references.
Args:
reference_corpus: list of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
use_bp: boolean, whether to apply brevity penalty.
Returns:
BLEU score.
"""
reference_length = 0
translation_length = 0
bp = 1.0
geo_mean = 0
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
precisions = []
for (references, translations) in zip(reference_corpus, translation_corpus):
reference_length += len(references)
translation_length += len(translations)
ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
overlap = dict((ngram,
min(count, translation_ngram_counts[ngram]))
for ngram, count in ref_ngram_counts.items())
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for ngram in translation_ngram_counts:
possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
ngram]
precisions = [0] * max_order
smooth = 1.0
for i in xrange(0, max_order):
if possible_matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
if matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
i]
else:
smooth *= 2
precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
else:
precisions[i] = 0.0
if max(precisions) > 0:
p_log_sum = sum(math.log(p) for p in precisions if p)
geo_mean = math.exp(p_log_sum / max_order)
if use_bp:
ratio = translation_length / reference_length
bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
bleu = geo_mean * bp
return np.float32(bleu)
def rouge_2_fscore(logits, labels):
  """ROUGE-2 F1 score computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    logits: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    Tuple `(rouge2_fscore, weight)`: the float32 approximate ROUGE-2 F1 score
    and the constant weight 1.0.
  """
  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
  # TODO: Look into removing use of py_func
  rouge_2_f_score = tf.py_func(
      func=rouge_n, inp=(predicted_ids, labels), Tout=tf.float32)
  return rouge_2_f_score, tf.constant(1.0)
def _get_ngrams(n, text):
"""Calculates n-grams.
Args:
n: which n-grams to calculate
text: An array of tokens
Returns:
A set of n-grams
"""
ngram_set = set()
text_length = len(text)
max_index_ngram_start = text_length - n
for i in range(max_index_ngram_start + 1):
ngram_set.add(tuple(text[i:i + n]))
return ngram_set
def rouge_n(eval_sentences, ref_sentences, n=2):
  """Computes ROUGE-N f1 score of two text collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Args:
    eval_sentences: Predicted sentences.
    ref_sentences: Sentences from the reference set
    n: Size of ngram. Defaults to 2.

  Returns:
    f1 score for ROUGE-N, averaged over the sentence pairs, as float32.
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    eval_ngrams = _get_ngrams(n, eval_sentence)
    ref_ngrams = _get_ngrams(n, ref_sentence)

    # Number of distinct n-grams shared by the prediction and the reference.
    overlapping_count = len(eval_ngrams & ref_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good
    # enough: an empty n-gram set gives a precision/recall of 0.
    precision = (
        float(overlapping_count) / len(eval_ngrams) if eval_ngrams else 0.0)
    recall = (
        float(overlapping_count) / len(ref_ngrams) if ref_ngrams else 0.0)

    # The 1e-8 avoids a division by zero when precision and recall are both 0.
    f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8)))

  return np.mean(f1_scores, dtype=np.float32)
def rouge_l_fscore(predictions, labels):
  """ROUGE scores computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    predictions: tensor, model predictions; the argmax over its last
      dimension is taken as the predicted ids.
    labels: tensor, gold output.

  Returns:
    Tuple `(rouge_l_fscore, weight)`: the float32 approximate ROUGE-L F1 score
    and the constant weight 1.0.
  """
  predicted_ids = tf.to_int32(tf.argmax(predictions, axis=-1))
  rouge_l_f_score = tf.py_func(
      func=rouge_l_sentence_level, inp=(predicted_ids, labels),
      Tout=tf.float32)
  return rouge_l_f_score, tf.constant(1.0)
def rouge_l_sentence_level(eval_sentences, ref_sentences):
  """Computes ROUGE-L (sentence level) of two collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Calculated according to:
  R_lcs = LCS(X,Y)/m
  P_lcs = LCS(X,Y)/n
  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)

  where:
  X = reference summary
  Y = Candidate summary
  m = length of reference summary
  n = length of candidate summary

  Args:
    eval_sentences: The sentences that have been picked by the summarizer
    ref_sentences: The sentences from the reference set

  Returns:
    A float32: F_lcs averaged over the sentence pairs.
  """
  f1_scores = [
      _f_lcs(_len_lcs(eval_sentence, ref_sentence),
             float(len(ref_sentence)),
             float(len(eval_sentence)))
      for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences)
  ]
  return np.mean(f1_scores, dtype=np.float32)
def _len_lcs(x, y):
  """Returns the length of the Longest Common Subsequence between two seqs.

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: sequence of words
    y: sequence of words

  Returns
    integer: Length of LCS between x and y
  """
  # The table entry for the full lengths of both sequences holds the answer.
  return _lcs(x, y)[len(x), len(y)]
def _lcs(x, y):
"""Computes the length of the LCS between two seqs.
The implementation below uses a DP programming algorithm and runs
in O(nm) time where n = len(x) and m = len(y).
Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
Args:
x: collection of words
y: collection of words
Returns:
Table of dictionary of coord and len lcs
"""
n, m = len(x), len(y)
table = dict()
for i in range(n + 1):
for j in range(m + 1):
if i == 0 or j == 0:
table[i, j] = 0
elif x[i - 1] == y[j - 1]:
table[i, j] = table[i - 1, j - 1] + 1
else:
table[i, j] = max(table[i - 1, j], table[i, j - 1])
return table
def _f_lcs(llcs, m, n):
"""Computes the LCS-based F-measure score.
Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
rouge-working-note-v1.3.1.pdf
Args:
llcs: Length of LCS
m: number of words in reference summary
n: number of words in candidate summary
Returns:
Float. LCS-based F-measure score
"""
r_lcs = llcs / m
p_lcs = llcs / n
beta = p_lcs / (r_lcs + 1e-12)
num = (1 + (beta ** 2)) * r_lcs * p_lcs
denom = r_lcs + ((beta ** 2) * p_lcs)
f_lcs = num / (denom + 1e-12)
return f_lcs
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract training on a step or epoch basis."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
# Short aliases for the estimator mode keys used to index NUM_EXAMPLES.
_TRAIN, _EVAL = tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL

# Number of examples available in each mode.
NUM_EXAMPLES = {
    # Examples that are too long are filtered out, so the training count is
    # less than the total number of lines in the source corpora:
    #   2399123 +  # news-commentary-v12.de-en
    #   1920209 +  # commoncrawl.de-en
    #   270769     # europarl-v7.de-en
    #   (= 4590101 lines before filtering)
    tf.estimator.ModeKeys.TRAIN: 4572160,
    tf.estimator.ModeKeys.EVAL: 3000,  # newstest2013
}
class Manager(object):
  """Container for convenience functions to abstract step or epoch basis.

  Transformer allows users to specify an epoch basis (generally recommended for
  full training) or a number of steps basis (convenient since epochs are rather
  large). TPUs furthermore require a step basis; however epochs are the norm in
  the machine learning community and it is desirable to allow users to specify
  epochs even when running with TPUs, which requires behind the scenes
  conversions.

  This container simply groups what are largely mundane checks and conversions
  rather than interspersing them throughout the run loop code.

  Attributes:
    train_eval_iterations: Number of train/eval cycles to run, i.e. the total
      steps (or epochs) floor-divided by the steps (or epochs) between evals.
    max_length: The `max_length` constructor argument.
    batch_size: The `batch_size` constructor argument. It is divided into a
      token count (examples * max_length), so it is presumably measured in
      tokens -- confirm against the caller.
    use_tpu: Whether the schedule is computed for a TPU run.
    num_tpu_shards: The `num_tpu_shards` constructor argument.
  """

  def __init__(self, train_steps, steps_between_evals, train_epochs,
               epochs_between_evals, default_train_epochs, batch_size,
               max_length, use_tpu=False, num_tpu_shards=8):
    """Derives the training schedule from step-based or epoch-based settings.

    Args:
      train_steps: Total number of training steps, or a falsy value to use an
        epoch basis.
      steps_between_evals: Training steps per train/eval cycle (step basis).
      train_epochs: Total number of training epochs (epoch basis). Falls back
        to `default_train_epochs` when falsy.
      epochs_between_evals: Training epochs per train/eval cycle (epoch basis).
      default_train_epochs: Epoch count used when neither `train_steps` nor
        `train_epochs` is given.
      batch_size: Batch size used to convert epochs into steps.
      max_length: Maximum example length used to convert epochs into steps.
      use_tpu: Whether the run targets a TPU.
      num_tpu_shards: Number of TPU shards the per-step examples are split
        across.

    Raises:
      ValueError: If both `train_steps` and `train_epochs` are specified.
    """
    if train_steps and train_epochs:
      raise ValueError(
          "Only one of train_steps and train_epochs may be defined, not both.")

    # Determine training schedule based on flags.
    if train_steps:
      self.train_eval_iterations = train_steps // steps_between_evals
      self._single_iteration_train_steps = steps_between_evals
      self._single_iteration_train_epochs = None
    else:
      train_epochs = train_epochs or default_train_epochs
      self.train_eval_iterations = train_epochs // epochs_between_evals
      self._single_iteration_train_steps = None
      self._single_iteration_train_epochs = epochs_between_evals

    self.max_length = max_length
    self.batch_size = batch_size
    self.use_tpu = use_tpu
    self.num_tpu_shards = num_tpu_shards

    if self.use_tpu:
      # batch_size // max_length must split evenly across the TPU shards.
      assert (self.batch_size // self.max_length) % self.num_tpu_shards == 0, (
          "batch_size // max_length must be divisible by num_tpu_shards")

  @property
  def single_iteration_train_steps(self):
    """Training steps per cycle, or None when relying on dataset exhaustion.

    On a step basis this is the configured step count. On an epoch basis it is
    None off TPU and the epoch count converted to steps on TPU.
    """
    if self._single_iteration_train_steps or not self.use_tpu:
      return self._single_iteration_train_steps

    return self.epochs_to_steps(
        num_epochs=self._single_iteration_train_epochs, mode=_TRAIN)

  @property
  def single_iteration_eval_steps(self):
    """Evaluation steps per cycle: None off TPU, one eval epoch on TPU."""
    if not self.use_tpu:
      return None

    return self.epochs_to_steps(num_epochs=1, mode=_EVAL)

  @property
  def train_increment_str(self):
    """Human-readable description of how much training one cycle performs."""
    if self._single_iteration_train_steps:
      return "{} steps.".format(self._single_iteration_train_steps)

    if not self.use_tpu:
      return "{} epochs.".format(self._single_iteration_train_epochs)

    return "~{} epochs. ({} steps)".format(
        self._single_iteration_train_epochs,
        self.single_iteration_train_steps)

  @property
  def repeat_dataset(self):
    """Number of times the training dataset should be repeated per cycle.

    On an epoch basis this is the number of epochs per cycle. On a step basis
    it is None unless the step count exceeds NUM_EXAMPLES[_TRAIN], in which
    case it is the ratio of the two rounded up to an integer.
    """
    # NOTE(review): this compares a step count against an example count;
    # confirm that is the intended threshold.
    if (self._single_iteration_train_epochs is None and
        self._single_iteration_train_steps > NUM_EXAMPLES[_TRAIN]):
      # math.ceil returns a float on Python 2; always hand back an integer.
      return int(math.ceil(self._single_iteration_train_steps /
                           NUM_EXAMPLES[_TRAIN]))
    return self._single_iteration_train_epochs

  def epochs_to_steps(self, num_epochs, mode):
    """Converts a number of epochs to a number of training steps.

    TPU only: This function assumes that static_batch is True.

      TPU can not tolerate an OutOfRange error from a dataset. As a result the
    number of examples to be processed must be known ahead of time. TPUs also
    do not allow partial batches, so this function rounds down.

    Args:
      num_epochs: An integer of the number of epochs to convert to steps.
      mode: The estimator ModeKey of the computation

    Returns:
      An integer of the number of equivalent steps rounded down.
    """
    assert self.use_tpu, "epochs_to_steps should only be reached when using TPU"
    total_num_tokens = NUM_EXAMPLES[mode] * self.max_length * num_epochs
    return total_num_tokens // self.batch_size
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Transformer's schedule manager."""
import tensorflow as tf
from official.transformer.utils import schedule
class ScheduleBaseTester(tf.test.TestCase):
  """Exercises schedule.Manager on step and epoch bases, with and without TPU."""

  def _manager(self, train_steps=None, steps_between_evals=None,
               train_epochs=None, epochs_between_evals=None, use_tpu=False):
    """Builds a Manager with the batch size and max length shared by all tests."""
    return schedule.Manager(
        train_steps=train_steps, steps_between_evals=steps_between_evals,
        train_epochs=train_epochs, epochs_between_evals=epochs_between_evals,
        default_train_epochs=None, batch_size=2048, max_length=256,
        use_tpu=use_tpu)

  def test_mutual_exclusivity(self):
    # Steps and epochs may not both be requested.
    with self.assertRaises(ValueError):
      self._manager(train_steps=100, steps_between_evals=100,
                    train_epochs=2, epochs_between_evals=1)

  def test_step_basis(self):
    stepwise = self._manager(train_steps=1000, steps_between_evals=100)

    self.assertEqual(stepwise.single_iteration_train_steps, 100)

    # Evaluation uses the full set
    self.assertIsNone(stepwise.single_iteration_eval_steps)
    self.assertIsNone(stepwise.repeat_dataset)

  def test_epoch_basis(self):
    epochwise = self._manager(train_epochs=10, epochs_between_evals=2)

    # For non-TPU, estimator relies on dataset exhaustion
    self.assertIsNone(epochwise.single_iteration_train_steps)
    self.assertIsNone(epochwise.single_iteration_eval_steps)
    self.assertEqual(epochwise.repeat_dataset, 2)

  def test_step_basis_tpu(self):
    stepwise = self._manager(
        train_steps=1000, steps_between_evals=100, use_tpu=True)

    self.assertEqual(stepwise.single_iteration_train_steps, 100)

    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(stepwise.single_iteration_eval_steps, 375)
    self.assertIsNone(stepwise.repeat_dataset)

  def test_epoch_basis_tpu(self):
    epochwise = self._manager(
        train_epochs=10, epochs_between_evals=2, use_tpu=True)

    # Two epochs of training examples at batch_size / max_length per step.
    examples_per_step = 2048 / 256
    train_examples = schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN]
    expected_train_steps = train_examples * 2 // examples_per_step
    self.assertEqual(
        epochwise.single_iteration_train_steps, expected_train_steps)

    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(epochwise.single_iteration_eval_steps, 375)
    self.assertEqual(epochwise.repeat_dataset, 2)
# Run the tests in this file when it is executed as a script.
if __name__ == "__main__":
  tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment