Merge branch 'qianyj_tf' into 'main'

Update TF code. See merge request dcutoolkit/deeplearing/dlexamples_new!35

Merge branch 'qianyj_tf' into 'main'
Update TF code. See merge request dcutoolkit/deeplearing/dlexamples_new!35
9dafea91 · sunxx1 · 92a2ca36 · a4146470 · 92a2ca36 · 92a2ca36
Commit 9dafea91 authored Aug 02, 2022 by sunxx1
20 changed files
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/data_download.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/data_download.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Download and preprocess WMT17 ende training and evaluation datasets."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import random
-import tarfile
-
-# pylint: disable=g-bad-import-order
-import six
-from six.moves import urllib
-from absl import app as absl_app
-from absl import flags
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.transformer.utils import tokenizer
-from official.utils.flags import core as flags_core
-
-# Data sources for training/evaluating the transformer translation model.
-# Each entry names the archive to download ("url") and the two files inside it
-# that hold the input-language ("input") and target-language ("target") text.
-# If any of the training sources are changed, then either:
-#   1) use the flag `--search` to find the best min count or
-#   2) update the _TRAIN_DATA_MIN_COUNT constant.
-# min_count is the minimum number of times a token must appear in the data
-# before it is added to the vocabulary. "Best min count" refers to the value
-# that generates a vocabulary set that is closest in size to _TARGET_VOCAB_SIZE.
-_TRAIN_DATA_SOURCES = [
-    {
-        "url": "http://data.statmt.org/wmt17/translation-task/"
-               "training-parallel-nc-v12.tgz",
-        "input": "news-commentary-v12.de-en.en",
-        "target": "news-commentary-v12.de-en.de",
-    },
-    {
-        "url": "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
-        "input": "commoncrawl.de-en.en",
-        "target": "commoncrawl.de-en.de",
-    },
-    {
-        "url": "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
-        "input": "europarl-v7.de-en.en",
-        "target": "europarl-v7.de-en.de",
-    },
-]
-# Use pre-defined minimum count to generate subtoken vocabulary.
-_TRAIN_DATA_MIN_COUNT = 6
-
-# Evaluation data uses the same {"url", "input", "target"} entry layout.
-_EVAL_DATA_SOURCES = [
-    {
-        "url": "http://data.statmt.org/wmt17/translation-task/dev.tgz",
-        "input": "newstest2013.en",
-        "target": "newstest2013.de",
-    }
-]
-
-# Vocabulary constants
-_TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
-_TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
-VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE
-
-# Strings to include in the names of the generated files.
-_PREFIX = "wmt32k"
-_TRAIN_TAG = "train"
-_EVAL_TAG = "dev"  # Following WMT and Tensor2Tensor conventions, in which the
-                   # evaluation datasets are tagged as "dev" for development.
-
-# Number of files to split train and evaluation data
-_TRAIN_SHARDS = 100
-_EVAL_SHARDS = 1
-
-
-def find_file(path, filename, max_depth=5):
-  """Return the full path of `filename` under `path`, or None if not found.
-
-  Args:
-    path: directory at which the search starts.
-    filename: base name of the file to look for.
-    max_depth: number of directory levels below `path` that may be searched;
-      0 restricts the search to `path` itself.
-
-  Returns:
-    Path of the first match in os.walk order, or None when there is no match
-    within `max_depth` levels.
-  """
-  for root, dirs, files in os.walk(path):
-    if filename in files:
-      return os.path.join(root, filename)
-
-    # Depth of `root` relative to `path`: 0 for `path` itself, 1 for its direct
-    # children, and so on. The previous slice-based count treated direct
-    # children as depth 0 and broke when `path` ended with a separator.
-    relative = os.path.relpath(root, path)
-    depth = 0 if relative == os.curdir else relative.count(os.sep) + 1
-    if depth >= max_depth:
-      del dirs[:]  # Prune in place so os.walk does not descend any further.
-  return None
-
-
-###############################################################################
-# Download and extraction functions
-###############################################################################
-def get_raw_files(raw_dir, data_source):
-  """Collect the raw input/target files for every entry of `data_source`.
-
-  Each entry is passed to download_and_extract, which downloads and extracts
-  the archive when the files are not already present.
-
-  Args:
-    raw_dir: string directory to store raw files
-    data_source: iterable of dictionaries, each with
-      {"url": url of compressed dataset containing input and target files
-       "input": file with data in input language
-       "target": file with data in target language}
-
-  Returns:
-    dictionary with
-      {"inputs": list of files containing data in input language
-       "targets": list of files containing corresponding data in target language
-      }
-  """
-  inputs = []
-  targets = []
-  for source in data_source:
-    input_path, target_path = download_and_extract(
-        raw_dir, source["url"], source["input"], source["target"])
-    inputs.append(input_path)
-    targets.append(target_path)
-  return {"inputs": inputs, "targets": targets}
-
-
-def download_report_hook(count, block_size, total_size):
-  """Report hook for download progress.
-
-  Prints the progress on a single line that is overwritten on each call.
-
-  Args:
-    count: current block number
-    block_size: block size
-    total_size: total size; when it is zero or negative (size not known) the
-      number of bytes transferred is reported instead of a percentage.
-  """
-  transferred = count * block_size
-  if total_size > 0:
-    # The last block may overshoot the real size, so cap the figure at 100.
-    percent = min(100, int(transferred * 100 / total_size))
-    progress = "\r%d%%" % percent
-  else:
-    # No usable total: avoid dividing by zero or printing a negative percent.
-    progress = "\r%d bytes" % transferred
-  print(progress + " completed", end="\r")
-
-
-def download_from_url(path, url):
-  """Fetch `url` into `path` unless a file with the same name is already there.
-
-  The data is first written to a ".incomplete" file that is renamed once the
-  transfer has finished.
-
-  Args:
-    path: string directory where file will be downloaded
-    url: string url
-
-  Returns:
-    Full path to downloaded file
-  """
-  basename = url.split("/")[-1]
-  existing = find_file(path, basename, max_depth=0)
-  if existing is not None:
-    tf.logging.info("Already downloaded: %s (at %s)." % (url, existing))
-    return existing
-
-  destination = os.path.join(path, basename)
-  tf.logging.info("Downloading from %s to %s." % (url, destination))
-  partial_path, _ = urllib.request.urlretrieve(
-      url, destination + ".incomplete", reporthook=download_report_hook)
-  # The progress hook ends its output with a carriage return; start a new line.
-  print()
-  tf.gfile.Rename(partial_path, destination)
-  return destination
-
-
-def download_and_extract(path, url, input_filename, target_filename):
-  """Download an archive if needed, extract it, and locate two files in it.
-
-  Args:
-    path: string directory where the files will be downloaded
-    url: url containing the compressed input and target files
-    input_filename: name of file containing data in source language
-    target_filename: name of file containing data in target language
-
-  Returns:
-    Full paths to extracted input and target files.
-
-  Raises:
-    OSError: if the download/extraction fails.
-  """
-  def _locate_pair():
-    """Search `path` for both requested files; an entry is None if missing."""
-    return find_file(path, input_filename), find_file(path, target_filename)
-
-  # Nothing to do when an earlier run already extracted both files.
-  input_file, target_file = _locate_pair()
-  if input_file and target_file:
-    tf.logging.info("Already downloaded and extracted %s." % url)
-    return input_file, target_file
-
-  # Reuses a previously downloaded archive when one exists.
-  compressed_file = download_from_url(path, url)
-
-  # NOTE(review): extractall trusts the member paths stored in the archive;
-  # confirm the sources are trusted or validate members before extracting.
-  tf.logging.info("Extracting %s." % compressed_file)
-  with tarfile.open(compressed_file, "r:gz") as corpus_tar:
-    corpus_tar.extractall(path)
-
-  # Look again now that the archive contents are on disk.
-  input_file, target_file = _locate_pair()
-  if not (input_file and target_file):
-    raise OSError("Download/extraction failed for url %s to path %s" %
-                  (url, path))
-  return input_file, target_file
-
-
-def txt_line_iterator(path):
-  """Yield each line of the file at `path` with surrounding whitespace removed."""
-  with tf.gfile.Open(path) as reader:
-    for raw_line in reader:
-      stripped = raw_line.strip()
-      yield stripped
-
-
-def compile_files(raw_dir, raw_files, tag):
-  """Concatenate the raw files into one file per language.
-
-  Args:
-    raw_dir: Directory containing downloaded raw files.
-    raw_files: Dict containing filenames of input and target data.
-      {"inputs": list of files containing data in input language
-       "targets": list of files containing corresponding data in target language
-      }
-    tag: String to append to the compiled filename.
-
-  Returns:
-    Full path of compiled input and target files.
-  """
-  tf.logging.info("Compiling files with tag %s." % tag)
-  stem = os.path.join(raw_dir, "%s-%s" % (_PREFIX, tag))
-  input_compiled_file = stem + ".lang1"
-  target_compiled_file = stem + ".lang2"
-
-  with tf.gfile.Open(input_compiled_file, mode="w") as input_writer, \
-      tf.gfile.Open(target_compiled_file, mode="w") as target_writer:
-    # Sources are appended in list order; the target is looked up by position
-    # so both compiled files are built from the same sequence of sources.
-    for position, source_file in enumerate(raw_files["inputs"]):
-      reference_file = raw_files["targets"][position]
-
-      tf.logging.info(
-          "Reading files %s and %s." % (source_file, reference_file))
-      write_file(input_writer, source_file)
-      write_file(target_writer, reference_file)
-  return input_compiled_file, target_compiled_file
-
-
-def write_file(writer, filename):
-  """Copy every line of `filename` to `writer`, each followed by a newline."""
-  newline = "\n"
-  for text in txt_line_iterator(filename):
-    writer.write(text)
-    writer.write(newline)
-
-
-###############################################################################
-# Data preprocessing
-###############################################################################
-def encode_and_save_files(
-    subtokenizer, data_dir, raw_files, tag, total_shards):
-  """Save data from files as encoded Examples in TFrecord format.
-
-  Shards are first written to ".incomplete" files and renamed only after every
-  example has been written, so an interrupted run does not leave files that
-  look finished.
-
-  Args:
-    subtokenizer: Subtokenizer object that will be used to encode the strings.
-    data_dir: The directory in which to write the examples
-    raw_files: A tuple of (input, target) data files. Each line in the input and
-      the corresponding line in target file will be saved in a tf.Example.
-    tag: String that will be added onto the file names.
-    total_shards: Number of files to divide the data into.
-
-  Returns:
-    List of all files produced.
-  """
-  # Create a file for each shard.
-  filepaths = [shard_filename(data_dir, tag, n + 1, total_shards)
-               for n in range(total_shards)]
-
-  if all_exist(filepaths):
-    tf.logging.info("Files with tag %s already exist." % tag)
-    return filepaths
-
-  tf.logging.info("Saving files with tag %s." % tag)
-  input_file = raw_files[0]
-  target_file = raw_files[1]
-
-  # Write examples to each shard in round robin order.
-  tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
-  writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
-  num_examples = 0  # Stays 0 when the input files are empty.
-  shard = 0
-  try:
-    for counter, (input_line, target_line) in enumerate(zip(
-        txt_line_iterator(input_file), txt_line_iterator(target_file))):
-      if counter > 0 and counter % 100000 == 0:
-        tf.logging.info("\tSaving case %d." % counter)
-      example = dict_to_example(
-          {"inputs": subtokenizer.encode(input_line, add_eos=True),
-           "targets": subtokenizer.encode(target_line, add_eos=True)})
-      writers[shard].write(example.SerializeToString())
-      shard = (shard + 1) % total_shards
-      num_examples = counter + 1
-  finally:
-    # Close every writer even when encoding or writing fails, so that no file
-    # handles are leaked; on failure the ".incomplete" files are not renamed.
-    for writer in writers:
-      writer.close()
-
-  for tmp_name, final_name in zip(tmp_filepaths, filepaths):
-    tf.gfile.Rename(tmp_name, final_name)
-
-  tf.logging.info("Saved %d Examples", num_examples)
-  return filepaths
-
-
-def shard_filename(path, tag, shard_num, total_shards):
-  """Build the path of one shard: <path>/<prefix>-<tag>-NNNNN-of-NNNNN."""
-  basename = "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards)
-  return os.path.join(path, basename)
-
-
-def shuffle_records(fname):
-  """Rewrite the TFRecord file `fname` with its records in random order.
-
-  All records are held in memory while they are shuffled.
-  """
-  tf.logging.info("Shuffling records in file %s" % fname)
-
-  # Move the original aside so the shuffled output can reuse its name.
-  unshuffled_fname = fname + ".unshuffled"
-  tf.gfile.Rename(fname, unshuffled_fname)
-
-  buffered = []
-  for serialized in tf.compat.v1.io.tf_record_iterator(unshuffled_fname):
-    buffered.append(serialized)
-    num_read = len(buffered)
-    if num_read % 100000 == 0:
-      tf.logging.info("\tRead: %d", num_read)
-
-  random.shuffle(buffered)
-
-  # Write the shuffled records under the original file name.
-  with tf.python_io.TFRecordWriter(fname) as writer:
-    for index, serialized in enumerate(buffered):
-      writer.write(serialized)
-      if index > 0 and index % 100000 == 0:
-        tf.logging.info("\tWriting record: %d" % index)
-
-  tf.gfile.Remove(unshuffled_fname)
-
-
-def dict_to_example(dictionary):
-  """Build a tf.Example with one int64-list feature per dictionary entry."""
-  features = {
-      key: tf.train.Feature(int64_list=tf.train.Int64List(value=values))
-      for key, values in six.iteritems(dictionary)
-  }
-  return tf.train.Example(features=tf.train.Features(feature=features))
-
-
-def all_exist(filepaths):
-  """Return True when every path in `filepaths` exists (True if it is empty)."""
-  return all(tf.gfile.Exists(fname) for fname in filepaths)
-
-
-def make_dir(path):
-  """Create directory `path` unless it already exists."""
-  if tf.gfile.Exists(path):
-    return
-  tf.logging.info("Creating directory %s" % path)
-  tf.gfile.MakeDirs(path)
-
-
-def main(unused_argv):
-  """Download, tokenize and shard the Transformer train/eval data."""
-  raw_dir = FLAGS.raw_dir
-  data_dir = FLAGS.data_dir
-  for directory in (raw_dir, data_dir):
-    make_dir(directory)
-
-  # Locate (downloading and extracting if needed) the raw corpora.
-  tf.logging.info("Step 1/4: Downloading data from source")
-  train_files = get_raw_files(raw_dir, _TRAIN_DATA_SOURCES)
-  eval_files = get_raw_files(raw_dir, _EVAL_DATA_SOURCES)
-
-  # The vocabulary is built from both languages of the training data.
-  tf.logging.info("Step 2/4: Creating subtokenizer and building vocabulary")
-  vocab_sources = train_files["inputs"] + train_files["targets"]
-  if FLAGS.search:
-    min_count = None
-  else:
-    min_count = _TRAIN_DATA_MIN_COUNT
-  subtokenizer = tokenizer.Subtokenizer.init_from_files(
-      os.path.join(data_dir, VOCAB_FILE), vocab_sources, _TARGET_VOCAB_SIZE,
-      _TARGET_THRESHOLD, min_count=min_count)
-
-  tf.logging.info("Step 3/4: Compiling training and evaluation data")
-  compiled_train_files = compile_files(raw_dir, train_files, _TRAIN_TAG)
-  compiled_eval_files = compile_files(raw_dir, eval_files, _EVAL_TAG)
-
-  # Encode the compiled text and write it out as sharded TFRecord files.
-  tf.logging.info("Step 4/4: Preprocessing and saving data")
-  train_shards = encode_and_save_files(
-      subtokenizer, data_dir, compiled_train_files, _TRAIN_TAG, _TRAIN_SHARDS)
-  encode_and_save_files(
-      subtokenizer, data_dir, compiled_eval_files, _EVAL_TAG, _EVAL_SHARDS)
-
-  # Only the training shards are shuffled.
-  for shard_path in train_shards:
-    shuffle_records(shard_path)
-
-
-def define_data_download_flags():
-  """Add flags specifying data download arguments.
-
-  Defines --data_dir (-dd), --raw_dir (-rd) and --search.
-  """
-  flags.DEFINE_string(
-      name="data_dir", short_name="dd", default="/tmp/translate_ende",
-      help=flags_core.help_wrap(
-          "Directory for where the translate_ende_wmt32k dataset is saved."))
-  flags.DEFINE_string(
-      name="raw_dir", short_name="rd", default="/tmp/translate_ende_raw",
-      help=flags_core.help_wrap(
-          "Path where the raw data will be downloaded and extracted."))
-  # The adjacent literals are concatenated, so the first one must end with a
-  # space; without it the help text read "...with sizeclosest to...".
-  flags.DEFINE_bool(
-      name="search", default=False,
-      help=flags_core.help_wrap(
-          "If set, use binary search to find the vocabulary set with size "
-          "closest to the target size (%d)." % _TARGET_VOCAB_SIZE))
-
-
-# Script entry point: the flags must be defined before absl parses argv and
-# calls main, and FLAGS is bound at module level because main reads it.
-if __name__ == "__main__":
-  tf.logging.set_verbosity(tf.logging.INFO)
-  define_data_download_flags()
-  FLAGS = flags.FLAGS
-  absl_app.run(main)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/__init__.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/__init__.py
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/attention_layer.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/attention_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of multiheaded attention and self-attention layers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-class Attention(tf.layers.Layer):
-  """Multi-headed attention layer.
-
-  Projects queries, keys and values with bias-free dense layers, applies
-  scaled dot-product attention per head, and projects the recombined heads
-  with a final bias-free dense layer.
-  """
-
-  def __init__(self, hidden_size, num_heads, attention_dropout, train):
-    """Create the projection layers.
-
-    Args:
-      hidden_size: size of the last dimension of the inputs and outputs; must
-        be divisible by num_heads.
-      num_heads: number of attention heads.
-      attention_dropout: dropout rate applied to the attention weights when
-        `train` is truthy.
-      train: whether dropout is applied in `call`.
-
-    Raises:
-      ValueError: if hidden_size is not divisible by num_heads.
-    """
-    if hidden_size % num_heads != 0:
-      raise ValueError("Hidden size must be evenly divisible by the number of "
-                       "heads.")
-
-    super(Attention, self).__init__()
-    self.hidden_size = hidden_size
-    self.num_heads = num_heads
-    self.attention_dropout = attention_dropout
-    self.train = train
-
-    # Layers for linearly projecting the queries, keys, and values.
-    self.q_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="q")
-    self.k_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="k")
-    self.v_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="v")
-
-    self.output_dense_layer = tf.layers.Dense(hidden_size, use_bias=False,
-                                              name="output_transform")
-
-  def split_heads(self, x):
-    """Split x into different heads, and transpose the resulting value.
-
-    The tensor is transposed to ensure the inner dimensions hold the correct
-    values during the matrix multiplication.
-
-    Args:
-      x: A tensor with shape [batch_size, length, hidden_size]
-
-    Returns:
-      A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
-    """
-    with tf.name_scope("split_heads"):
-      # Dynamic shapes are used, so batch size and length need not be static.
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[1]
-
-      # Calculate depth of last dimension after it has been split.
-      depth = (self.hidden_size // self.num_heads)
-
-      # Split the last dimension
-      x = tf.reshape(x, [batch_size, length, self.num_heads, depth])
-
-      # Transpose the result
-      return tf.transpose(x, [0, 2, 1, 3])
-
-  def combine_heads(self, x):
-    """Combine tensor that has been split.
-
-    This is the inverse of split_heads.
-
-    Args:
-      x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]
-
-    Returns:
-      A tensor with shape [batch_size, length, hidden_size]
-    """
-    with tf.name_scope("combine_heads"):
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[2]
-      x = tf.transpose(x, [0, 2, 1, 3])  # --> [batch, length, num_heads, depth]
-      return tf.reshape(x, [batch_size, length, self.hidden_size])
-
-  def call(self, x, y, bias, cache=None):
-    """Apply attention mechanism to x and y.
-
-    Queries are computed from x; keys and values are computed from y.
-
-    Args:
-      x: a tensor with shape [batch_size, length_x, hidden_size]
-      y: a tensor with shape [batch_size, length_y, hidden_size]
-      bias: attention bias that will be added to the result of the dot product.
-      cache: (Used during prediction) dictionary with tensors containing results
-        of previous attentions. The dictionary must have the items:
-            {"k": tensor with shape [batch_size, i, key_channels],
-             "v": tensor with shape [batch_size, i, value_channels]}
-        where i is the current decoded length. The dictionary is updated in
-        place with the extended keys and values.
-
-    Returns:
-      Attention layer output with shape [batch_size, length_x, hidden_size]
-    """
-    # Linearly project the query (q), key (k) and value (v) using different
-    # learned projections. This is in preparation of splitting them into
-    # multiple heads. Multi-head attention uses multiple queries, keys, and
-    # values rather than regular attention (which uses a single q, k, v).
-    q = self.q_dense_layer(x)
-    k = self.k_dense_layer(y)
-    v = self.v_dense_layer(y)
-
-    if cache is not None:
-      # Combine cached keys and values with new keys and values.
-      k = tf.concat([cache["k"], k], axis=1)
-      v = tf.concat([cache["v"], v], axis=1)
-
-      # Update cache
-      cache["k"] = k
-      cache["v"] = v
-
-    # Split q, k, v into heads.
-    q = self.split_heads(q)
-    k = self.split_heads(k)
-    v = self.split_heads(v)
-
-    # Scale q to prevent the dot product between q and k from growing too large.
-    depth = (self.hidden_size // self.num_heads)
-    q *= depth ** -0.5
-
-    # Calculate dot product attention
-    logits = tf.matmul(q, k, transpose_b=True)
-    logits += bias
-    weights = tf.nn.softmax(logits, name="attention_weights")
-    if self.train:
-      # NOTE(review): the second positional argument is presumably the keep
-      # probability of the TF 1.x tf.nn.dropout signature - confirm against
-      # the TensorFlow version in use.
-      weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
-    attention_output = tf.matmul(weights, v)
-
-    # Recombine heads --> [batch_size, length, hidden_size]
-    attention_output = self.combine_heads(attention_output)
-
-    # Run the combined outputs through another linear projection layer.
-    attention_output = self.output_dense_layer(attention_output)
-    return attention_output
-
-
-class SelfAttention(Attention):
-  """Multi-headed attention in which queries, keys and values share one input."""
-
-  def call(self, x, bias, cache=None):
-    """Attend from x to itself by passing x as both inputs of Attention.call."""
-    parent = super(SelfAttention, self)
-    return parent.call(x, x, bias, cache)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/beam_search.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/beam_search.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Beam search to find the translated sequence with the highest probability.
-
-Source implementation from Tensor2Tensor:
-https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py
-"""
-
-import tensorflow as tf
-from tensorflow.python.util import nest
-
-# Large finite value used in place of infinity; below it is negated to fill
-# the scores of not-yet-finished sequences.
-INF = 1. * 1e7
-
-
-class _StateKeys(object):
-  """Keys to dictionary storing the state of the beam search loop.
-
-  Each attribute is a string constant whose value equals its own name.
-  """
-
-  # Variable storing the loop index.
-  CUR_INDEX = "CUR_INDEX"
-
-  # Top sequences that are alive for each batch item. Alive sequences are ones
-  # that have not generated an EOS token. Sequences that reach EOS are marked as
-  # finished and moved to the FINISHED_SEQ tensor.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
-  ALIVE_SEQ = "ALIVE_SEQ"
-  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
-  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
-  # Dictionary of cached values for each alive sequence. The cache stores
-  # the encoder output, attention bias, and the decoder attention output from
-  # the previous iteration.
-  ALIVE_CACHE = "ALIVE_CACHE"
-
-  # Top finished sequences for each batch item.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
-  # shorter than CUR_INDEX + 1 are padded with 0s.
-  FINISHED_SEQ = "FINISHED_SEQ"
-  # Scores for each finished sequence. Score = log probability / length norm
-  # Shape [batch_size, beam_size]
-  FINISHED_SCORES = "FINISHED_SCORES"
-  # Flags indicating which sequences in the finished sequences are finished.
-  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
-  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
-  FINISHED_FLAGS = "FINISHED_FLAGS"
-
-
-class SequenceBeamSearch(object):
-  """Implementation of beam search loop."""
-
-  def __init__(self, symbols_to_logits_fn, vocab_size, batch_size,
-               beam_size, alpha, max_decode_length, eos_id):
-    """Store the search configuration on the instance.
-
-    Args:
-      symbols_to_logits_fn: callable invoked as fn(ids, index, cache) that
-        returns (logits, new cache).
-      vocab_size: stored as-is; presumably the size of the logits' last
-        dimension - TODO confirm.
-      batch_size: number of batch items; used to tile the initial state.
-      beam_size: number of beams kept per batch item.
-      alpha: passed to the length normalization helper.
-      max_decode_length: the search loop stops once the loop index reaches it.
-      eos_id: stored as-is; presumably the id of the end-of-sequence token -
-        TODO confirm.
-    """
-    self.symbols_to_logits_fn = symbols_to_logits_fn
-    self.vocab_size = vocab_size
-    self.batch_size = batch_size
-    self.beam_size = beam_size
-    self.alpha = alpha
-    self.max_decode_length = max_decode_length
-    self.eos_id = eos_id
-
-  def search(self, initial_ids, initial_cache):
-    """Beam search for sequences with highest scores.
-
-    Args:
-      initial_ids: initial ids to pass into the symbols_to_logits_fn.
-      initial_cache: dictionary storing values to be passed into the
-        symbols_to_logits_fn.
-
-    Returns:
-      Tuple (finished_seq, finished_scores). Batch items without any finished
-      sequence get their alive sequences and alive log probabilities instead.
-    """
-    state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
-
-    # The state dictionary is the single loop variable, so the loop returns a
-    # one-element list that is unwrapped below.
-    finished_state = tf.while_loop(
-        self._continue_search, self._search_step, loop_vars=[state],
-        shape_invariants=[state_shapes], parallel_iterations=1, back_prop=False)
-    finished_state = finished_state[0]
-
-    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
-    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
-    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
-
-    # Account for corner case where there are no finished sequences for a
-    # particular batch item. In that case, return alive sequences for that batch
-    # item.
-    finished_seq = tf.where(
-        tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
-    finished_scores = tf.where(
-        tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
-    return finished_seq, finished_scores
-
-  def _create_initial_state(self, initial_ids, initial_cache):
-    """Return initial state dictionary and its shape invariants.
-
-    Args:
-      initial_ids: initial ids to pass into the symbols_to_logits_fn.
-        int tensor with shape [batch_size, 1]
-      initial_cache: dictionary storing values to be passed into the
-        symbols_to_logits_fn.
-
-    Returns:
-        state and shape invariant dictionaries with keys from _StateKeys
-    """
-    # Current loop index (starts at 0)
-    cur_index = tf.constant(0)
-
-    # Create alive sequence with shape [batch_size, beam_size, 1]
-    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
-    alive_seq = tf.expand_dims(alive_seq, axis=2)
-
-    # Create tensor for storing initial log probabilities.
-    # Assume initial_ids are prob 1.0
-    # Only the first beam starts at log prob 0; the others start at -inf.
-    initial_log_probs = tf.constant(
-        [[0.] + [-float("inf")] * (self.beam_size - 1)])
-    alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1])
-
-    # Expand all values stored in the dictionary to the beam size, so that each
-    # beam has a separate cache.
-    alive_cache = nest.map_structure(
-        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
-
-    # Initialize tensor storing finished sequences with filler values.
-    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
-
-    # Set scores of the initial finished seqs to -INF (a large finite value).
-    finished_scores = tf.ones([self.batch_size, self.beam_size]) * -INF
-
-    # Initialize finished flags with all False values.
-    finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool)
-
-    # Create state dictionary
-    state = {
-        _StateKeys.CUR_INDEX: cur_index,
-        _StateKeys.ALIVE_SEQ: alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
-        _StateKeys.ALIVE_CACHE: alive_cache,
-        _StateKeys.FINISHED_SEQ: finished_seq,
-        _StateKeys.FINISHED_SCORES: finished_scores,
-        _StateKeys.FINISHED_FLAGS: finished_flags
-    }
-
-    # Create state invariants for each value in the state dictionary. Each
-    # dimension must be a constant or None. A None dimension means either:
-    #   1) the dimension's value is a tensor that remains the same but may
-    #      depend on the input sequence to the model (e.g. batch size).
-    #   2) the dimension may have different values on different iterations.
-    state_shape_invariants = {
-        _StateKeys.CUR_INDEX: tf.TensorShape([]),
-        _StateKeys.ALIVE_SEQ: tf.TensorShape([None, self.beam_size, None]),
-        _StateKeys.ALIVE_LOG_PROBS: tf.TensorShape([None, self.beam_size]),
-        _StateKeys.ALIVE_CACHE: nest.map_structure(
-            _get_shape_keep_last_dim, alive_cache),
-        _StateKeys.FINISHED_SEQ: tf.TensorShape([None, self.beam_size, None]),
-        _StateKeys.FINISHED_SCORES: tf.TensorShape([None, self.beam_size]),
-        _StateKeys.FINISHED_FLAGS: tf.TensorShape([None, self.beam_size])
-    }
-
-    return state, state_shape_invariants
-
-  def _continue_search(self, state):
-    """Return whether to continue the search loop.
-
-    The loop should terminate when
-      1) the maximum decode length has been reached, or
-      2) the worst score in the finished sequences is better than the best
-         score in the alive sequences (i.e. the finished sequences are provably
-         unchanging)
-
-    Args:
-      state: A dictionary with the current loop state.
-
-    Returns:
-      Bool tensor with value True if loop should continue, False if loop should
-      terminate.
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-
-    not_at_max_decode_length = tf.less(i, self.max_decode_length)
-
-    # Calculate largest length penalty (the larger penalty, the better score).
-    max_length_norm = _length_normalization(self.alpha, self.max_decode_length)
-    # Get the best possible scores from alive sequences.
-    # Index 0 is used as the best beam; this presumably relies on the alive
-    # beams being kept sorted by log probability - TODO confirm.
-    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
-
-    # Compute worst score in finished sequences for each batch element
-    finished_scores *= tf.to_float(finished_flags)  # set filler scores to zero
-    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
-
-    # If there are no finished sequences in a batch element, then set the lowest
-    # finished score to -INF for that element.
-    finished_batches = tf.reduce_any(finished_flags, 1)
-    lowest_finished_scores += (1. - tf.to_float(finished_batches)) * -INF
-
-    # True only when the condition holds for every batch element.
-    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
-        tf.greater(lowest_finished_scores, best_alive_scores)
-    )
-
-    return tf.logical_and(
-        not_at_max_decode_length,
-        tf.logical_not(worst_finished_score_better_than_best_alive_score)
-    )
-
-  def _search_step(self, state):
-    """Beam search loop body.
-
-    Grow alive sequences by a single ID. Sequences that have reached the EOS
-    token are marked as finished. The alive and finished sequences with the
-    highest log probabilities and scores are returned.
-
-    A sequence's finished score is calculated by dividing the log probability
-    by the length normalization factor. Without length normalization, the
-    search is more likely to return shorter sequences.
-
-    Args:
-      state: A dictionary with the current loop state.
-
-    Returns:
-      A one-element list holding the new state dictionary, matching the
-      single-element loop_vars list used by tf.while_loop.
-    """
-    # Grow alive sequences by one token.
-    new_seq, new_log_probs, new_cache = self._grow_alive_seq(state)
-    # Collect top beam_size alive sequences
-    alive_state = self._get_new_alive_state(new_seq, new_log_probs, new_cache)
-
-    # Combine newly finished sequences with existing finished sequences, and
-    # collect the top k scoring sequences.
-    finished_state = self._get_new_finished_state(state, new_seq, new_log_probs)
-
-    # Increment loop index and create new state dictionary
-    new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
-    new_state.update(alive_state)
-    new_state.update(finished_state)
-    return [new_state]
-
-  def _grow_alive_seq(self, state):
-    """Grow alive sequences by one token, and collect top 2*beam_size sequences.
-
-    2*beam_size sequences are collected because some sequences may have reached
-    the EOS token. 2*beam_size ensures that at least beam_size sequences are
-    still alive.
-
-    Args:
-      state: A dictionary with the current loop state.
-    Returns:
-      Tuple of
-      (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
-       Scores of returned sequences [batch_size, 2 * beam_size],
-       New alive cache, for each of the 2 * beam_size sequences)
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_seq = state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    alive_cache = state[_StateKeys.ALIVE_CACHE]
-
-    beams_to_keep = 2 * self.beam_size
-
-    # Get logits for the next candidate IDs for the alive sequences. Get the new
-    # cache values at the same time.
-    flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
-    flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache)
-
-    flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache)
-
-    # Unflatten logits to shape [batch_size, beam_size, vocab_size]
-    logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size)
-    new_cache = nest.map_structure(
-        lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size),
-        flat_cache)
-
-    # Convert logits to normalized log probs
-    candidate_log_probs = _log_prob_from_logits(logits)
-
-    # Calculate new log probabilities if each of the alive sequences were
-    # extended # by the the candidate IDs.
-    # Shape [batch_size, beam_size, vocab_size]
-    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
-
-    # Each batch item has beam_size * vocab_size candidate sequences. For each
-    # batch item, get the k candidates with the highest log probabilities.
-    flat_log_probs = tf.reshape(log_probs,
-                                [-1, self.beam_size * self.vocab_size])
-    topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep)
-
-    # Extract the alive sequences that generate the highest log probabilities
-    # after being extended.
-    topk_beam_indices = topk_indices // self.vocab_size
-    topk_seq, new_cache = _gather_beams(
-        [alive_seq, new_cache], topk_beam_indices, self.batch_size,
-        beams_to_keep)
-
-    # Append the most probable IDs to the topk sequences
-    topk_ids = topk_indices % self.vocab_size
-    topk_ids = tf.expand_dims(topk_ids, axis=2)
-    topk_seq = tf.concat([topk_seq, topk_ids], axis=2)
-    return topk_seq, topk_log_probs, new_cache
-
-  def _get_new_alive_state(self, new_seq, new_log_probs, new_cache):
-    """Gather the top k sequences that are still alive.
-
-    Args:
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
-      new_log_probs: Log probabilities of new sequences
-        float32 tensor with shape [batch_size, beam_size]
-      new_cache: Dict of cached values for each sequence.
-
-    Returns:
-      Dictionary with alive keys from _StateKeys:
-        {Top beam_size sequences that are still alive (don't end with eos_id)
-         Log probabilities of top alive sequences
-         Dict cache storing decoder states for top alive sequences}
-    """
-    # To prevent finished sequences from being considered, set log probs to -INF
-    new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id)
-    new_log_probs += tf.to_float(new_finished_flags) * -INF
-
-    top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
-        [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size,
-        self.beam_size)
-
-    return {
-        _StateKeys.ALIVE_SEQ: top_alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
-        _StateKeys.ALIVE_CACHE: top_alive_cache
-    }
-
-  def _get_new_finished_state(self, state, new_seq, new_log_probs):
-    """Combine new and old finished sequences, and gather the top k sequences.
-
-    Args:
-      state: A dictionary with the current loop state.
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, beam_size, i + 1]
-      new_log_probs: Log probabilities of new sequences
-        float32 tensor with shape [batch_size, beam_size]
-
-    Returns:
-      Dictionary with finished keys from _StateKeys:
-        {Top beam_size finished sequences based on score,
-         Scores of finished sequences,
-         Finished flags of finished sequences}
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    finished_seq = state[_StateKeys.FINISHED_SEQ]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-
-    # First append a column of 0-ids to finished_seq to increment the length.
-    # New shape of finished_seq: [batch_size, beam_size, i + 1]
-    finished_seq = tf.concat(
-        [finished_seq,
-         tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)], axis=2)
-
-    # Calculate new seq scores from log probabilities.
-    length_norm = _length_normalization(self.alpha, i + 1)
-    new_scores = new_log_probs / length_norm
-
-    # Set the scores of the still-alive seq in new_seq to large negative values.
-    new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id)
-    new_scores += (1. - tf.to_float(new_finished_flags)) * -INF
-
-    # Combine sequences, scores, and flags.
-    finished_seq = tf.concat([finished_seq, new_seq], axis=1)
-    finished_scores = tf.concat([finished_scores, new_scores], axis=1)
-    finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
-
-    # Return the finished sequences with the best scores.
-    top_finished_seq, top_finished_scores, top_finished_flags = (
-        _gather_topk_beams([finished_seq, finished_scores, finished_flags],
-                           finished_scores, self.batch_size, self.beam_size))
-
-    return {
-        _StateKeys.FINISHED_SEQ: top_finished_seq,
-        _StateKeys.FINISHED_SCORES: top_finished_scores,
-        _StateKeys.FINISHED_FLAGS: top_finished_flags
-    }
-
-
-def sequence_beam_search(
-    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size,
-    alpha, max_decode_length, eos_id):
-  """Search for sequence of subtoken ids with the largest probability.
-
-  Args:
-    symbols_to_logits_fn: A function that takes in ids, index, and cache as
-      arguments. The passed in arguments will have shape:
-        ids -> [batch_size * beam_size, index]
-        index -> [] (scalar)
-        cache -> nested dictionary of tensors [batch_size * beam_size, ...]
-      The function must return logits and new cache.
-        logits -> [batch * beam_size, vocab_size]
-        new cache -> same shape/structure as inputted cache
-    initial_ids: Starting ids for each batch item.
-      int32 tensor with shape [batch_size]
-    initial_cache: dict containing starting decoder variables information
-    vocab_size: int size of tokens
-    beam_size: int number of beams
-    alpha: float defining the strength of length normalization
-    max_decode_length: maximum length to decoded sequence
-    eos_id: int id of eos token, used to determine when a sequence has finished
-
-  Returns:
-    Top decoded sequences [batch_size, beam_size, max_decode_length]
-    sequence scores [batch_size, beam_size]
-  """
-  batch_size = tf.shape(initial_ids)[0]
-  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size,
-                           beam_size, alpha, max_decode_length, eos_id)
-  return sbs.search(initial_ids, initial_cache)
-
-
-def _log_prob_from_logits(logits):
-  return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True)
-
-
-def _length_normalization(alpha, length):
-  """Return length normalization factor."""
-  return tf.pow(((5. + tf.to_float(length)) / 6.), alpha)
-
-
-def _expand_to_beam_size(tensor, beam_size):
-  """Tiles a given tensor by beam_size.
-
-  Args:
-    tensor: tensor to tile [batch_size, ...]
-    beam_size: How much to tile the tensor by.
-
-  Returns:
-    Tiled tensor [batch_size, beam_size, ...]
-  """
-  tensor = tf.expand_dims(tensor, axis=1)
-  tile_dims = [1] * tensor.shape.ndims
-  tile_dims[1] = beam_size
-
-  return tf.tile(tensor, tile_dims)
-
-
-def _shape_list(tensor):
-  """Return a list of the tensor's shape, and ensure no None values in list."""
-  # Get statically known shape (may contain None's for unknown dimensions)
-  shape = tensor.get_shape().as_list()
-
-  # Ensure that the shape values are not None
-  dynamic_shape = tf.shape(tensor)
-  for i in range(len(shape)):  # pylint: disable=consider-using-enumerate
-    if shape[i] is None:
-      shape[i] = dynamic_shape[i]
-  return shape
-
-
-def _get_shape_keep_last_dim(tensor):
-  shape_list = _shape_list(tensor)
-
-  # Only the last
-  for i in range(len(shape_list) - 1):
-    shape_list[i] = None
-
-  if isinstance(shape_list[-1], tf.Tensor):
-    shape_list[-1] = None
-  return tf.TensorShape(shape_list)
-
-
-def _flatten_beam_dim(tensor):
-  """Reshapes first two dimensions in to single dimension.
-
-  Args:
-    tensor: Tensor to reshape of shape [A, B, ...]
-
-  Returns:
-    Reshaped tensor of shape [A*B, ...]
-  """
-  shape = _shape_list(tensor)
-  shape[0] *= shape[1]
-  shape.pop(1)  # Remove beam dim
-  return tf.reshape(tensor, shape)
-
-
-def _unflatten_beam_dim(tensor, batch_size, beam_size):
-  """Reshapes first dimension back to [batch_size, beam_size].
-
-  Args:
-    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
-    batch_size: Tensor, original batch size.
-    beam_size: int, original beam size.
-
-  Returns:
-    Reshaped tensor of shape [batch_size, beam_size, ...]
-  """
-  shape = _shape_list(tensor)
-  new_shape = [batch_size, beam_size] + shape[1:]
-  return tf.reshape(tensor, new_shape)
-
-
-def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
-  """Gather beams from nested structure of tensors.
-
-  Each tensor in nested represents a batch of beams, where beam refers to a
-  single search state (beam search involves searching through multiple states
-  in parallel).
-
-  This function is used to gather the top beams, specified by
-  beam_indices, from the nested tensors.
-
-  Args:
-    nested: Nested structure (tensor, list, tuple or dict) containing tensors
-      with shape [batch_size, beam_size, ...].
-    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
-     value in beam_indices must be between [0, beam_size), and are not
-     necessarily unique.
-    batch_size: int size of batch
-    new_beam_size: int number of beams to be pulled from the nested tensors.
-
-  Returns:
-    Nested structure containing tensors with shape
-      [batch_size, new_beam_size, ...]
-  """
-  # Computes the i'th coodinate that contains the batch index for gather_nd.
-  # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..].
-  batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
-  batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])
-
-  # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor
-  # with shape [batch_size, beam_size, 2], where the last dimension contains
-  # the (i, j) gathering coordinates.
-  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
-
-  return nest.map_structure(
-      lambda state: tf.gather_nd(state, coordinates), nested)
-
-
-def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
-  """Gather top beams from nested structure."""
-  _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size)
-  return _gather_beams(nested, topk_indexes, batch_size, beam_size)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/beam_search_test.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/beam_search_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test beam search helper methods."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.transformer.model import beam_search
-
-
-class BeamSearchHelperTests(tf.test.TestCase):
-
-  def test_expand_to_beam_size(self):
-    x = tf.ones([7, 4, 2, 5])
-    x = beam_search._expand_to_beam_size(x, 3)
-    with self.test_session() as sess:
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([7, 3, 4, 2, 5], shape)
-
-  def test_shape_list(self):
-    y = tf.placeholder(dtype=tf.int32, shape=[])
-    x = tf.ones([7, y, 2, 5])
-    shape = beam_search._shape_list(x)
-    self.assertIsInstance(shape[0], int)
-    self.assertIsInstance(shape[1], tf.Tensor)
-    self.assertIsInstance(shape[2], int)
-    self.assertIsInstance(shape[3], int)
-
-  def test_get_shape_keep_last_dim(self):
-    y = tf.constant(4.0)
-    x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5])
-    shape = beam_search._get_shape_keep_last_dim(x)
-    self.assertAllEqual([None, None, None, 5],
-                        shape.as_list())
-
-  def test_flatten_beam_dim(self):
-    x = tf.ones([7, 4, 2, 5])
-    x = beam_search._flatten_beam_dim(x)
-    with self.test_session() as sess:
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([28, 2, 5], shape)
-
-  def test_unflatten_beam_dim(self):
-    x = tf.ones([28, 2, 5])
-    x = beam_search._unflatten_beam_dim(x, 7, 4)
-    with self.test_session() as sess:
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([7, 4, 2, 5], shape)
-
-  def test_gather_beams(self):
-    x = tf.reshape(tf.range(24), [2, 3, 4])
-    # x looks like:  [[[ 0  1  2  3]
-    #                  [ 4  5  6  7]
-    #                  [ 8  9 10 11]]
-    #
-    #                 [[12 13 14 15]
-    #                  [16 17 18 19]
-    #                  [20 21 22 23]]]
-
-    y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
-    with self.test_session() as sess:
-      y = sess.run(y)
-
-    self.assertAllEqual([[[4, 5, 6, 7],
-                          [8, 9, 10, 11]],
-                         [[12, 13, 14, 15],
-                          [20, 21, 22, 23]]],
-                        y)
-
-  def test_gather_topk_beams(self):
-    x = tf.reshape(tf.range(24), [2, 3, 4])
-    x_scores = [[0, 1, 1], [1, 0, 1]]
-
-    y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
-    with self.test_session() as sess:
-      y = sess.run(y)
-
-    self.assertAllEqual([[[4, 5, 6, 7],
-                          [8, 9, 10, 11]],
-                         [[12, 13, 14, 15],
-                          [20, 21, 22, 23]]],
-                        y)
-
-
-if __name__ == "__main__":
-  tf.test.main()
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/embedding_layer.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/embedding_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of embedding layer with shared weights."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.transformer.model import model_utils
-from official.utils.accelerator import tpu as tpu_utils
-
-
-class EmbeddingSharedWeights(tf.layers.Layer):
-  """Calculates input embeddings and pre-softmax linear with shared weights."""
-
-  def __init__(self, vocab_size, hidden_size, method="gather"):
-    """Specify characteristic parameters of embedding layer.
-
-    Args:
-      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
-      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
-      method: Strategy for performing embedding lookup. "gather" uses tf.gather
-        which performs well on CPUs and GPUs, but very poorly on TPUs. "matmul"
-        one-hot encodes the indicies and formulates the embedding as a sparse
-        matrix multiplication. The matmul formulation is wasteful as it does
-        extra work, however matrix multiplication is very fast on TPUs which
-        makes "matmul" considerably faster than "gather" on TPUs.
-    """
-    super(EmbeddingSharedWeights, self).__init__()
-    self.vocab_size = vocab_size
-    self.hidden_size = hidden_size
-    if method not in ("gather", "matmul"):
-      raise ValueError("method {} must be 'gather' or 'matmul'".format(method))
-    self.method = method
-
-  def build(self, _):
-    with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
-      # Create and initialize weights. The random normal initializer was chosen
-      # randomly, and works well.
-      self.shared_weights = tf.get_variable(
-          "weights", [self.vocab_size, self.hidden_size],
-          initializer=tf.random_normal_initializer(
-              0., self.hidden_size ** -0.5))
-
-    self.built = True
-
-  def call(self, x):
-    """Get token embeddings of x.
-
-    Args:
-      x: An int64 tensor with shape [batch_size, length]
-    Returns:
-      embeddings: float32 tensor with shape [batch_size, length, embedding_size]
-      padding: float32 tensor with shape [batch_size, length] indicating the
-        locations of the padding tokens in x.
-    """
-    with tf.name_scope("embedding"):
-      # Create binary mask of size [batch_size, length]
-      mask = tf.to_float(tf.not_equal(x, 0))
-
-      if self.method == "gather":
-        embeddings = tf.gather(self.shared_weights, x)
-        embeddings *= tf.expand_dims(mask, -1)
-      else:  # matmul
-        embeddings = tpu_utils.embedding_matmul(
-            embedding_table=self.shared_weights,
-            values=tf.cast(x, dtype=tf.int32),
-            mask=mask
-        )
-        # embedding_matmul already zeros out masked positions, so
-        # `embeddings *= tf.expand_dims(mask, -1)` is unnecessary.
-
-
-      # Scale embedding by the sqrt of the hidden size
-      embeddings *= self.hidden_size ** 0.5
-
-      return embeddings
-
-
-  def linear(self, x):
-    """Computes logits by running x through a linear layer.
-
-    Args:
-      x: A float32 tensor with shape [batch_size, length, hidden_size]
-    Returns:
-      float32 tensor with shape [batch_size, length, vocab_size].
-    """
-    with tf.name_scope("presoftmax_linear"):
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[1]
-
-      x = tf.reshape(x, [-1, self.hidden_size])
-      logits = tf.matmul(x, self.shared_weights, transpose_b=True)
-
-      return tf.reshape(logits, [batch_size, length, self.vocab_size])
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/ffn_layer.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/ffn_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of fully connected network."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-class FeedFowardNetwork(tf.layers.Layer):
-  """Fully connected feedforward network."""
-
-  def __init__(self, hidden_size, filter_size, relu_dropout, train, allow_pad):
-    super(FeedFowardNetwork, self).__init__()
-    self.hidden_size = hidden_size
-    self.filter_size = filter_size
-    self.relu_dropout = relu_dropout
-    self.train = train
-    self.allow_pad = allow_pad
-
-    self.filter_dense_layer = tf.layers.Dense(
-        filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
-    self.output_dense_layer = tf.layers.Dense(
-        hidden_size, use_bias=True, name="output_layer")
-
-  def call(self, x, padding=None):
-    """Return outputs of the feedforward network.
-
-    Args:
-      x: tensor with shape [batch_size, length, hidden_size]
-      padding: (optional) If set, the padding values are temporarily removed
-        from x (provided self.allow_pad is set). The padding values are placed
-        back in the output tensor in the same locations.
-        shape [batch_size, length]
-
-    Returns:
-      Output of the feedforward network.
-      tensor with shape [batch_size, length, hidden_size]
-    """
-    padding = None if not self.allow_pad else padding
-
-    # Retrieve dynamically known shapes
-    batch_size = tf.shape(x)[0]
-    length = tf.shape(x)[1]
-
-    if padding is not None:
-      with tf.name_scope("remove_padding"):
-        # Flatten padding to [batch_size*length]
-        pad_mask = tf.reshape(padding, [-1])
-
-        nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
-
-        # Reshape x to [batch_size*length, hidden_size] to remove padding
-        x = tf.reshape(x, [-1, self.hidden_size])
-        x = tf.gather_nd(x, indices=nonpad_ids)
-
-        # Reshape x from 2 dimensions to 3 dimensions.
-        x.set_shape([None, self.hidden_size])
-        x = tf.expand_dims(x, axis=0)
-
-    output = self.filter_dense_layer(x)
-    if self.train:
-      output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
-    output = self.output_dense_layer(output)
-
-    if padding is not None:
-      with tf.name_scope("re_add_padding"):
-        output = tf.squeeze(output, axis=0)
-        output = tf.scatter_nd(
-            indices=nonpad_ids,
-            updates=output,
-            shape=[batch_size * length, self.hidden_size]
-        )
-        output = tf.reshape(output, [batch_size, length, self.hidden_size])
-    return output
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_params.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_params.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Defines Transformer model parameters."""
-
-from collections import defaultdict
-
-
-BASE_PARAMS = defaultdict(
-    lambda: None,  # Set default value to None.
-
-    # Input params
-    default_batch_size=2048,  # Maximum number of tokens per batch of examples.
-    default_batch_size_tpu=32768,
-    max_length=256,  # Maximum number of tokens per example.
-
-    # Model params
-    initializer_gain=1.0,  # Used in trainable variable initialization.
-    vocab_size=33708,  # Number of tokens defined in the vocabulary file.
-    hidden_size=512,  # Model dimension in the hidden layers.
-    num_hidden_layers=6,  # Number of layers in the encoder and decoder stacks.
-    num_heads=8,  # Number of heads to use in multi-headed attention.
-    filter_size=2048,  # Inner layer dimension in the feedforward network.
-
-    # Dropout values (only used when training)
-    layer_postprocess_dropout=0.1,
-    attention_dropout=0.1,
-    relu_dropout=0.1,
-
-    # Training params
-    label_smoothing=0.1,
-    learning_rate=2.0,
-    learning_rate_decay_rate=1.0,
-    learning_rate_warmup_steps=16000,
-
-    # Optimizer params
-    optimizer_adam_beta1=0.9,
-    optimizer_adam_beta2=0.997,
-    optimizer_adam_epsilon=1e-09,
-
-    # Default prediction params
-    extra_decode_length=50,
-    beam_size=4,
-    alpha=0.6,  # used to calculate length normalization in beam search
-
-    # TPU specific parameters
-    use_tpu=False,
-    static_batch=False,
-    allow_ffn_pad=True,
-)
-
-BIG_PARAMS = BASE_PARAMS.copy()
-BIG_PARAMS.update(
-    default_batch_size=4096,
-
-    # default batch size is smaller than for BASE_PARAMS due to memory limits.
-    default_batch_size_tpu=16384,
-
-    hidden_size=1024,
-    filter_size=4096,
-    num_heads=16,
-)
-
-# Parameters for running the model in multi gpu. These should not change the
-# params that modify the model shape (such as the hidden_size or num_heads).
-BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
-BASE_MULTI_GPU_PARAMS.update(
-    learning_rate_warmup_steps=8000
-)
-
-BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
-BIG_MULTI_GPU_PARAMS.update(
-    layer_postprocess_dropout=0.3,
-    learning_rate_warmup_steps=8000
-)
-
-# Parameters for testing the model
-TINY_PARAMS = BASE_PARAMS.copy()
-TINY_PARAMS.update(
-    default_batch_size=1024,
-    default_batch_size_tpu=1024,
-    hidden_size=32,
-    num_heads=4,
-    filter_size=256,
-)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_utils.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_utils.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Transformer model helper methods."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-import tensorflow as tf
-
-_NEG_INF = -1e9
-
-
-def get_position_encoding(
-    length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
-  """Return positional encoding.
-
-  Calculates the position encoding as a mix of sine and cosine functions with
-  geometrically increasing wavelengths.
-  Defined and formulized in Attention is All You Need, section 3.5.
-
-  Args:
-    length: Sequence length.
-    hidden_size: Size of the
-    min_timescale: Minimum scale that will be applied at each position
-    max_timescale: Maximum scale that will be applied at each position
-
-  Returns:
-    Tensor with shape [length, hidden_size]
-  """
-  position = tf.to_float(tf.range(length))
-  num_timescales = hidden_size // 2
-  log_timescale_increment = (
-      math.log(float(max_timescale) / float(min_timescale)) /
-      (tf.to_float(num_timescales) - 1))
-  inv_timescales = min_timescale * tf.exp(
-      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
-  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
-  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
-  return signal
-
-
-def get_decoder_self_attention_bias(length):
-  """Calculate bias for decoder that maintains model's autoregressive property.
-
-  Creates a tensor that masks out locations that correspond to illegal
-  connections, so prediction at position i cannot draw information from future
-  positions.
-
-  Args:
-    length: int length of sequences in batch.
-
-  Returns:
-    float tensor of shape [1, 1, length, length]
-  """
-  with tf.name_scope("decoder_self_attention_bias"):
-    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
-    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
-    decoder_bias = _NEG_INF * (1.0 - valid_locs)
-  return decoder_bias
-
-
-def get_padding(x, padding_value=0):
-  """Return float tensor representing the padding values in x.
-
-  Args:
-    x: int tensor with any shape
-    padding_value: int value that
-
-  Returns:
-    flaot tensor with same shape as x containing values 0 or 1.
-      0 -> non-padding, 1 -> padding
-  """
-  with tf.name_scope("padding"):
-    return tf.to_float(tf.equal(x, padding_value))
-
-
-def get_padding_bias(x):
-  """Calculate bias tensor from padding values in tensor.
-
-  Bias tensor that is added to the pre-softmax multi-headed attention logits,
-  which has shape [batch_size, num_heads, length, length]. The tensor is zero at
-  non-padding locations, and -1e9 (negative infinity) at padding locations.
-
-  Args:
-    x: int tensor with shape [batch_size, length]
-
-  Returns:
-    Attention bias tensor of shape [batch_size, 1, 1, length].
-  """
-  with tf.name_scope("attention_bias"):
-    padding = get_padding(x)
-    attention_bias = padding * _NEG_INF
-    attention_bias = tf.expand_dims(
-        tf.expand_dims(attention_bias, axis=1), axis=1)
-  return attention_bias
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_utils_test.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/model_utils_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test Transformer model helper methods."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.transformer.model import model_utils
-
-# Bias value the tests below expect at masked (padding / future) positions.
-NEG_INF = -1e9
-
-
-class ModelUtilsTest(tf.test.TestCase):
-  """Checks the padding and attention-bias helpers in model_utils."""
-
-  def test_get_padding(self):
-    # Zero entries must be flagged with 1, every other entry with 0.
-    tokens = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
-    padding_op = model_utils.get_padding(tokens, padding_value=0)
-    with self.test_session() as sess:
-      actual = sess.run(padding_op)
-
-    expected = [[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]]
-    self.assertAllEqual(expected, actual)
-
-  def test_get_padding_bias(self):
-    # The bias is NEG_INF exactly where the input is zero, and has two
-    # singleton axes inserted after the batch axis.
-    tokens = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
-    bias_op = model_utils.get_padding_bias(tokens)
-    shape_op = tf.shape(bias_op)
-    flat_op = tf.reshape(bias_op, [3, 5])
-    with self.test_session() as sess:
-      flat_bias, shape = sess.run((flat_op, shape_op))
-
-    expected_bias = [[0, NEG_INF, NEG_INF, NEG_INF, 0],
-                     [0, 0, NEG_INF, NEG_INF, NEG_INF],
-                     [NEG_INF, 0, 0, NEG_INF, 0]]
-    self.assertAllEqual(expected_bias, flat_bias)
-    self.assertAllEqual([3, 1, 1, 5], shape)
-
-  def test_get_decoder_self_attention_bias(self):
-    # Row i may only see columns 0..i; everything to the right is NEG_INF.
-    seq_len = 5
-    bias_op = model_utils.get_decoder_self_attention_bias(seq_len)
-    with self.test_session() as sess:
-      actual = sess.run(bias_op)
-
-    expected = [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
-                  [0, 0, NEG_INF, NEG_INF, NEG_INF],
-                  [0, 0, 0, NEG_INF, NEG_INF],
-                  [0, 0, 0, 0, NEG_INF],
-                  [0, 0, 0, 0, 0]]]]
-    self.assertAllEqual(expected, actual)
-
-
-# Run the test cases defined above when this file is executed as a script.
-if __name__ == "__main__":
-  tf.test.main()
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/transformer.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/model/transformer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Defines the Transformer model, and its encoder and decoder stacks.
-
-Model paper: https://arxiv.org/pdf/1706.03762.pdf
-Transformer model code source: https://github.com/tensorflow/tensor2tensor
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.transformer.model import attention_layer
-from official.transformer.model import beam_search
-from official.transformer.model import embedding_layer
-from official.transformer.model import ffn_layer
-from official.transformer.model import model_utils
-from official.transformer.utils.tokenizer import EOS_ID
-
-# Large negative number, presumably a stand-in for negative infinity in
-# attention masks. NOTE(review): no use is visible in this part of the file --
-# confirm it is still referenced.
-_NEG_INF = -1e9
-
-
-class Transformer(object):
-  """Transformer model for sequence to sequence data.
-
-  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
-
-  The Transformer model consists of an encoder and decoder. The input is an int
-  sequence (or a batch of sequences). The encoder produces a continuous
-  representation, and the decoder uses the encoder output to generate
-  probabilities for the output sequence.
-  """
-
-  def __init__(self, params, train):
-    """Initialize layers to build Transformer model.
-
-    Args:
-      params: hyperparameter object defining layer sizes, dropout values, etc.
-      train: boolean indicating whether the model is in training mode. Used to
-        determine if dropout layers should be added.
-    """
-    self.train = train
-    self.params = params
-
-    # One layer object is used for the input embeddings (encoder and decoder)
-    # and for the final projection to logits (see the `.linear` calls below).
-    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
-        params["vocab_size"], params["hidden_size"],
-        method="matmul" if params["tpu"] else "gather")
-    self.encoder_stack = EncoderStack(params, train)
-    self.decoder_stack = DecoderStack(params, train)
-
-  def __call__(self, inputs, targets=None):
-    """Calculate target logits or inferred target sequences.
-
-    Args:
-      inputs: int tensor with shape [batch_size, input_length].
-      targets: None or int tensor with shape [batch_size, target_length].
-
-    Returns:
-      If targets is defined, then return logits for each word in the target
-      sequence. float tensor with shape [batch_size, target_length, vocab_size]
-      If target is none, then generate output sequence one token at a time.
-        returns a dictionary {
-          outputs: [batch_size, decoded length]
-          scores: [batch_size, float]}
-    """
-    # Variance scaling is used here because it seems to work in many problems.
-    # Other reasonable initializers may also work just as well.
-    initializer = tf.variance_scaling_initializer(
-        self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
-    with tf.variable_scope("Transformer", initializer=initializer):
-      # Calculate attention bias for encoder self-attention and decoder
-      # multi-headed attention layers.
-      attention_bias = model_utils.get_padding_bias(inputs)
-
-      # Run the inputs through the encoder layer to map the symbol
-      # representations to continuous representations.
-      encoder_outputs = self.encode(inputs, attention_bias)
-
-      # Generate output sequence if targets is None, or return logits if target
-      # sequence is known.
-      if targets is None:
-        return self.predict(encoder_outputs, attention_bias)
-      else:
-        logits = self.decode(targets, encoder_outputs, attention_bias)
-        return logits
-
-  def encode(self, inputs, attention_bias):
-    """Generate continuous representation for inputs.
-
-    Args:
-      inputs: int tensor with shape [batch_size, input_length].
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
-
-    Returns:
-      float tensor with shape [batch_size, input_length, hidden_size]
-    """
-    with tf.name_scope("encode"):
-      # Prepare inputs to the layer stack by adding positional encodings and
-      # applying dropout.
-      embedded_inputs = self.embedding_softmax_layer(inputs)
-      # Padding mask of the raw ids; handed to the encoder stack, which passes
-      # it on to its feed-forward sublayers.
-      inputs_padding = model_utils.get_padding(inputs)
-
-      with tf.name_scope("add_pos_encoding"):
-        length = tf.shape(embedded_inputs)[1]
-        pos_encoding = model_utils.get_position_encoding(
-            length, self.params["hidden_size"])
-        encoder_inputs = embedded_inputs + pos_encoding
-
-      if self.train:
-        # tf.nn.dropout takes a keep probability, hence 1 - dropout rate.
-        encoder_inputs = tf.nn.dropout(
-            encoder_inputs, 1 - self.params["layer_postprocess_dropout"])
-
-      return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
-
-  def decode(self, targets, encoder_outputs, attention_bias):
-    """Generate logits for each value in the target sequence.
-
-    Args:
-      targets: target values for the output sequence.
-        int tensor with shape [batch_size, target_length]
-      encoder_outputs: continuous representation of input sequence.
-        float tensor with shape [batch_size, input_length, hidden_size]
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
-
-    Returns:
-      float32 tensor with shape [batch_size, target_length, vocab_size]
-    """
-    with tf.name_scope("decode"):
-      # Prepare inputs to decoder layers by shifting targets, adding positional
-      # encoding and applying dropout.
-      decoder_inputs = self.embedding_softmax_layer(targets)
-      with tf.name_scope("shift_targets"):
-        # Shift targets to the right, and remove the last element
-        # (one zero step is padded at the front of the length axis, then the
-        # final step is sliced off, so the length is unchanged).
-        decoder_inputs = tf.pad(
-            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
-      with tf.name_scope("add_pos_encoding"):
-        length = tf.shape(decoder_inputs)[1]
-        decoder_inputs += model_utils.get_position_encoding(
-            length, self.params["hidden_size"])
-      if self.train:
-        # tf.nn.dropout takes a keep probability, hence 1 - dropout rate.
-        decoder_inputs = tf.nn.dropout(
-            decoder_inputs, 1 - self.params["layer_postprocess_dropout"])
-
-      # Run values
-      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-          length)
-      outputs = self.decoder_stack(
-          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
-          attention_bias)
-      logits = self.embedding_softmax_layer.linear(outputs)
-      return logits
-
-  def _get_symbols_to_logits_fn(self, max_decode_length):
-    """Returns a decoding function that calculates logits of the next tokens.
-
-    Args:
-      max_decode_length: maximum number of decoding steps; sizes the
-        precomputed position encoding and decoder self-attention bias.
-
-    Returns:
-      A function (ids, i, cache) -> (logits, cache), passed to beam search as
-      its symbols_to_logits_fn.
-    """
-
-    # Both tensors are built once here and sliced per step inside the closure.
-    timing_signal = model_utils.get_position_encoding(
-        max_decode_length + 1, self.params["hidden_size"])
-    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-        max_decode_length)
-
-    def symbols_to_logits_fn(ids, i, cache):
-      """Generate logits for next potential IDs.
-
-      Args:
-        ids: Current decoded sequences.
-          int tensor with shape [batch_size * beam_size, i + 1]
-        i: Loop index
-        cache: dictionary of values storing the encoder output, encoder-decoder
-          attention bias, and previous decoder attention values.
-
-      Returns:
-        Tuple of
-          (logits with shape [batch_size * beam_size, vocab_size],
-           updated cache values)
-      """
-      # Set decoder input to the last generated IDs
-      decoder_input = ids[:, -1:]
-
-      # Preprocess decoder input by getting embeddings and adding timing signal.
-      decoder_input = self.embedding_softmax_layer(decoder_input)
-      decoder_input += timing_signal[i:i + 1]
-
-      # Row i of the causal bias, restricted to the i + 1 positions so far.
-      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
-      decoder_outputs = self.decoder_stack(
-          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
-          cache.get("encoder_decoder_attention_bias"), cache)
-      logits = self.embedding_softmax_layer.linear(decoder_outputs)
-      # Drop the length-1 sequence axis.
-      logits = tf.squeeze(logits, axis=[1])
-      return logits, cache
-    return symbols_to_logits_fn
-
-  def predict(self, encoder_outputs, encoder_decoder_attention_bias):
-    """Return predicted sequence.
-
-    Args:
-      encoder_outputs: output of the encoder stack; its first two dimensions
-        are read as batch size and input length.
-      encoder_decoder_attention_bias: bias stored in the cache for the
-        encoder-decoder attention layers.
-
-    Returns:
-      Dictionary with keys "outputs" (top decoded ids per batch element) and
-      "scores" (the matching scores from beam search).
-    """
-    batch_size = tf.shape(encoder_outputs)[0]
-    input_length = tf.shape(encoder_outputs)[1]
-    max_decode_length = input_length + self.params["extra_decode_length"]
-
-    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
-
-    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
-    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
-
-    # Create cache storing decoder attention values for each layer.
-    # The k/v tensors start with a zero-length sequence axis.
-    cache = {
-        "layer_%d" % layer: {
-            "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
-            "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
-        } for layer in range(self.params["num_hidden_layers"])}
-
-    # Add encoder output and attention bias to the cache.
-    cache["encoder_outputs"] = encoder_outputs
-    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
-
-    # Use beam search to find the top beam_size sequences and scores.
-    decoded_ids, scores = beam_search.sequence_beam_search(
-        symbols_to_logits_fn=symbols_to_logits_fn,
-        initial_ids=initial_ids,
-        initial_cache=cache,
-        vocab_size=self.params["vocab_size"],
-        beam_size=self.params["beam_size"],
-        alpha=self.params["alpha"],
-        max_decode_length=max_decode_length,
-        eos_id=EOS_ID)
-
-    # Get the top sequence for each batch element
-    # (index 0 on the second axis; the first entry of the last axis is dropped,
-    # presumably the initial id -- TODO confirm against beam_search).
-    top_decoded_ids = decoded_ids[:, 0, 1:]
-    top_scores = scores[:, 0]
-
-    return {"outputs": top_decoded_ids, "scores": top_scores}
-
-
-class LayerNormalization(tf.layers.Layer):
-  """Normalizes over the last axis, then applies a learned scale and bias."""
-
-  def __init__(self, hidden_size):
-    super(LayerNormalization, self).__init__()
-    self.hidden_size = hidden_size
-
-  def build(self, _):
-    # Scale starts at one and bias at zero.
-    param_shape = [self.hidden_size]
-    self.scale = tf.get_variable(
-        "layer_norm_scale", param_shape, initializer=tf.ones_initializer())
-    self.bias = tf.get_variable(
-        "layer_norm_bias", param_shape, initializer=tf.zeros_initializer())
-    self.built = True
-
-  def call(self, x, epsilon=1e-6):
-    # Mean and variance are taken over the last axis only.
-    mu = tf.reduce_mean(x, axis=[-1], keepdims=True)
-    squared_deviation = tf.square(x - mu)
-    var = tf.reduce_mean(squared_deviation, axis=[-1], keepdims=True)
-    centered = x - mu
-    # epsilon keeps rsqrt finite when the variance is zero.
-    inverse_std = tf.rsqrt(var + epsilon)
-    normalized = centered * inverse_std
-    return normalized * self.scale + self.bias
-
-
-class PrePostProcessingWrapper(object):
-  """Wraps a layer with layer norm before it, dropout and residual after."""
-
-  def __init__(self, layer, params, train):
-    self.layer, self.train = layer, train
-    self.postprocess_dropout = params["layer_postprocess_dropout"]
-
-    # Normalization applied to the input before the wrapped layer runs.
-    self.layer_norm = LayerNormalization(params["hidden_size"])
-
-  def __call__(self, x, *args, **kwargs):
-    # Pre-processing: normalize, then run the wrapped layer on the result.
-    normalized = self.layer_norm(x)
-    sublayer_output = self.layer(normalized, *args, **kwargs)
-
-    # Post-processing: dropout (training only) followed by the residual add.
-    if self.train:
-      keep_prob = 1 - self.postprocess_dropout
-      sublayer_output = tf.nn.dropout(sublayer_output, keep_prob)
-    return x + sublayer_output
-
-
-class EncoderStack(tf.layers.Layer):
-  """Transformer encoder stack.
-
-  Holds N identical layers followed by a final layer normalization. Each layer
-  consists of two wrapped sublayers:
-    1. Self-attention layer
-    2. Feedforward network (which is 2 fully-connected layers)
-  """
-
-  def __init__(self, params, train):
-    super(EncoderStack, self).__init__()
-    self.layers = []
-    for _ in range(params["num_hidden_layers"]):
-      # Build both raw sublayers first, then wrap them as a pair.
-      attention = attention_layer.SelfAttention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      ffn = ffn_layer.FeedFowardNetwork(
-          params["hidden_size"], params["filter_size"],
-          params["relu_dropout"], train, params["allow_ffn_pad"])
-
-      wrapped_attention = PrePostProcessingWrapper(attention, params, train)
-      wrapped_ffn = PrePostProcessingWrapper(ffn, params, train)
-      self.layers.append([wrapped_attention, wrapped_ffn])
-
-    # Normalization applied once to the output of the last layer.
-    self.output_normalization = LayerNormalization(params["hidden_size"])
-
-  def call(self, encoder_inputs, attention_bias, inputs_padding):
-    """Run the inputs through every encoder layer and normalize the result.
-
-    Args:
-      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
-      attention_bias: bias for the encoder self-attention layer.
-        [batch_size, 1, 1, input_length]
-      inputs_padding: padding information for the inputs; passed unchanged to
-        each feed-forward sublayer.
-
-    Returns:
-      Output of encoder layer stack.
-      float32 tensor with shape [batch_size, input_length, hidden_size]
-    """
-    hidden = encoder_inputs
-    for index, (attention_sublayer, ffn_sublayer) in enumerate(self.layers):
-      with tf.variable_scope("layer_%d" % index):
-        with tf.variable_scope("self_attention"):
-          hidden = attention_sublayer(hidden, attention_bias)
-        with tf.variable_scope("ffn"):
-          hidden = ffn_sublayer(hidden, inputs_padding)
-
-    return self.output_normalization(hidden)
-
-
-class DecoderStack(tf.layers.Layer):
-  """Transformer decoder stack.
-
-  Holds N identical layers followed by a final layer normalization. Each layer
-  consists of three wrapped sublayers:
-    1. Self-attention layer
-    2. Multi-headed attention layer combining encoder outputs with results from
-       the previous self-attention layer.
-    3. Feedforward network (2 fully-connected layers)
-  """
-
-  def __init__(self, params, train):
-    super(DecoderStack, self).__init__()
-    self.layers = []
-    for _ in range(params["num_hidden_layers"]):
-      # Build the three raw sublayers first, then wrap them as a group.
-      self_attention = attention_layer.SelfAttention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      encdec_attention = attention_layer.Attention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      ffn = ffn_layer.FeedFowardNetwork(
-          params["hidden_size"], params["filter_size"],
-          params["relu_dropout"], train, params["allow_ffn_pad"])
-
-      wrapped_self_attention = PrePostProcessingWrapper(
-          self_attention, params, train)
-      wrapped_encdec_attention = PrePostProcessingWrapper(
-          encdec_attention, params, train)
-      wrapped_ffn = PrePostProcessingWrapper(ffn, params, train)
-      self.layers.append(
-          [wrapped_self_attention, wrapped_encdec_attention, wrapped_ffn])
-
-    # Normalization applied once to the output of the last layer.
-    self.output_normalization = LayerNormalization(params["hidden_size"])
-
-  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
-           attention_bias, cache=None):
-    """Run the inputs through every decoder layer and normalize the result.
-
-    Args:
-      decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
-      encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
-      decoder_self_attention_bias: bias for decoder self-attention layer.
-        [1, 1, target_len, target_length]
-      attention_bias: bias for encoder-decoder attention layer.
-        [batch_size, 1, 1, input_length]
-      cache: (Used for fast decoding) A nested dictionary storing previous
-        decoder self-attention values. The items are:
-          {layer_n: {"k": tensor with shape [batch_size, i, key_channels],
-                     "v": tensor with shape [batch_size, i, value_channels]},
-           ...}
-
-    Returns:
-      Output of decoder layer stack.
-      float32 tensor with shape [batch_size, target_length, hidden_size]
-    """
-    hidden = decoder_inputs
-    for index, sublayers in enumerate(self.layers):
-      self_attention, encdec_attention, ffn = sublayers
-
-      # The per-layer cache entry shares its key with the variable scope name.
-      scope_name = "layer_%d" % index
-      if cache is not None:
-        layer_cache = cache[scope_name]
-      else:
-        layer_cache = None
-
-      with tf.variable_scope(scope_name):
-        with tf.variable_scope("self_attention"):
-          hidden = self_attention(
-              hidden, decoder_self_attention_bias, cache=layer_cache)
-        with tf.variable_scope("encdec_attention"):
-          hidden = encdec_attention(hidden, encoder_outputs, attention_bias)
-        with tf.variable_scope("ffn"):
-          hidden = ffn(hidden)
-
-    return self.output_normalization(hidden)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/test_data/newstest2014.de
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/test_data/newstest2014.de
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/test_data/newstest2014.en
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/test_data/newstest2014.en
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/transformer_main.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/transformer_main.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Train and evaluate the Transformer model.
-
-See README for description of setting the training schedule and evaluating the
-BLEU score.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-
-# pylint: disable=g-bad-import-order
-from six.moves import xrange  # pylint: disable=redefined-builtin
-from absl import app as absl_app
-from absl import flags
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.transformer import compute_bleu
-from official.transformer import translate
-from official.transformer.model import model_params
-from official.transformer.model import transformer
-from official.transformer.utils import dataset
-from official.transformer.utils import metrics
-from official.transformer.utils import schedule
-from official.transformer.utils import tokenizer
-from official.utils.accelerator import tpu as tpu_util
-from official.utils.export import export
-from official.utils.flags import core as flags_core
-from official.utils.logs import hooks_helper
-from official.utils.logs import logger
-from official.utils.misc import distribution_utils
-from official.utils.misc import model_helpers
-
-# Maps a parameter-set name to the hyperparameter set defined in model_params.
-PARAMS_MAP = {
-    "tiny": model_params.TINY_PARAMS,
-    "base": model_params.BASE_PARAMS,
-    "big": model_params.BIG_PARAMS,
-}
-
-
-# NOTE(review): presumably the default number of training epochs; it is not
-# referenced in this part of the file -- confirm.
-DEFAULT_TRAIN_EPOCHS = 10
-# Effectively unbounded iteration count. run_loop assigns it to
-# schedule_manager.train_eval_iterations when a BLEU threshold is given.
-INF = int(1e9)
-# Subdirectory of the estimator's model_dir that receives BLEU summaries.
-BLEU_DIR = "bleu"
-
-# Dictionary containing tensors that are logged by the logging hooks. Each item
-# maps a string to the tensor name.
-TENSORS_TO_LOG = {
-    "learning_rate": "model/get_train_op/learning_rate/learning_rate",
-    "cross_entropy_loss": "model/cross_entropy"}
-
-
-def model_fn(features, labels, mode, params):
-  """Defines how to train, evaluate and predict from the transformer model.
-
-  Args:
-    features: model inputs; passed to the Transformer as `inputs`.
-    labels: target sequences; passed to the Transformer as `targets` (None in
-      prediction mode).
-    mode: one of the tf.estimator.ModeKeys values.
-    params: hyperparameter dictionary; also read for "use_tpu",
-      "label_smoothing", "vocab_size" and "model_dir".
-
-  Returns:
-    A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec for
-    evaluation and training when params["use_tpu"] is set.
-
-  Raises:
-    NotImplementedError: in prediction mode when params["use_tpu"] is set.
-  """
-  # Everything is built under the "model" scope; the tensor names listed in
-  # TENSORS_TO_LOG rely on this prefix.
-  with tf.variable_scope("model"):
-    inputs, targets = features, labels
-
-    # Create model and get output logits.
-    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
-
-    # With targets=None the model returns its prediction dictionary here
-    # instead of logits.
-    logits = model(inputs, targets)
-
-    # When in prediction mode, the labels/targets is None. The model output
-    # is the prediction
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      if params["use_tpu"]:
-        raise NotImplementedError("Prediction is not yet supported on TPUs.")
-      return tf.estimator.EstimatorSpec(
-          tf.estimator.ModeKeys.PREDICT,
-          predictions=logits,
-          export_outputs={
-              "translate": tf.estimator.export.PredictOutput(logits)
-          })
-
-    # Explicitly set the shape of the logits for XLA (TPU). This is needed
-    # because the logits are passed back to the host VM CPU for metric
-    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
-    # it is known from Transformer that the first two dimensions of logits
-    # are the dimensions of targets. Note that the ambiguous shape of logits is
-    # not a problem when computing xentropy, because padded_cross_entropy_loss
-    # resolves the shape on the TPU.
-    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])
-
-    # Calculate model loss.
-    # xentropy contains the cross entropy loss of every nonpadding token in the
-    # targets.
-    xentropy, weights = metrics.padded_cross_entropy_loss(
-        logits, targets, params["label_smoothing"], params["vocab_size"])
-    # Weighted mean: total cross entropy divided by the total weight.
-    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
-
-    # Save loss as named tensor that will be logged with the logging hook.
-    tf.identity(loss, "cross_entropy")
-
-    if mode == tf.estimator.ModeKeys.EVAL:
-      if params["use_tpu"]:
-        # host call functions should only have tensors as arguments.
-        # This lambda pre-populates params so that metric_fn is
-        # TPUEstimator compliant.
-        metric_fn = lambda logits, labels: (
-            metrics.get_eval_metrics(logits, labels, params=params))
-        eval_metrics = (metric_fn, [logits, labels])
-        return tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, loss=loss, predictions={"predictions": logits},
-            eval_metrics=eval_metrics)
-      return tf.estimator.EstimatorSpec(
-          mode=mode, loss=loss, predictions={"predictions": logits},
-          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
-    else:
-      # Any mode other than PREDICT or EVAL is handled as training.
-      train_op, metric_dict = get_train_op_and_metrics(loss, params)
-
-      # Epochs can be quite long. This gives some intermediate information
-      # in TensorBoard.
-      metric_dict["minibatch_loss"] = loss
-      if params["use_tpu"]:
-        # On TPU the scalars are reported through a host call instead of
-        # record_scalars.
-        return tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, loss=loss, train_op=train_op,
-            host_call=tpu_util.construct_scalar_host_call(
-                metric_dict=metric_dict, model_dir=params["model_dir"],
-                prefix="training/")
-        )
-      record_scalars(metric_dict)
-      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
-
-
-def record_scalars(metric_dict):
-  """Emit one tf.contrib.summary scalar per (name, tensor) item."""
-  for summary_name, summary_tensor in metric_dict.items():
-    tf.contrib.summary.scalar(name=summary_name, tensor=summary_tensor)
-
-
-def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
-  """Calculate learning rate with linear warmup and rsqrt decay.
-
-  Args:
-    learning_rate: base learning rate that the schedule scales.
-    hidden_size: model hidden size; the rate is scaled by hidden_size ** -0.5.
-    learning_rate_warmup_steps: number of steps over which the rate ramps up
-      linearly.
-
-  Returns:
-    Tensor holding the learning rate for the current global step.
-  """
-  with tf.name_scope("learning_rate"):
-    warmup_steps = tf.to_float(learning_rate_warmup_steps)
-    step = tf.to_float(tf.train.get_or_create_global_step())
-
-    learning_rate *= (hidden_size ** -0.5)
-    # Apply linear warmup
-    # (the factor grows from 0 to 1 over warmup_steps, then stays at 1).
-    learning_rate *= tf.minimum(1.0, step / warmup_steps)
-    # Apply rsqrt decay
-    # (constant at rsqrt(warmup_steps) during warmup, rsqrt(step) afterwards).
-    learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
-
-    # Create a named tensor that will be logged using the logging hook.
-    # The full name includes the variable scope and name scope. In this case,
-    # the name is model/get_train_op/learning_rate/learning_rate
-    tf.identity(learning_rate, "learning_rate")
-
-    return learning_rate
-
-
-def get_train_op_and_metrics(loss, params):
-  """Generate training op and metrics to save in TensorBoard.
-
-  Args:
-    loss: scalar loss tensor to minimize.
-    params: hyperparameter dictionary; read for the learning-rate schedule,
-      the Adam settings, "use_tpu" and "tpu".
-
-  Returns:
-    Tuple (train_op, train_metrics). train_metrics maps metric names to
-    tensors: always "learning_rate", plus "global_norm/gradient_norm" when not
-    running on TPU.
-  """
-  # The logged learning-rate tensor name in TENSORS_TO_LOG depends on this
-  # scope name.
-  with tf.variable_scope("get_train_op"):
-    learning_rate = get_learning_rate(
-        learning_rate=params["learning_rate"],
-        hidden_size=params["hidden_size"],
-        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
-
-    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
-    # than the TF core Adam optimizer.
-    optimizer = tf.contrib.opt.LazyAdamOptimizer(
-        learning_rate,
-        beta1=params["optimizer_adam_beta1"],
-        beta2=params["optimizer_adam_beta2"],
-        epsilon=params["optimizer_adam_epsilon"])
-
-    # The cross-shard wrapper is skipped when the TPU is the local one.
-    if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
-      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
-
-    # Calculate and apply gradients using LazyAdamOptimizer.
-    global_step = tf.train.get_global_step()
-    tvars = tf.trainable_variables()
-    gradients = optimizer.compute_gradients(
-        loss, tvars, colocate_gradients_with_ops=True)
-    minimize_op = optimizer.apply_gradients(
-        gradients, global_step=global_step, name="train")
-    # Group ops from the UPDATE_OPS collection with the optimizer step so they
-    # run together.
-    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    train_op = tf.group(minimize_op, update_ops)
-
-    train_metrics = {"learning_rate": learning_rate}
-
-    if not params["use_tpu"]:
-      # gradient norm is not included as a summary when running on TPU, as
-      # it can cause instability between the TPU and the host controller.
-      # `gradients` holds (gradient, variable) pairs; zip(*...)[0] keeps only
-      # the gradients.
-      gradient_norm = tf.global_norm(list(zip(*gradients))[0])
-      train_metrics["global_norm/gradient_norm"] = gradient_norm
-
-    return train_op, train_metrics
-
-
-def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
-  """Translate file and report the cased and uncased bleu scores.
-
-  Args:
-    estimator: estimator passed to translate.translate_file.
-    subtokenizer: subtokenizer passed to translate.translate_file.
-    bleu_source: path of the file to translate.
-    bleu_ref: path of the reference translations to score against.
-
-  Returns:
-    Tuple (uncased_score, cased_score).
-  """
-  # Create temporary file to store translation. Only the name is needed, so
-  # the handle is closed right away instead of staying open for the whole call.
-  tmp = tempfile.NamedTemporaryFile(delete=False)
-  tmp_filename = tmp.name
-  tmp.close()
-
-  try:
-    translate.translate_file(
-        estimator, subtokenizer, bleu_source, output_file=tmp_filename,
-        print_all_translations=False)
-
-    # Compute uncased and cased bleu scores.
-    uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
-    cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
-  finally:
-    # Remove the temporary file even when translation or scoring fails.
-    os.remove(tmp_filename)
-  return uncased_score, cased_score
-
-
-def get_global_step(estimator):
-  """Return the step number at the end of the latest checkpoint's path."""
-  checkpoint_path = estimator.latest_checkpoint()
-  # The step is the text after the last "-" in the path.
-  path_parts = checkpoint_path.split("-")
-  return int(path_parts[-1])
-
-
-def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file):
-  """Calculate and record the BLEU score.
-
-  Args:
-    estimator: estimator used to translate bleu_source.
-    bleu_source: path of the file to translate.
-    bleu_ref: path of the reference translations.
-    vocab_file: path of the vocab file used to build the subtokenizer.
-
-  Returns:
-    Tuple (uncased_score, cased_score).
-  """
-  subtokenizer = tokenizer.Subtokenizer(vocab_file)
-
-  uncased_score, cased_score = translate_and_compute_bleu(
-      estimator, subtokenizer, bleu_source, bleu_ref)
-
-  # Log with %f: %d would cut the fractional part off the scores.
-  tf.logging.info("Bleu score (uncased): %f", uncased_score)
-  tf.logging.info("Bleu score (cased): %f", cased_score)
-  return uncased_score, cased_score
-
-
-def _validate_file(filepath):
-  """Raise NotFoundError unless filepath exists according to tf.gfile."""
-  if tf.gfile.Exists(filepath):
-    return
-  raise tf.errors.NotFoundError(None, None, "File %s not found." % filepath)
-
-
-def run_loop(
-    estimator, schedule_manager, train_hooks=None, benchmark_logger=None,
-    bleu_source=None, bleu_ref=None, bleu_threshold=None, vocab_file=None):
-  """Train and evaluate model, and optionally compute model's BLEU score.
-
-  **Step vs. Epoch vs. Iteration**
-
-  Steps and epochs are canonical terms used in TensorFlow and general machine
-  learning. They are used to describe running a single process (train/eval):
-    - Step refers to running the process through a single or batch of examples.
-    - Epoch refers to running the process through an entire dataset.
-
-  E.g. training a dataset with 100 examples. The dataset is
-  divided into 20 batches with 5 examples per batch. A single training step
-  trains the model on one batch. After 20 training steps, the model will have
-  trained on every batch in the dataset, or, in other words, one epoch.
-
-  Meanwhile, iteration is used in this implementation to describe running
-  multiple processes (training and eval).
-    - A single iteration:
-      1. trains the model for a specific number of steps or epochs.
-      2. evaluates the model.
-      3. (if source and ref files are provided) compute BLEU score.
-
-  This function runs through multiple train+eval+bleu iterations.
-
-  Args:
-    estimator: tf.Estimator containing model to train.
-    schedule_manager: A schedule.Manager object to guide the run loop.
-    train_hooks: List of hooks to pass to the estimator during training.
-    benchmark_logger: a BenchmarkLogger object that logs evaluation data
-    bleu_source: File containing text to be translated for BLEU calculation.
-    bleu_ref: File containing reference translations for BLEU calculation.
-    bleu_threshold: minimum BLEU score before training is stopped.
-    vocab_file: Path to vocab file that will be used to subtokenize bleu_source.
-
-  Raises:
-    ValueError: if both or none of single_iteration_train_steps and
-      single_iteration_train_epochs were defined.
-    NotFoundError: if the vocab file or bleu files don't exist.
-  """
-  if bleu_source:
-    _validate_file(bleu_source)
-  if bleu_ref:
-    _validate_file(bleu_ref)
-  if vocab_file:
-    _validate_file(vocab_file)
-
-  evaluate_bleu = bleu_source is not None and bleu_ref is not None
-  if evaluate_bleu and schedule_manager.use_tpu:
-    raise ValueError("BLEU score can not be computed when training with a TPU, "
-                     "as it requires estimator.predict which is not yet "
-                     "supported.")
-
-  # Print details of training schedule.
-  tf.logging.info("Training schedule:")
-  tf.logging.info(
-      "\t1. Train for {}".format(schedule_manager.train_increment_str))
-  tf.logging.info("\t2. Evaluate model.")
-  if evaluate_bleu:
-    tf.logging.info("\t3. Compute BLEU score.")
-    if bleu_threshold is not None:
-      tf.logging.info("Repeat above steps until the BLEU score reaches %f" %
-                      bleu_threshold)
-  if not evaluate_bleu or bleu_threshold is None:
-    tf.logging.info("Repeat above steps %d times." %
-                    schedule_manager.train_eval_iterations)
-
-  if evaluate_bleu:
-    # Create summary writer to log bleu score (values can be displayed in
-    # Tensorboard).
-    bleu_writer = tf.summary.FileWriter(
-        os.path.join(estimator.model_dir, BLEU_DIR))
-    if bleu_threshold is not None:
-      # Change loop stopping condition if bleu_threshold is defined.
-      schedule_manager.train_eval_iterations = INF
-
-  # Loop training/evaluation/bleu cycles
-  for i in xrange(schedule_manager.train_eval_iterations):
-    tf.logging.info("Starting iteration %d" % (i + 1))
-
-    # Train the model for single_iteration_train_steps or until the input fn
-    # runs out of examples (if single_iteration_train_steps is None).
-    estimator.train(
-        dataset.train_input_fn,
-        steps=schedule_manager.single_iteration_train_steps,
-        hooks=train_hooks)
-
-    eval_results = estimator.evaluate(
-        input_fn=dataset.eval_input_fn,
-        steps=schedule_manager.single_iteration_eval_steps)
-
-    tf.logging.info("Evaluation results (iter %d/%d):" %
-                    (i + 1, schedule_manager.train_eval_iterations))
-    tf.logging.info(eval_results)
-    benchmark_logger.log_evaluation_result(eval_results)
-
-    # The results from estimator.evaluate() are measured on an approximate
-    # translation, which utilize the target golden values provided. The actual
-    # bleu score must be computed using the estimator.predict() path, which
-    # outputs translations that are not based on golden values. The translations
-    # are compared to reference file to get the actual bleu score.
-    if evaluate_bleu:
-      uncased_score, cased_score = evaluate_and_log_bleu(
-          estimator, bleu_source, bleu_ref, vocab_file)
-
-      # Write actual bleu scores using summary writer and benchmark logger
-      global_step = get_global_step(estimator)
-      summary = tf.Summary(value=[
-          tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
-          tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
-      ])
-      bleu_writer.add_summary(summary, global_step)
-      bleu_writer.flush()
-      benchmark_logger.log_metric(
-          "bleu_uncased", uncased_score, global_step=global_step)
-      benchmark_logger.log_metric(
-          "bleu_cased", cased_score, global_step=global_step)
-
-      # Stop training if bleu stopping threshold is met.
-      if model_helpers.past_stop_threshold(bleu_threshold, uncased_score):
-        bleu_writer.close()
-        break
-
-
-def define_transformer_flags():
-  """Add flags and flag validators for running transformer_main."""
-  # Add common flags (data_dir, model_dir, train_epochs, etc.).
-  flags_core.define_base()
-  flags_core.define_performance(
-      num_parallel_calls=True,
-      inter_op=False,
-      intra_op=False,
-      synthetic_data=True,
-      max_train_steps=False,
-      dtype=False,
-      all_reduce_alg=True
-  )
-  flags_core.define_benchmark()
-  flags_core.define_device(tpu=True)
-
-  # Set flags from the flags_core module as "key flags" so they're listed when
-  # the '-h' flag is used. Without this line, the flags defined above are
-  # only shown in the full `--helpful` help text.
-  flags.adopt_module_key_flags(flags_core)
-
-  # Add transformer-specific flags
-  flags.DEFINE_enum(
-      name="param_set", short_name="mp", default="big",
-      enum_values=PARAMS_MAP.keys(),
-      help=flags_core.help_wrap(
-          "Parameter set to use when creating and training the model. The "
-          "parameters define the input shape (batch size and max length), "
-          "model configuration (size of embedding, # of hidden layers, etc.), "
-          "and various other settings. The big parameter set increases the "
-          "default batch size, embedding/hidden size, and filter size. For a "
-          "complete list of parameters, please see model/model_params.py."))
-
-  flags.DEFINE_bool(
-      name="static_batch", default=False,
-      help=flags_core.help_wrap(
-          "Whether the batches in the dataset should have static shapes. In "
-          "general, this setting should be False. Dynamic shapes allow the "
-          "inputs to be grouped so that the number of padding tokens is "
-          "minimized, and helps model training. In cases where the input shape "
-          "must be static (e.g. running on TPU), this setting will be ignored "
-          "and static batching will always be used."))
-
-  # Flags for training with steps (may be used for debugging)
-  flags.DEFINE_integer(
-      name="train_steps", short_name="ts", default=None,
-      help=flags_core.help_wrap("The number of steps used to train."))
-  flags.DEFINE_integer(
-      name="steps_between_evals", short_name="sbe", default=1000,
-      help=flags_core.help_wrap(
-          "The Number of training steps to run between evaluations. This is "
-          "used if --train_steps is defined."))
-
-  # BLEU score computation
-  flags.DEFINE_string(
-      name="bleu_source", short_name="bls", default=None,
-      help=flags_core.help_wrap(
-          "Path to source file containing text translate when calculating the "
-          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
-          "Use the flag --stop_threshold to stop the script based on the "
-          "uncased BLEU score."))
-  flags.DEFINE_string(
-      name="bleu_ref", short_name="blr", default=None,
-      help=flags_core.help_wrap(
-          "Path to source file containing text translate when calculating the "
-          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
-          "Use the flag --stop_threshold to stop the script based on the "
-          "uncased BLEU score."))
-  flags.DEFINE_string(
-      name="vocab_file", short_name="vf", default=None,
-      help=flags_core.help_wrap(
-          "Path to subtoken vocabulary file. If data_download.py was used to "
-          "download and encode the training data, look in the data_dir to find "
-          "the vocab file."))
-
-  flags_core.set_defaults(data_dir="/tmp/translate_ende",
-                          model_dir="/tmp/transformer_model",
-                          batch_size=None,
-                          train_epochs=None)
-
-  @flags.multi_flags_validator(
-      ["train_epochs", "train_steps"],
-      message="Both --train_steps and --train_epochs were set. Only one may be "
-              "defined.")
-  def _check_train_limits(flag_dict):
-    return flag_dict["train_epochs"] is None or flag_dict["train_steps"] is None
-
-  @flags.multi_flags_validator(
-      ["bleu_source", "bleu_ref"],
-      message="Both or neither --bleu_source and --bleu_ref must be defined.")
-  def _check_bleu_files(flags_dict):
-    return (flags_dict["bleu_source"] is None) == (
-        flags_dict["bleu_ref"] is None)
-
-  @flags.multi_flags_validator(
-      ["bleu_source", "bleu_ref", "vocab_file"],
-      message="--vocab_file must be defined if --bleu_source and --bleu_ref "
-              "are defined.")
-  def _check_bleu_vocab_file(flags_dict):
-    if flags_dict["bleu_source"] and flags_dict["bleu_ref"]:
-      return flags_dict["vocab_file"] is not None
-    return True
-
-  @flags.multi_flags_validator(
-      ["export_dir", "vocab_file"],
-      message="--vocab_file must be defined if --export_dir is set.")
-  def _check_export_vocab_file(flags_dict):
-    if flags_dict["export_dir"]:
-      return flags_dict["vocab_file"] is not None
-    return True
-
-  flags_core.require_cloud_storage(["data_dir", "model_dir", "export_dir"])
-
-
-def construct_estimator(flags_obj, params, schedule_manager):
-  """Construct an estimator from either Estimator or TPUEstimator.
-
-  Args:
-    flags_obj: The FLAGS object parsed from command line.
-    params: A dict of run specific parameters.
-    schedule_manager: A schedule.Manager object containing the run schedule.
-
-  Returns:
-    An estimator object to be used for training and eval.
-  """
-  if not params["use_tpu"]:
-    distribution_strategy = distribution_utils.get_distribution_strategy(
-        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)
-    return tf.estimator.Estimator(
-        model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
-        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))
-
-  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
-      tpu=flags_obj.tpu,
-      zone=flags_obj.tpu_zone,
-      project=flags_obj.tpu_gcp_project
-  )
-
-  tpu_config = tf.contrib.tpu.TPUConfig(
-      iterations_per_loop=schedule_manager.single_iteration_train_steps,
-      num_shards=flags_obj.num_tpu_shards)
-
-  run_config = tf.contrib.tpu.RunConfig(
-      cluster=tpu_cluster_resolver,
-      model_dir=flags_obj.model_dir,
-      session_config=tf.ConfigProto(
-          allow_soft_placement=True, log_device_placement=True),
-      tpu_config=tpu_config)
-
-  return tf.contrib.tpu.TPUEstimator(
-      model_fn=model_fn,
-      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
-      train_batch_size=schedule_manager.batch_size,
-      eval_batch_size=schedule_manager.batch_size,
-      params={
-          # TPUEstimator needs to populate batch_size itself due to sharding.
-          key: value for key, value in params.items() if key != "batch_size"},
-      config=run_config)
-
-
-def run_transformer(flags_obj):
-  """Create tf.Estimator to train and evaluate transformer model.
-
-  Args:
-    flags_obj: Object containing parsed flag values.
-  """
-  num_gpus = flags_core.get_num_gpus(flags_obj)
-
-  # Add flag-defined parameters to params object
-  params = PARAMS_MAP[flags_obj.param_set]
-  if num_gpus > 1:
-    if flags_obj.param_set == "big":
-      params = model_params.BIG_MULTI_GPU_PARAMS
-    elif flags_obj.param_set == "base":
-      params = model_params.BASE_MULTI_GPU_PARAMS
-
-  params["data_dir"] = flags_obj.data_dir
-  params["model_dir"] = flags_obj.model_dir
-  params["num_parallel_calls"] = flags_obj.num_parallel_calls
-
-  params["tpu"] = flags_obj.tpu
-  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
-  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
-  params["allow_ffn_pad"] = not params["use_tpu"]
-
-  params["use_synthetic_data"] = flags_obj.use_synthetic_data
-
-  # Set batch size parameter, which depends on the availability of
-  # TPU and GPU, and distribution settings.
-  params["batch_size"] = (flags_obj.batch_size or (
-      params["default_batch_size_tpu"] if params["use_tpu"]
-      else params["default_batch_size"]))
-
-  if not params["use_tpu"]:
-    params["batch_size"] = distribution_utils.per_device_batch_size(
-        params["batch_size"], num_gpus)
-
-  schedule_manager = schedule.Manager(
-      train_steps=flags_obj.train_steps,
-      steps_between_evals=flags_obj.steps_between_evals,
-      train_epochs=flags_obj.train_epochs,
-      epochs_between_evals=flags_obj.epochs_between_evals,
-      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
-      batch_size=params["batch_size"],
-      max_length=params["max_length"],
-      use_tpu=params["use_tpu"],
-      num_tpu_shards=flags_obj.num_tpu_shards
-  )
-
-  params["repeat_dataset"] = schedule_manager.repeat_dataset
-
-  model_helpers.apply_clean(flags.FLAGS)
-
-  # Create hooks that log information about the training and metric values
-  train_hooks = hooks_helper.get_train_hooks(
-      flags_obj.hooks,
-      model_dir=flags_obj.model_dir,
-      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
-      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
-      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
-  )
-  benchmark_logger = logger.get_benchmark_logger()
-  benchmark_logger.log_run_info(
-      model_name="transformer",
-      dataset_name="wmt_translate_ende",
-      run_params=params,
-      test_id=flags_obj.benchmark_test_id)
-
-  # Train and evaluate transformer model
-  estimator = construct_estimator(flags_obj, params, schedule_manager)
-  run_loop(
-      estimator=estimator,
-      # Training arguments
-      schedule_manager=schedule_manager,
-      train_hooks=train_hooks,
-      benchmark_logger=benchmark_logger,
-      # BLEU calculation arguments
-      bleu_source=flags_obj.bleu_source,
-      bleu_ref=flags_obj.bleu_ref,
-      bleu_threshold=flags_obj.stop_threshold,
-      vocab_file=flags_obj.vocab_file)
-
-  if flags_obj.export_dir and not params["use_tpu"]:
-    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
-        shape=[None], dtype=tf.int64, batch_size=None)
-    # Export saved model, and save the vocab file as an extra asset. The vocab
-    # file is saved to allow consistent input encoding and output decoding.
-    # (See the "Export trained model" section in the README for an example of
-    # how to use the vocab file.)
-    # Since the model itself does not use the vocab file, this file is saved as
-    # an extra asset rather than a core asset.
-    estimator.export_savedmodel(
-        flags_obj.export_dir, serving_input_fn,
-        assets_extra={"vocab.txt": flags_obj.vocab_file},
-        strip_default_attrs=True)
-
-
-def main(_):
-  with logger.benchmark_context(flags.FLAGS):
-    run_transformer(flags.FLAGS)
-
-
-if __name__ == "__main__":
-  tf.logging.set_verbosity(tf.logging.INFO)
-  define_transformer_flags()
-  absl_app.run(main)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/translate.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/translate.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Translate text or files using trained transformer model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-# pylint: disable=g-bad-import-order
-from absl import app as absl_app
-from absl import flags
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.transformer.utils import tokenizer
-from official.utils.flags import core as flags_core
-
-_DECODE_BATCH_SIZE = 32
-_EXTRA_DECODE_LENGTH = 100
-_BEAM_SIZE = 4
-_ALPHA = 0.6
-
-
-def _get_sorted_inputs(filename):
-  """Read and sort lines from the file sorted by decreasing length.
-
-  Args:
-    filename: String name of file to read inputs from.
-  Returns:
-    Sorted list of inputs, and dictionary mapping original index->sorted index
-    of each element.
-  """
-  with tf.gfile.Open(filename) as f:
-    records = f.read().split("\n")
-    inputs = [record.strip() for record in records]
-    if not inputs[-1]:
-      inputs.pop()
-
-  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
-  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
-
-  sorted_inputs = [None] * len(sorted_input_lens)
-  sorted_keys = [0] * len(sorted_input_lens)
-  for i, (index, _) in enumerate(sorted_input_lens):
-    sorted_inputs[i] = inputs[index]
-    sorted_keys[index] = i
-  return sorted_inputs, sorted_keys
-
-
-def _encode_and_add_eos(line, subtokenizer):
-  """Encode line with subtokenizer, and add EOS id to the end."""
-  return subtokenizer.encode(line) + [tokenizer.EOS_ID]
-
-
-def _trim_and_decode(ids, subtokenizer):
-  """Trim EOS and PAD tokens from ids, and decode to return a string."""
-  try:
-    index = list(ids).index(tokenizer.EOS_ID)
-    return subtokenizer.decode(ids[:index])
-  except ValueError:  # No EOS found in sequence
-    return subtokenizer.decode(ids)
-
-
-def translate_file(
-    estimator, subtokenizer, input_file, output_file=None,
-    print_all_translations=True):
-  """Translate lines in file, and save to output file if specified.
-
-  Args:
-    estimator: tf.Estimator used to generate the translations.
-    subtokenizer: Subtokenizer object for encoding and decoding source and
-       translated lines.
-    input_file: file containing lines to translate
-    output_file: file that stores the generated translations.
-    print_all_translations: If true, all translations are printed to stdout.
-
-  Raises:
-    ValueError: if output file is invalid.
-  """
-  batch_size = _DECODE_BATCH_SIZE
-
-  # Read and sort inputs by length. Keep dictionary (original index-->new index
-  # in sorted list) to write translations in the original order.
-  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
-  num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1
-
-  def input_generator():
-    """Yield encoded strings from sorted_inputs."""
-    for i, line in enumerate(sorted_inputs):
-      if i % batch_size == 0:
-        batch_num = (i // batch_size) + 1
-
-        tf.logging.info("Decoding batch %d out of %d." %
-                        (batch_num, num_decode_batches))
-      yield _encode_and_add_eos(line, subtokenizer)
-
-  def input_fn():
-    """Created batched dataset of encoded inputs."""
-    ds = tf.data.Dataset.from_generator(
-        input_generator, tf.int64, tf.TensorShape([None]))
-    ds = ds.padded_batch(batch_size, [None])
-    return ds
-
-  translations = []
-  for i, prediction in enumerate(estimator.predict(input_fn)):
-    translation = _trim_and_decode(prediction["outputs"], subtokenizer)
-    translations.append(translation)
-
-    if print_all_translations:
-      tf.logging.info("Translating:\n\tInput: %s\n\tOutput: %s" %
-                      (sorted_inputs[i], translation))
-
-  # Write translations in the order they appeared in the original file.
-  if output_file is not None:
-    if tf.gfile.IsDirectory(output_file):
-      raise ValueError("File output is a directory, will not save outputs to "
-                       "file.")
-    tf.logging.info("Writing to file %s" % output_file)
-    with tf.gfile.Open(output_file, "w") as f:
-      for i in sorted_keys:
-        f.write("%s\n" % translations[i])
-
-
-def translate_text(estimator, subtokenizer, txt):
-  """Translate a single string."""
-  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
-
-  def input_fn():
-    ds = tf.data.Dataset.from_tensors(encoded_txt)
-    ds = ds.batch(_DECODE_BATCH_SIZE)
-    return ds
-
-  predictions = estimator.predict(input_fn)
-  translation = next(predictions)["outputs"]
-  translation = _trim_and_decode(translation, subtokenizer)
-  tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, translation))
-
-
-def main(unused_argv):
-  from official.transformer import transformer_main
-
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  if FLAGS.text is None and FLAGS.file is None:
-    tf.logging.warn("Nothing to translate. Make sure to call this script using "
-                    "flags --text or --file.")
-    return
-
-  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
-
-  # Set up estimator and params
-  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
-  params["beam_size"] = _BEAM_SIZE
-  params["alpha"] = _ALPHA
-  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
-  params["batch_size"] = _DECODE_BATCH_SIZE
-  estimator = tf.estimator.Estimator(
-      model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
-      params=params)
-
-  if FLAGS.text is not None:
-    tf.logging.info("Translating text: %s" % FLAGS.text)
-    translate_text(estimator, subtokenizer, FLAGS.text)
-
-  if FLAGS.file is not None:
-    input_file = os.path.abspath(FLAGS.file)
-    tf.logging.info("Translating file: %s" % input_file)
-    if not tf.gfile.Exists(FLAGS.file):
-      raise ValueError("File does not exist: %s" % input_file)
-
-    output_file = None
-    if FLAGS.file_out is not None:
-      output_file = os.path.abspath(FLAGS.file_out)
-      tf.logging.info("File output specified: %s" % output_file)
-
-    translate_file(estimator, subtokenizer, input_file, output_file)
-
-
-def define_translate_flags():
-  """Define flags used for translation script."""
-  # Model flags
-  flags.DEFINE_string(
-      name="model_dir", short_name="md", default="/tmp/transformer_model",
-      help=flags_core.help_wrap(
-          "Directory containing Transformer model checkpoints."))
-  flags.DEFINE_enum(
-      name="param_set", short_name="mp", default="big",
-      enum_values=["base", "big"],
-      help=flags_core.help_wrap(
-          "Parameter set to use when creating and training the model. The "
-          "parameters define the input shape (batch size and max length), "
-          "model configuration (size of embedding, # of hidden layers, etc.), "
-          "and various other settings. The big parameter set increases the "
-          "default batch size, embedding/hidden size, and filter size. For a "
-          "complete list of parameters, please see model/model_params.py."))
-  flags.DEFINE_string(
-      name="vocab_file", short_name="vf", default=None,
-      help=flags_core.help_wrap(
-          "Path to subtoken vocabulary file. If data_download.py was used to "
-          "download and encode the training data, look in the data_dir to find "
-          "the vocab file."))
-  flags.mark_flag_as_required("vocab_file")
-
-  flags.DEFINE_string(
-      name="text", default=None,
-      help=flags_core.help_wrap(
-          "Text to translate. Output will be printed to console."))
-  flags.DEFINE_string(
-      name="file", default=None,
-      help=flags_core.help_wrap(
-          "File containing text to translate. Translation will be printed to "
-          "console and, if --file_out is provided, saved to an output file."))
-  flags.DEFINE_string(
-      name="file_out", default=None,
-      help=flags_core.help_wrap(
-          "If --file flag is specified, save translation to this file."))
-
-
-if __name__ == "__main__":
-  define_translate_flags()
-  FLAGS = flags.FLAGS
-  absl_app.run(main)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/__init__.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/__init__.py
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/dataset.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/dataset.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Input pipeline for the transformer model to read, filter, and batch examples.
-
-Two things to note in the pipeline:
-
-1. Batching scheme
-
-   The examples encoded in the TFRecord files contain data in the format:
-     {"inputs": [variable length array of integers],
-      "targets": [variable length array of integers]}
-   Where integers in the arrays refer to tokens in the English and German vocab
-   file (named `vocab.ende.32768`).
-
-   Prior to batching, elements in the dataset are grouped by length (max between
-   "inputs" and "targets" length). Each group is then batched such that:
-     group_batch_size * length <= batch_size.
-
-   Another way to view batch_size is the maximum number of tokens in each batch.
-
-   Once batched, each element in the dataset will have the shape:
-     {"inputs": [group_batch_size, padded_input_length],
-      "targets": [group_batch_size, padded_target_length]}
-   Lengths are padded to the longest "inputs" or "targets" sequence in the batch
-   (padded_input_length and padded_target_length can be different).
-
-   This batching scheme decreases the fraction of padding tokens per training
-   batch, thus improving the training speed significantly.
-
-2. Shuffling
-
-   While training, the dataset is shuffled in two places in the code. The first
-   is the list of training files. Second, while reading records using
-   `parallel_interleave`, the `sloppy` argument is used to generate randomness
-   in the order of the examples.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os
-
-import tensorflow as tf
-
-from official.utils.misc import model_helpers
-
-# Buffer size for reading records from a TFRecord file. Each training file is
-# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
-_READ_RECORD_BUFFER = 8 * 1000 * 1000
-
-# Example grouping constants. Defines length boundaries for each group.
-# These values are the defaults used in Tensor2Tensor.
-_MIN_BOUNDARY = 8
-_BOUNDARY_SCALE = 1.1
-
-
-def _load_records(filename):
-  """Read file and return a dataset of tf.Examples."""
-  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
-
-
-def _parse_example(serialized_example):
-  """Return inputs and targets Tensors from a serialized tf.Example."""
-  data_fields = {
-      "inputs": tf.VarLenFeature(tf.int64),
-      "targets": tf.VarLenFeature(tf.int64)
-  }
-  parsed = tf.parse_single_example(serialized_example, data_fields)
-  inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
-  targets = tf.sparse_tensor_to_dense(parsed["targets"])
-  return inputs, targets
-
-
-def _filter_max_length(example, max_length=256):
-  """Indicates whether the example's length is lower than the maximum length."""
-  return tf.logical_and(tf.size(example[0]) <= max_length,
-                        tf.size(example[1]) <= max_length)
-
-
-def _get_example_length(example):
-  """Returns the maximum length between the example inputs and targets."""
-  length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
-  return length
-
-
-def _create_min_max_boundaries(
-    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
-  """Create min and max boundary lists up to max_length.
-
-  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
-  returned values will be:
-    buckets_min = [0, 4, 8, 16, 24]
-    buckets_max = [4, 8, 16, 24, 25]
-
-  Args:
-    max_length: The maximum length of example in dataset.
-    min_boundary: Minimum length in boundary.
-    boundary_scale: Amount to scale consecutive boundaries in the list.
-
-  Returns:
-    min and max boundary lists
-
-  """
-  # Create bucket boundaries list by scaling the previous boundary or adding 1
-  # (to ensure increasing boundary sizes).
-  bucket_boundaries = []
-  x = min_boundary
-  while x < max_length:
-    bucket_boundaries.append(x)
-    x = max(x + 1, int(x * boundary_scale))
-
-  # Create min and max boundary lists from the initial list.
-  buckets_min = [0] + bucket_boundaries
-  buckets_max = bucket_boundaries + [max_length + 1]
-  return buckets_min, buckets_max
-
-
-def _batch_examples(dataset, batch_size, max_length):
-  """Group examples by similar lengths, and return batched dataset.
-
-  Each batch of similar-length examples are padded to the same length, and may
-  have different number of elements in each batch, such that:
-    group_batch_size * padded_length <= batch_size.
-
-  This decreases the number of padding tokens per batch, which improves the
-  training speed.
-
-  Args:
-    dataset: Dataset of unbatched examples.
-    batch_size: Max number of tokens per batch of examples.
-    max_length: Max number of tokens in an example input or target sequence.
-
-  Returns:
-    Dataset of batched examples with similar lengths.
-  """
-  # Get min and max boundary lists for each example. These are used to calculate
-  # the `bucket_id`, which is the index at which:
-  # buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
-  # Note that using both min and max lists improves the performance.
-  buckets_min, buckets_max = _create_min_max_boundaries(max_length)
-
-  # Create list of batch sizes for each bucket_id, so that
-  # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
-  bucket_batch_sizes = [batch_size // x for x in buckets_max]
-  # bucket_id will be a tensor, so convert this list to a tensor as well.
-  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
-
-  def example_to_bucket_id(example_input, example_target):
-    """Return int64 bucket id for this example, calculated based on length."""
-    seq_length = _get_example_length((example_input, example_target))
-
-    # TODO: investigate whether removing code branching improves performance.
-    conditions_c = tf.logical_and(
-        tf.less_equal(buckets_min, seq_length),
-        tf.less(seq_length, buckets_max))
-    bucket_id = tf.reduce_min(tf.where(conditions_c))
-    return bucket_id
-
-  def window_size_fn(bucket_id):
-    """Return number of examples to be grouped when given a bucket id."""
-    return bucket_batch_sizes[bucket_id]
-
-  def batching_fn(bucket_id, grouped_dataset):
-    """Batch and add padding to a dataset of elements with similar lengths."""
-    bucket_batch_size = window_size_fn(bucket_id)
-
-    # Batch the dataset and add padding so that all input sequences in the
-    # examples have the same length, and all target sequences have the same
-    # lengths as well. Resulting lengths of inputs and targets can differ.
-    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))
-
-  return dataset.apply(tf.contrib.data.group_by_window(
-      key_func=example_to_bucket_id,
-      reduce_func=batching_fn,
-      window_size=None,
-      window_size_func=window_size_fn))
-
-
-def _read_and_batch_from_files(
-    file_pattern, batch_size, max_length, num_parallel_calls, shuffle, repeat,
-    static_batch=False):
-  """Build the batched (inputs, targets) pipeline from TFRecord files.
-
-  Args:
-    file_pattern: String used to match the input TFRecord files.
-    batch_size: Maximum number of tokens per batch of examples.
-    max_length: Maximum number of tokens per example; examples rejected by
-      _filter_max_length are dropped.
-    num_parallel_calls: Parallelism used both as the interleave cycle length
-      and for parsing.
-    shuffle: If true, the file list is shuffled and files are interleaved
-      sloppily, so the example order is non-deterministic.
-    repeat: Number of times to repeat the dataset. If None, the dataset is
-      repeated forever.
-    static_batch: If True, every batch is padded to the static shape
-      [batch_size // max_length, max_length] and the final partial batch is
-      dropped (needed when shapes must be static, e.g. on TPU). If False,
-      examples are grouped by length by _batch_examples, so batches may have
-      different shapes [N, M] with N * M <= batch_size and M <= max_length,
-      which keeps the number of padding tokens small.
-
-  Returns:
-    tf.data.Dataset object containing examples loaded from the files.
-  """
-  filenames = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)
-
-  # Pull records from several files at once.
-  interleave_files = tf.contrib.data.parallel_interleave(
-      _load_records, sloppy=shuffle, cycle_length=num_parallel_calls)
-  records = filenames.apply(interleave_files)
-
-  # Decode each serialized tf.Example.
-  # TODO: Look into prefetch_input_elements for performance optimization.
-  examples = records.map(_parse_example, num_parallel_calls=num_parallel_calls)
-
-  def within_max_length(inputs, targets):
-    """Keep only the pairs accepted by _filter_max_length."""
-    return _filter_max_length((inputs, targets), max_length)
-
-  examples = examples.filter(within_max_length)
-
-  if not static_batch:
-    # Group and batch such that each batch has examples of similar length.
-    batches = _batch_examples(examples, batch_size, max_length)
-  else:
-    rows_per_batch = batch_size // max_length
-    batches = examples.apply(tf.contrib.data.padded_batch_and_drop_remainder(
-        rows_per_batch, ([max_length], [max_length])))
-
-  # Repeat, then prefetch the next element to keep the consumer busy.
-  repeated = batches.repeat(repeat)
-  return repeated.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
-
-
-def _generate_synthetic_data(params):
-  """Create an all-ones synthetic dataset sized from params["batch_size"].
-
-  Inputs and labels are both int32 with the square shape [side, side], where
-  side = int(sqrt(params["batch_size"])).
-
-  Args:
-    params: dict providing the "batch_size" entry.
-
-  Returns:
-    Whatever model_helpers.generate_synthetic_data builds for these shapes.
-  """
-  side = int(math.sqrt(params["batch_size"]))
-  input_shape = tf.TensorShape([side, side])
-  label_shape = tf.TensorShape([side, side])
-  return model_helpers.generate_synthetic_data(
-      input_shape=input_shape,
-      input_value=1,
-      input_dtype=tf.int32,
-      label_shape=label_shape,
-      label_value=1,
-      label_dtype=tf.int32,
-  )
-
-
-def train_input_fn(params):
-  """Return the training dataset: synthetic if requested, else from files.
-
-  Real data is read from files matching "*train*" under params["data_dir"],
-  with shuffling enabled and params["repeat_dataset"] repetitions.
-  """
-  data_dir = params["data_dir"] or ""
-  file_pattern = os.path.join(data_dir, "*train*")
-  if not params["use_synthetic_data"]:
-    return _read_and_batch_from_files(
-        file_pattern, params["batch_size"], params["max_length"],
-        params["num_parallel_calls"], shuffle=True,
-        repeat=params["repeat_dataset"], static_batch=params["static_batch"])
-  return _generate_synthetic_data(params)
-
-
-def eval_input_fn(params):
-  """Return the evaluation dataset: synthetic if requested, else from files.
-
-  Real data is read from files matching "*dev*" under params["data_dir"],
-  without shuffling and for exactly one pass.
-  """
-  data_dir = params["data_dir"] or ""
-  file_pattern = os.path.join(data_dir, "*dev*")
-  if not params["use_synthetic_data"]:
-    return _read_and_batch_from_files(
-        file_pattern, params["batch_size"], params["max_length"],
-        params["num_parallel_calls"], shuffle=False, repeat=1,
-        static_batch=params["static_batch"])
-  return _generate_synthetic_data(params)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/metrics.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/metrics.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions for calculating loss, accuracy, and other model metrics.
-
-Metrics:
- - Padded loss, accuracy, and negative log perplexity. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- - BLEU approximation. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- - ROUGE score. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import math
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-
-def _pad_tensors_to_same_length(x, y):
-  """Zero-pad x and y along dimension 1 up to the longer of the two.
-
-  x is padded as a rank-3 tensor and y as a rank-2 tensor; only the end of
-  the second dimension receives padding.
-  """
-  with tf.name_scope("pad_to_same_length"):
-    x_length, y_length = tf.shape(x)[1], tf.shape(y)[1]
-    target_length = tf.maximum(x_length, y_length)
-
-    x_paddings = [[0, 0], [0, target_length - x_length], [0, 0]]
-    y_paddings = [[0, 0], [0, target_length - y_length]]
-    return tf.pad(x, x_paddings), tf.pad(y, y_paddings)
-
-
-def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
-  """Compute label-smoothed cross entropy, masking out padding positions.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch_size, length_labels]
-    smoothing: Label smoothing constant, used to determine the on and off values
-    vocab_size: int size of the vocabulary
-  Returns:
-    Returns the cross entropy loss and weight tensors: float32 tensors with
-      shape [batch_size, max(length_logits, length_labels)]. The weight is 0
-      wherever the label id is 0 (padding) and 1 elsewhere.
-  """
-  with tf.name_scope("loss", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-
-    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
-      # Probability mass on the true class, and on each of the other classes.
-      on_value = 1.0 - smoothing
-      num_other_classes = tf.to_float(vocab_size - 1)
-      off_value = (1.0 - on_value) / num_other_classes
-
-      smoothed_targets = tf.one_hot(
-          tf.cast(labels, tf.int32),
-          depth=vocab_size,
-          on_value=on_value,
-          off_value=off_value)
-      raw_xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
-          logits=logits, labels=smoothed_targets)
-
-      # The lowest value the smoothed cross entropy can reach; subtracting it
-      # makes a perfect prediction score zero. The 1e-20 guards log(0) when
-      # there is no smoothing.
-      best_possible = -(
-          on_value * tf.log(on_value) +
-          num_other_classes * off_value * tf.log(off_value + 1e-20))
-      xentropy = raw_xentropy - best_possible
-
-    weights = tf.to_float(tf.not_equal(labels, 0))
-    return xentropy * weights, weights
-
-
-def _convert_to_eval_metric(metric_fn):
-  """Turn a per-batch (scores, weights) function into an eval metric fn.
-
-  Args:
-    metric_fn: function that returns scores and weights for the current batch's
-      logits and predicted labels.
-
-  Returns:
-    function taking the same arguments as metric_fn and returning the result
-    of tf.metrics.mean over its scores and weights, so values are aggregated
-    across all evaluated batches.
-  """
-  def aggregated_metric_fn(*args):
-    """Feed this batch's scores and weights into a weighted running mean."""
-    batch_scores, batch_weights = metric_fn(*args)
-    return tf.metrics.mean(batch_scores, batch_weights)
-
-  return aggregated_metric_fn
-
-
-def get_eval_metrics(logits, labels, params):
-  """Return a dict mapping "metrics/<name>" to aggregated evaluation metrics.
-
-  BLEU and ROUGE are only included when params["use_tpu"] is false, because
-  they rely on tf.py_func, which TPU does not support.
-  """
-  def evaluate(metric_fn, *extra_args):
-    """Apply the aggregating wrapper of metric_fn to logits and labels."""
-    return _convert_to_eval_metric(metric_fn)(logits, labels, *extra_args)
-
-  metrics = {
-      "accuracy": evaluate(padded_accuracy),
-      "accuracy_top5": evaluate(padded_accuracy_top5),
-      "accuracy_per_sequence": evaluate(padded_sequence_accuracy),
-      "neg_log_perplexity": evaluate(
-          padded_neg_log_perplexity, params["vocab_size"]),
-  }
-
-  if not params["use_tpu"]:
-    # TPU does not support tf.py_func
-    metrics["approx_bleu_score"] = evaluate(bleu_score)
-    metrics["rouge_2_fscore"] = evaluate(rouge_2_fscore)
-    metrics["rouge_L_fscore"] = evaluate(rouge_l_fscore)
-
-  # The "metrics/" prefix groups the graphs under one TensorBoard category.
-  return {"metrics/%s" % name: op for name, op in six.iteritems(metrics)}
-
-
-def padded_accuracy(logits, labels):
-  """Per-position correctness of the argmax prediction, weighted on non-0s.
-
-  Returns:
-    (correct, weights): float tensors; weights is 0 where the label id is 0.
-  """
-  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
-    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
-    is_correct = tf.equal(predicted_ids, tf.to_int32(labels))
-    return tf.to_float(is_correct), weights
-
-
-def padded_accuracy_topk(logits, labels, k):
-  """Per-position hit indicator for the top-k predictions, weighted on non-0s.
-
-  k is capped at the size of the last logits dimension.
-
-  Returns:
-    (hits, weights): float tensors; weights is 0 where the label id is 0.
-  """
-  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
-
-    capped_k = tf.minimum(k, tf.shape(logits)[-1])
-    _, topk_ids = tf.nn.top_k(logits, k=capped_k)
-    topk_ids = tf.to_int32(topk_ids)
-
-    # Give the labels a trailing axis and broadcast them against the k
-    # candidates by adding zeros of the candidates' shape.
-    tiled_labels = tf.expand_dims(tf.to_int32(labels), axis=-1)
-    tiled_labels = tiled_labels + tf.zeros_like(topk_ids)
-
-    hits = tf.to_float(tf.equal(topk_ids, tiled_labels))
-    return tf.reduce_sum(hits, axis=-1), weights
-
-
-def padded_accuracy_top5(logits, labels):
-  """Top-5 variant of padded_accuracy_topk."""
-  return padded_accuracy_topk(logits, labels, k=5)
-
-
-def padded_sequence_accuracy(logits, labels):
-  """Per-sequence indicator that every non-0 label position is predicted.
-
-  Returns:
-    (correct_seq, weight): correct_seq is 1.0 for sequences with no mismatch
-    on non-padding positions and 0.0 otherwise; weight is the constant 1.0.
-  """
-  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
-    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
-    label_ids = tf.to_int32(labels)
-
-    # Mismatches on padding positions are zeroed out by the weights.
-    mismatches = tf.to_float(tf.not_equal(predicted_ids, label_ids)) * weights
-    non_batch_axes = list(range(1, len(predicted_ids.get_shape())))
-    mismatch_count = tf.reduce_sum(mismatches, axis=non_batch_axes)
-    correct_seq = 1.0 - tf.minimum(1.0, mismatch_count)
-    return correct_seq, tf.constant(1.0)
-
-
-def padded_neg_log_perplexity(logits, labels, vocab_size):
-  """Negated unsmoothed cross entropy and its padding weights.
-
-  Calls padded_cross_entropy_loss with smoothing 0 and flips the sign of the
-  loss; the weights are returned unchanged.
-  """
-  xentropy, weights = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
-  return -xentropy, weights
-
-
-def bleu_score(logits, labels):
-  """Approximate BLEU score computation between labels and predictions.
-
-  An approximate BLEU scoring method since we do not glue word pieces or
-  decode the ids and tokenize the output. compute_bleu is called with its
-  defaults (ngram order of 4, brevity penalty on). Predictions are the
-  per-position argmax of the logits; there is no beam search.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch-size, length_labels]
-
-  Returns:
-    (bleu, weight): float32 approximate BLEU score for the batch and the
-    constant weight 1.0.
-  """
-  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
-  # TODO: Look into removing use of py_func
-  score = tf.py_func(compute_bleu, (labels, predicted_ids), tf.float32)
-  return score, tf.constant(1.0)
-
-
-def _get_ngrams_with_counter(segment, max_order):
-  """Count every n-gram of order 1..max_order occurring in a segment.
-
-  Args:
-    segment: sequence of tokens from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams that are counted.
-
-  Returns:
-    A collections.Counter mapping each n-gram (as a tuple of tokens) to the
-    number of times it occurs in segment.
-  """
-  counts = collections.Counter()
-  segment_length = len(segment)
-  for order in xrange(1, max_order + 1):
-    counts.update(
-        tuple(segment[start:start + order])
-        for start in xrange(0, segment_length - order + 1))
-  return counts
-
-
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 use_bp=True):
-  """Computes BLEU score of translated segments against their references.
-
-  Segments are paired positionally (zip), so surplus entries in the longer
-  corpus are ignored.
-
-  Args:
-    reference_corpus: list of references, one per translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    use_bp: boolean, whether to apply brevity penalty.
-
-  Returns:
-    BLEU score as a numpy float32. Empty references leave the brevity penalty
-    at 1.0 and empty translations give a penalty of 0.0, instead of raising
-    ZeroDivisionError.
-  """
-  reference_length = 0
-  translation_length = 0
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-
-  for (references, translations) in zip(reference_corpus, translation_corpus):
-    reference_length += len(references)
-    translation_length += len(translations)
-    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
-    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
-
-    # Clipped matches: Counter intersection keeps the minimum count of every
-    # n-gram present on both sides.
-    overlap = ref_ngram_counts & translation_ngram_counts
-
-    for ngram, count in overlap.items():
-      matches_by_order[len(ngram) - 1] += count
-    for ngram, count in translation_ngram_counts.items():
-      possible_matches_by_order[len(ngram) - 1] += count
-
-  precisions = [0.0] * max_order
-  smooth = 1.0
-
-  for i in xrange(0, max_order):
-    if possible_matches_by_order[i] > 0:
-      if matches_by_order[i] > 0:
-        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
-            i]
-      else:
-        # No match at this order: use a smoothed precision that halves with
-        # every further order that has no match.
-        smooth *= 2
-        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
-
-  geo_mean = 0.0
-  if max(precisions) > 0:
-    p_log_sum = sum(math.log(p) for p in precisions if p)
-    geo_mean = math.exp(p_log_sum / max_order)
-
-  bp = 1.0
-  # With no reference tokens the length ratio is undefined; keep bp at 1.0.
-  if use_bp and reference_length:
-    ratio = float(translation_length) / reference_length
-    if ratio <= 0.0:
-      # Empty translation: maximal penalty rather than dividing by zero.
-      bp = 0.0
-    elif ratio < 1.0:
-      bp = math.exp(1 - 1. / ratio)
-
-  return np.float32(geo_mean * bp)
-
-
-def rouge_2_fscore(logits, labels):
-  """ROUGE-2 F1 score computation between labels and predictions.
-
-  This is an approximate ROUGE scoring method since we do not glue word pieces
-  or decode the ids and tokenize the output. Predictions are the argmax of
-  the logits over the last axis.
-
-  Args:
-    logits: tensor, model predictions
-    labels: tensor, gold output.
-
-  Returns:
-    (rouge2_fscore, weight): float32 approx rouge-2 f1 score and the constant
-    weight 1.0.
-  """
-  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
-  # TODO: Look into removing use of py_func
-  score = tf.py_func(rouge_n, (predicted_ids, labels), tf.float32)
-  return score, tf.constant(1.0)
-
-
-def _get_ngrams(n, text):
-  """Collects the distinct n-grams of a token sequence.
-
-  Args:
-    n: which n-grams to calculate
-    text: An array of tokens
-
-  Returns:
-    A set of n-grams, each a tuple of n consecutive tokens; empty when text
-    holds fewer than n tokens.
-  """
-  last_start = len(text) - n
-  return {tuple(text[start:start + n]) for start in range(last_start + 1)}
-
-
-def rouge_n(eval_sentences, ref_sentences, n=2):
-  """Computes ROUGE-N f1 score of two text collections of sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Sentences are paired positionally and scored on their sets of distinct
-  n-grams; the per-pair f1 scores are then averaged.
-
-  Args:
-    eval_sentences: Predicted sentences.
-    ref_sentences: Sentences from the reference set
-    n: Size of ngram.  Defaults to 2.
-
-  Returns:
-    f1 score for ROUGE-N, as a numpy float32 mean over all sentence pairs.
-  """
-  def safe_ratio(overlap, total):
-    """Return overlap / total, or 0.0 when there is nothing to divide by."""
-    # Not mathematically exact for empty inputs, but good enough.
-    return float(overlap) / total if total != 0 else 0.0
-
-  f1_scores = []
-  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
-    eval_ngrams = _get_ngrams(n, eval_sentence)
-    ref_ngrams = _get_ngrams(n, ref_sentence)
-
-    # Number of distinct n-grams shared by prediction and reference.
-    shared = len(eval_ngrams.intersection(ref_ngrams))
-
-    precision = safe_ratio(shared, len(eval_ngrams))
-    recall = safe_ratio(shared, len(ref_ngrams))
-    # The 1e-8 keeps the division defined when precision + recall is zero.
-    f1 = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
-    f1_scores.append(f1)
-
-  return np.mean(f1_scores, dtype=np.float32)
-
-
-def rouge_l_fscore(predictions, labels):
-  """ROUGE scores computation between labels and predictions.
-
-  This is an approximate ROUGE scoring method since we do not glue word pieces
-  or decode the ids and tokenize the output.
-
-  Args:
-    predictions: tensor, model predictions; the argmax over its last axis is
-      taken before scoring.
-    labels: tensor, gold output.
-
-  Returns:
-    (rouge_l_fscore, weight): float32 approx rouge-l f1 score and the constant
-    weight 1.0.
-  """
-  predicted_ids = tf.to_int32(tf.argmax(predictions, axis=-1))
-  score = tf.py_func(
-      rouge_l_sentence_level, (predicted_ids, labels), tf.float32)
-  return score, tf.constant(1.0)
-
-
-def rouge_l_sentence_level(eval_sentences, ref_sentences):
-  """Computes ROUGE-L (sentence level) of two collections of sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Calculated according to:
-  R_lcs = LCS(X,Y)/m
-  P_lcs = LCS(X,Y)/n
-  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
-
-  where:
-  X = reference summary
-  Y = Candidate summary
-  m = length of reference summary
-  n = length of candidate summary
-
-  Args:
-    eval_sentences: The sentences that have been picked by the summarizer
-    ref_sentences: The sentences from the reference set
-
-  Returns:
-    A numpy float32: the mean F_lcs over all positionally paired sentences.
-  """
-  f1_scores = [
-      _f_lcs(_len_lcs(eval_sentence, ref_sentence),
-             float(len(ref_sentence)),
-             float(len(eval_sentence)))
-      for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences)
-  ]
-  return np.mean(f1_scores, dtype=np.float32)
-
-
-def _len_lcs(x, y):
-  """Returns the length of the Longest Common Subsequence between two seqs.
-
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: sequence of words
-    y: sequence of words
-
-  Returns
-    integer: Length of LCS between x and y, read from the last cell of the
-    table built by _lcs.
-  """
-  return _lcs(x, y)[len(x), len(y)]
-
-
-def _lcs(x, y):
-  """Builds the dynamic-programming table of LCS lengths for two seqs.
-
-  The table has (len(x) + 1) * (len(y) + 1) entries, so this runs in O(nm)
-  time where n = len(x) and m = len(y).
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: collection of words
-    y: collection of words
-
-  Returns:
-    Dict mapping (i, j) to the LCS length of x[:i] and y[:j].
-  """
-  rows, cols = len(x), len(y)
-  table = {}
-  for i in range(rows + 1):
-    for j in range(cols + 1):
-      if i and j:
-        if x[i - 1] == y[j - 1]:
-          # Matching tokens extend the LCS of the two shorter prefixes.
-          table[i, j] = table[i - 1, j - 1] + 1
-        else:
-          table[i, j] = max(table[i - 1, j], table[i, j - 1])
-      else:
-        # An empty prefix has nothing in common with anything.
-        table[i, j] = 0
-  return table
-
-
-def _f_lcs(llcs, m, n):
-  """Computes the LCS-based F-measure score.
-
-  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
-  rouge-working-note-v1.3.1.pdf
-
-  Args:
-    llcs: Length of LCS
-    m: number of words in reference summary
-    n: number of words in candidate summary
-
-  Returns:
-    Float. LCS-based F-measure score; 0.0 when either summary is empty.
-  """
-  # Recall and precision are undefined for an empty summary. Score such pairs
-  # as 0.0 (the same convention rouge_n uses) instead of dividing by zero.
-  if not m or not n:
-    return 0.0
-
-  r_lcs = llcs / m
-  p_lcs = llcs / n
-  # The 1e-12 terms keep both divisions defined when the LCS is empty.
-  beta = p_lcs / (r_lcs + 1e-12)
-  num = (1 + (beta ** 2)) * r_lcs * p_lcs
-  denom = r_lcs + ((beta ** 2) * p_lcs)
-  f_lcs = num / (denom + 1e-12)
-  return f_lcs
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/schedule.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/schedule.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Abstract training on a step or epoch basis."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-import tensorflow as tf
-
-
-# Short aliases for the estimator mode keys used throughout this module.
-_TRAIN, _EVAL = tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
-
-
-# Number of examples per mode, used to convert epochs into steps.
-NUM_EXAMPLES = {
-    tf.estimator.ModeKeys.TRAIN: 4572160,
-    # Examples that are too long are filtered out, thus the total is less
-    # than the total number of lines in the source corpora:
-    # 2399123 +  # news-commentary-v12.de-en
-    # 1920209 +  # commoncrawl.de-en
-    # 270769,    # europarl-v7.de-en
-    tf.estimator.ModeKeys.EVAL: 3000,  # newstest2013
-}
-
-
-class Manager(object):
-  """Container for convenience functions to abstract step or epoch basis.
-
-  Transformer allows users to specify an epoch basis (generally recommended for
-  full training) or a number of steps basis (convenient since epochs are rather
-  large). TPUs furthermore require a step basis; however epochs are the norm in
-  the machine learning community and it is desirable to allow users to specify
-  epochs even when running with TPUs, which requires behind the scenes
-  conversions.
-
-  This container simply groups what are largely mundane checks and conversions
-  rather than interspersing them throughout the run loop code.
-  """
-
-  def __init__(self, train_steps, steps_between_evals, train_epochs,
-               epochs_between_evals, default_train_epochs, batch_size,
-               max_length, use_tpu=False, num_tpu_shards=8):
-    """Derive the train/eval schedule from either steps or epochs.
-
-    Args:
-      train_steps: Total number of training steps, or a falsy value to train
-        on an epoch basis.
-      steps_between_evals: Training steps per train/eval iteration (step
-        basis only).
-      train_epochs: Total number of training epochs, or a falsy value.
-      epochs_between_evals: Training epochs per train/eval iteration (epoch
-        basis only).
-      default_train_epochs: Used when neither train_steps nor train_epochs is
-        given.
-      batch_size: Batch size, measured in tokens.
-      max_length: Maximum number of tokens per example.
-      use_tpu: Whether the run targets TPU.
-      num_tpu_shards: Number of TPU shards the per-batch example count must be
-        divisible by.
-
-    Raises:
-      ValueError: if both train_steps and train_epochs are given.
-    """
-    if train_steps and train_epochs:
-      raise ValueError("Both train_steps and train_epochs were defined.")
-
-    # Determine training schedule based on flags.
-    if train_steps:
-      self.train_eval_iterations = train_steps // steps_between_evals
-      self._single_iteration_train_steps = steps_between_evals
-      self._single_iteration_train_epochs = None
-    else:
-      train_epochs = train_epochs or default_train_epochs
-      self.train_eval_iterations = train_epochs // epochs_between_evals
-      self._single_iteration_train_steps = None
-      self._single_iteration_train_epochs = epochs_between_evals
-
-    self.max_length = max_length
-    self.batch_size = batch_size
-    self.use_tpu = use_tpu
-    self.num_tpu_shards = num_tpu_shards
-
-    if self.use_tpu:
-      # The number of examples per batch must split evenly across the shards.
-      assert (self.batch_size // self.max_length) % self.num_tpu_shards == 0
-
-  @property
-  def single_iteration_train_steps(self):
-    """Training steps per iteration; None on a non-TPU epoch basis."""
-    if self._single_iteration_train_steps or not self.use_tpu:
-      return self._single_iteration_train_steps
-
-    # TPU on an epoch basis: convert the epochs into an explicit step count.
-    return self.epochs_to_steps(
-        num_epochs=self._single_iteration_train_epochs, mode=_TRAIN)
-
-  @property
-  def single_iteration_eval_steps(self):
-    """Steps for one evaluation pass on TPU; None otherwise."""
-    if not self.use_tpu:
-      return None
-
-    return self.epochs_to_steps(num_epochs=1, mode=_EVAL)
-
-  @property
-  def train_increment_str(self):
-    """Human-readable description of the training done per iteration."""
-    if self._single_iteration_train_steps:
-      return "{} steps.".format(self._single_iteration_train_steps)
-
-    if not self.use_tpu:
-      return "{} epochs.".format(self._single_iteration_train_epochs)
-
-    return "~{} epochs. ({} steps)".format(
-        self._single_iteration_train_epochs,
-        self.single_iteration_train_steps)
-
-  @property
-  def repeat_dataset(self):
-    """Number of dataset repetitions needed for one training iteration.
-
-    On an epoch basis this is the number of epochs per iteration. On a step
-    basis it is None unless the step count exceeds NUM_EXAMPLES[_TRAIN], in
-    which case the ratio is rounded up to an integer.
-    """
-    if (self._single_iteration_train_epochs is None and
-        self._single_iteration_train_steps > NUM_EXAMPLES[_TRAIN]):
-      # math.ceil returns a float on Python 2; always hand back an int.
-      return int(math.ceil(self._single_iteration_train_steps /
-                           NUM_EXAMPLES[_TRAIN]))
-    return self._single_iteration_train_epochs
-
-  def epochs_to_steps(self, num_epochs, mode):
-    """Converts a number of epochs to a number of training steps.
-
-    TPU only: This function assumes that static_batch is True.
-
-    TPU can not tolerate an OutOfRange error from a dataset. As a result the
-    number of examples to be processed must be known ahead of time. TPUs also
-    do not allow partial batches, so this function rounds down.
-
-    Args:
-      num_epochs: An integer of the number of epochs to convert to steps.
-      mode: The estimator ModeKey of the computation
-
-    Returns:
-      An integer of the number of equivalent steps rounded down.
-    """
-    assert self.use_tpu, "epochs_to_steps should only be reached when using TPU"
-    total_num_tokens = NUM_EXAMPLES[mode] * self.max_length * num_epochs
-    return total_num_tokens // self.batch_size
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/schedule_test.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/transformer/utils/schedule_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test Transformer's schedule manager."""
-
-import tensorflow as tf
-
-from official.transformer.utils import schedule
-
-
-class ScheduleBaseTester(tf.test.TestCase):
-  """Checks schedule.Manager on step and epoch bases, with and without TPU."""
-
-  def test_mutual_exclusivity(self):
-    """Giving both train_steps and train_epochs raises ValueError."""
-    conflicting_kwargs = dict(
-        train_steps=100, steps_between_evals=100, train_epochs=2,
-        epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
-        max_length=256)
-    with self.assertRaises(ValueError):
-      schedule.Manager(**conflicting_kwargs)
-
-  def test_step_basis(self):
-    """Step basis without TPU: fixed train steps, no eval steps, no repeat."""
-    step_kwargs = dict(
-        train_steps=1000, steps_between_evals=100, train_epochs=None,
-        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
-        max_length=256)
-    manager = schedule.Manager(**step_kwargs)
-
-    self.assertEqual(manager.single_iteration_train_steps, 100)
-
-    # Evaluation uses the full set
-    self.assertIsNone(manager.single_iteration_eval_steps)
-
-    self.assertIsNone(manager.repeat_dataset)
-
-  def test_epoch_basis(self):
-    """Epoch basis without TPU: no explicit step counts, repeat per epochs."""
-    epoch_kwargs = dict(
-        train_steps=None, steps_between_evals=None, train_epochs=10,
-        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
-        max_length=256)
-    manager = schedule.Manager(**epoch_kwargs)
-
-    # For non-TPU, estimator relies on dataset exhaustion
-    self.assertIsNone(manager.single_iteration_train_steps)
-    self.assertIsNone(manager.single_iteration_eval_steps)
-
-    self.assertEqual(manager.repeat_dataset, 2)
-
-  def test_step_basis_tpu(self):
-    """Step basis on TPU: eval steps are derived from the eval set size."""
-    step_kwargs = dict(
-        train_steps=1000, steps_between_evals=100, train_epochs=None,
-        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
-        max_length=256, use_tpu=True)
-    manager = schedule.Manager(**step_kwargs)
-
-    self.assertEqual(manager.single_iteration_train_steps, 100)
-    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
-    self.assertEqual(manager.single_iteration_eval_steps, 375)
-    self.assertIsNone(manager.repeat_dataset)
-
-  def test_epoch_basis_tpu(self):
-    """Epoch basis on TPU: epochs are converted to explicit step counts."""
-    epoch_kwargs = dict(
-        train_steps=None, steps_between_evals=None, train_epochs=10,
-        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
-        max_length=256, use_tpu=True)
-    manager = schedule.Manager(**epoch_kwargs)
-
-    num_train_examples = schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN]
-    self.assertEqual(
-        manager.single_iteration_train_steps,
-        num_train_examples * 2 // (2048 / 256)
-    )
-
-    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
-    self.assertEqual(manager.single_iteration_eval_steps, 375)
-
-    self.assertEqual(manager.repeat_dataset, 2)
-
-# Run the test cases above when this file is executed as a script.
-if __name__ == "__main__":
-  tf.test.main()