"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "6246c70d2150dcc58415facfe5c199f49b4d2af1"
Unverified commit f2ea2f53, authored by Toby Boyd, committed by GitHub

Transformer v2 benchmark (#6860)

* Moved common Keras code to utils.

* Initial 1 GPU benchmark

- Aligned flags with the ResNet example
- Removed code/features that are not especially useful
- Run eval as part of train if a BLEU source/ref is provided
- Added an exp_per_second hook

* Rename benchmark classes, pass batch-size and log_steps.

* Fix docstring.

* Predict done with checkpoints inline.

- PerfZero base class.

* Use steps, not epochs, with a smoother training loop.

* Do not initialize history outside the loop.

* 5000 steps between evals, not 500.

* Convert Estimator to Keras.

* Remove the epochs var.

* Use range, not xrange.

* 200K steps for 1 GPU.

* Fix global step.
parent 49eaaaf2
@@ -18,7 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# pylint: disable=g-bad-import-order
from absl import flags
import tensorflow as tf

# TODO(tianlin) Import internal library. Remove this when some functions for
# different TF versions are fixed.

@@ -26,11 +28,14 @@ from tensorflow.python import tf2 as tf2_internal
from official.transformer.model import model_params
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils

FLAGS = flags.FLAGS

PARAMS_MAP = {
    'tiny': model_params.TINY_PARAMS,
    'base': model_params.BASE_PARAMS,
    'big': model_params.BIG_PARAMS,
}

@@ -42,12 +47,12 @@ def is_v2():
def get_model_params(param_set, num_gpus):
  """Gets predefined model params."""
  if num_gpus > 1:
    if param_set == 'big':
      return model_params.BIG_MULTI_GPU_PARAMS.copy()
    elif param_set == 'base':
      return model_params.BASE_MULTI_GPU_PARAMS.copy()
    else:
      raise ValueError('Not valid params: param_set={} num_gpus={}'.format(
          param_set, num_gpus))

  return PARAMS_MAP[param_set].copy()
@@ -69,113 +74,175 @@ def define_transformer_flags():
  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  flags.DEFINE_integer(
      name='train_steps', short_name='ts', default=None,
      help=flags_core.help_wrap('The number of steps used to train.'))
  flags.DEFINE_integer(
      name='steps_between_evals', short_name='sbe', default=1000,
      help=flags_core.help_wrap(
          'The number of training steps to run between evaluations. This is '
          'used if --train_steps is defined.'))
  flags.DEFINE_boolean(
      name='enable_tensorboard', default=False,
      help='Whether to enable the TensorBoard callback.')
  flags.DEFINE_string(
      name='profile_steps', default=None,
      help='Save profiling data to model dir at given range of steps. The '
      'value must be a comma separated pair of positive integers, specifying '
      'the first and last step to profile. For example, "--profile_steps=2,4" '
      'triggers the profiler to process 3 steps, starting from the 2nd step. '
      'Note that profiler has a non-trivial performance overhead, and the '
      'output file can be gigantic if profiling many steps.')

  # Set flags from the flags_core module as 'key flags' so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name='param_set', short_name='mp', default='big',
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          'Parameter set to use when creating and training the model. The '
          'parameters define the input shape (batch size and max length), '
          'model configuration (size of embedding, # of hidden layers, etc.), '
          'and various other settings. The big parameter set increases the '
          'default batch size, embedding/hidden size, and filter size. For a '
          'complete list of parameters, please see model/model_params.py.'))

  flags.DEFINE_bool(
      name='static_batch', default=False,
      help=flags_core.help_wrap(
          'Whether the batches in the dataset should have static shapes. In '
          'general, this setting should be False. Dynamic shapes allow the '
          'inputs to be grouped so that the number of padding tokens is '
          'minimized, and helps model training. In cases where the input '
          'shape must be static (e.g. running on TPU), this setting will be '
          'ignored and static batching will always be used.'))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name='validation_steps', short_name='vs', default=64,
      help=flags_core.help_wrap('The number of steps used in validation.'))

  # BLEU score computation
  flags.DEFINE_string(
      name='bleu_source', short_name='bls', default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text to translate when calculating '
          'the official BLEU score. Both --bleu_source and --bleu_ref must be '
          'set. Use the flag --stop_threshold to stop the script based on the '
          'uncased BLEU score.'))
  flags.DEFINE_string(
      name='bleu_ref', short_name='blr', default=None,
      help=flags_core.help_wrap(
          'Path to file containing the reference translations used when '
          'calculating the official BLEU score. Both --bleu_source and '
          '--bleu_ref must be set. Use the flag --stop_threshold to stop the '
          'script based on the uncased BLEU score.'))
  flags.DEFINE_string(
      name='vocab_file', short_name='vf', default=None,
      help=flags_core.help_wrap(
          'Path to subtoken vocabulary file. If data_download.py was used to '
          'download and encode the training data, look in the data_dir to '
          'find the vocab file.'))
  flags.DEFINE_string(
      name='mode', default='train',
      help=flags_core.help_wrap('mode: train, eval, or predict'))

  flags_core.set_defaults(data_dir='/tmp/translate_ende',
                          model_dir='/tmp/transformer_model',
                          batch_size=None,
                          train_epochs=10)

  # pylint: disable=unused-variable
  @flags.multi_flags_validator(
      ['mode', 'train_epochs'],
      message='--train_epochs must be defined in train mode')
  def _check_train_limits(flag_dict):
    if flag_dict['mode'] == 'train':
      return flag_dict['train_epochs'] is not None
    return True

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref'],
      message='Both or neither --bleu_source and --bleu_ref must be defined.')
  def _check_bleu_files(flags_dict):
    return (flags_dict['bleu_source'] is None) == (
        flags_dict['bleu_ref'] is None)

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref', 'vocab_file'],
      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
      'are defined.')
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
      return flags_dict['vocab_file'] is not None
    return True

  @flags.multi_flags_validator(
      ['export_dir', 'vocab_file'],
      message='--vocab_file must be defined if --export_dir is set.')
  def _check_export_vocab_file(flags_dict):
    if flags_dict['export_dir']:
      return flags_dict['vocab_file'] is not None
    return True
  # pylint: enable=unused-variable

  flags_core.require_cloud_storage(['data_dir', 'model_dir', 'export_dir'])
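The flags above are meant to be used together; as a rough illustration of how they and their validators fit (the entry-point name and data paths here are hypothetical, and the benchmarks below set these flags programmatically instead):

argv = [
    'transformer_main.py',                   # hypothetical entry point
    '--param_set=base',
    '--train_steps=200000',
    '--steps_between_evals=5000',
    '--bleu_source=/data/newstest2014.en',   # must be set together with --bleu_ref
    '--bleu_ref=/data/newstest2014.de',
    '--vocab_file=/data/vocab.ende.32768',   # required once bleu_source/bleu_ref are set
]
flags.FLAGS(argv)  # parsing runs the multi_flags_validators defined above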
def get_callbacks():
"""Returns common callbacks."""
callbacks = []
time_callback = keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)
callbacks.append(time_callback)
if FLAGS.enable_tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=FLAGS.model_dir)
callbacks.append(tensorboard_callback)
if FLAGS.profile_steps:
profiler_callback = keras_utils.get_profiler_callback(
FLAGS.model_dir,
FLAGS.profile_steps,
FLAGS.enable_tensorboard)
callbacks.append(profiler_callback)
return callbacks
def build_stats(history, callbacks):
"""Normalizes and returns dictionary of stats.
Args:
history: Results of the training step.
callbacks: a list of callbacks which might include a time history callback
used during keras.fit.
Returns:
Dictionary of normalized results.
"""
stats = {}
if history and history.history:
train_hist = history.history
# Gets final loss from training.
stats['loss'] = train_hist['loss'][-1].item()
if not callbacks:
return stats
# Look for the time history callback which was used during keras.fit
for callback in callbacks:
if isinstance(callback, keras_utils.TimeHistory):
timestamp_log = callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
if len(timestamp_log) > 1:
stats['avg_exp_per_second'] = (
callback.batch_size * callback.log_steps *
(len(callback.timestamp_log)-1) /
(timestamp_log[-1].timestamp - timestamp_log[0].timestamp))
return stats
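As a quick check of the throughput arithmetic above (the numbers are made up): with batch_size=4096, log_steps=100, and 21 entries in timestamp_log spanning 100 seconds, avg_exp_per_second = 4096 * 100 * 20 / 100 = 81,920 examples per second.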
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Keras benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
from official.transformer.v2 import misc
from official.transformer.v2 import transformer_main as transformer_main
from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
FLAGS = flags.FLAGS
class TransformerBenchmark(PerfZeroBenchmark):
"""Methods common to executing transformer w/keras tests.
The code under test for the Transformer Keras models reports the same data
and requires the same FLAG setup.
"""
def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
flag_methods=None):
self.train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
self.vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
self.bleu_source = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.en')
self.bleu_ref = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.de')
super(TransformerBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods)
def _run_and_report_benchmark(self,
bleu_max=None,
bleu_min=None,
log_steps=None,
total_batch_size=None,
warmup=1):
"""Report benchmark results by writing to local protobuf file.
Args:
bleu_max: highest passing level for bleu score.
bleu_min: lowest passing level for bleu score.
log_steps: How often the log was created for stats['step_timestamp_log'].
total_batch_size: Global batch-size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
"""
start_time_sec = time.time()
task = transformer_main.TransformerTask(FLAGS)
stats = task.train()
wall_time_sec = time.time() - start_time_sec
metrics = []
if 'bleu_uncased' in stats:
metrics.append({'name': 'bleu_uncased',
'value': stats['bleu_uncased'],
'min_value': bleu_min,
'max_value': bleu_max})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup):
# The first entry in the time_log is the start of step 1. The remaining
# entries mark the end of each recorded step.
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second',
'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']})
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
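To make the warmup handling concrete (hypothetical numbers): with total_batch_size=4096, log_steps=10, warmup=1, and 12 entries in stats['step_timestamp_log'], the elapsed time is measured from entry 1 (the first post-warmup entry) to the last entry, and num_examples = 4096 * 10 * (12 - 1 - 1) = 409,600; exp_per_second is that count divided by the elapsed seconds.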
class TransformerBaseKerasAccuracy(TransformerBenchmark):
"""Benchmark accuracy tests for Transformer Base model w/ Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for Transformer Base model w/ Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [misc.define_transformer_flags]
super(TransformerBaseKerasAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu.
The paper uses 8 GPUs and a much larger effective batch size; this run will
not converge to the 27.3 BLEU (uncased) SOTA.
"""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
# Sets values directly to avoid validation check.
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096
FLAGS.train_steps = 200000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
# These BLEU scores are based on test runs at this limited number of steps
# and batch size, after verifying SOTA at 8x V100s.
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=25.3,
bleu_max=26)
class TransformerKerasBenchmark(TransformerBenchmark):
"""Benchmarks for Transformer (Base and Big) using Keras."""
def __init__(self, output_dir=None, default_flags=None,
root_data_dir=None, batch_per_gpu=4096):
"""Initialize.
Args:
output_dir: Base directory for saving artifacts, e.g. checkpoints.
default_flags: default flags to use for all tests.
root_data_dir: root directory for data, e.g. training.
batch_per_gpu: batch size to use per gpu.
"""
flag_methods = [misc.define_transformer_flags]
self.batch_per_gpu = batch_per_gpu
super(TransformerKerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
def_flags = {}
def_flags['param_set'] = 'base'
def_flags['vocab_file'] = vocab_file
def_flags['data_dir'] = train_data_dir
def_flags['train_steps'] = 200
def_flags['log_steps'] = 10
super(TransformerBaseKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=4096)
class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
def_flags = {}
def_flags['param_set'] = 'big'
def_flags['vocab_file'] = vocab_file
def_flags['data_dir'] = train_data_dir
def_flags['train_steps'] = 200
def_flags['log_steps'] = 10
super(TransformerBigKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=3072)
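For orientation, this is roughly how a PerfZero-style harness would drive one of these classes; the paths below are hypothetical and are normally supplied by the harness:

benchmark = TransformerBaseKerasBenchmarkReal(
    output_dir='/tmp/perfzero',
    root_data_dir='/data/transformer')
benchmark.benchmark_1_gpu()  # resets flags via _setup(), then trains 200 steps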
@@ -22,15 +22,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tempfile

from absl import app as absl_app
from absl import flags
import tensorflow as tf

# pylint: disable=g-bad-import-order
from official.transformer import compute_bleu
from official.transformer.utils import tokenizer
from official.transformer.v2 import data_pipeline

@@ -89,6 +88,7 @@ class TransformerTask(object):
      flags_obj: Object containing parsed flag values, i.e., FLAGS.
    """
    self.flags_obj = flags_obj
    self.predict_model = None

    # Add flag-defined parameters to params object
    num_gpus = flags_core.get_num_gpus(flags_obj)
@@ -106,52 +106,62 @@ class TransformerTask(object):
  def train(self):
    """Trains the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, True
    _ensure_dir(flags_obj.model_dir)
    model = transformer.create_model(params, is_train)
    opt = self._create_optimizer()
    model.compile(opt, target_tensors=[])
    model.summary()

    map_data_fn = data_pipeline.map_data_for_transformer_fn
    train_ds = data_pipeline.train_input_fn(params)
    train_ds = train_ds.map(
        map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
      flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    for i in range(1, iterations + 1):
      print("Start train iteration:{}/{}".format(i, iterations))
      history = model.fit(
          train_ds,
          initial_epoch=i-1,
          epochs=i,
          steps_per_epoch=flags_obj.steps_between_evals,
          callbacks=callbacks,
          verbose=2)
      print("End train iteration:{}/{} global step:{}".format(
          i,
          iterations,
          i*flags_obj.steps_between_evals))
      tf.compat.v1.logging.info("Train history: {}".format(history.history))
      stats = misc.build_stats(history, callbacks)

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()

    stats = misc.build_stats(history, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
    return stats

  def eval(self):
    """Evaluates the model."""
    if not self.predict_model:
      self.predict_model = transformer.create_model(self.params, False)
    self._load_weights_if_possible(
        self.predict_model,
        tf.train.latest_checkpoint(self.flags_obj.model_dir))
    self.predict_model.summary()
    return evaluate_and_log_bleu(self.predict_model,
                                 self.flags_obj.bleu_source,
                                 self.flags_obj.bleu_ref,
                                 self.flags_obj.vocab_file)

  def predict(self):
    """Predicts result from the model."""
@@ -177,23 +187,20 @@
        params["hidden_size"],
        params["learning_rate_warmup_steps"])
    scheduler_callback = optimizer.LearningRateScheduler(sfunc, init_steps)
    callbacks = misc.get_callbacks()
    callbacks.append(scheduler_callback)
    ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                                        save_weights_only=True))
    return callbacks

  def _load_weights_if_possible(self, model, init_weight_path=None):
    """Loads model weights when it is provided."""
    if init_weight_path:
      tf.compat.v1.logging.info("Load weights: {}".format(init_weight_path))
      model.load_weights(init_weight_path)
    else:
      print("Weights not loaded from path:{}".format(init_weight_path))

  def _create_optimizer(self):
    """Creates optimizer."""

@@ -206,15 +213,6 @@
    return opt

def _ensure_dir(log_dir):
  """Makes the log dir if it does not exist."""
  if not os.path.exists(log_dir):

@@ -238,4 +236,4 @@ def main(_):
if __name__ == "__main__":
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  misc.define_transformer_flags()
  absl_app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for creating PerfZero benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf # pylint: disable=g-bad-import-order
FLAGS = flags.FLAGS
class PerfZeroBenchmark(tf.test.Benchmark):
"""Common methods used in PerfZero Benchmarks.
Handles resetting flags between tests, loading default_flags, and
overriding defaults. PerfZero (OSS) runs each test in a separate process,
which reduces some of the need to reset the flags.
"""
local_flags = None
def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
default_flags: Mapping of flag names to default values applied before each test.
flag_methods: Functions that define the flags required by the tests.
"""
if not output_dir:
output_dir = '/tmp'
self.output_dir = output_dir
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or {}
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def _setup(self):
"""Sets up and resets flags before each test."""
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
if PerfZeroBenchmark.local_flags is None:
for flag_method in self.flag_methods:
flag_method()
# Loads flags to get defaults to then override. List cannot be empty.
flags.FLAGS(['foo'])
# Overrides flag values with defaults for the class of tests.
for k, v in self.default_flags.items():
setattr(FLAGS, k, v)
saved_flag_values = flagsaver.save_flag_values()
PerfZeroBenchmark.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(PerfZeroBenchmark.local_flags)
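A minimal subclass sketch to show how flag_methods and default_flags are intended to be used; my_define_flags and the train_steps flag are placeholders, not part of this change:

class MyBenchmark(PerfZeroBenchmark):
  """Hypothetical benchmark illustrating the PerfZeroBenchmark contract."""

  def __init__(self, output_dir=None, **kwargs):
    super(MyBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags={'train_steps': 10},  # re-applied by _setup()
        flag_methods=[my_define_flags])     # placeholder flag-defining function

  def benchmark_cpu(self):
    self._setup()  # resets flags and applies default_flags
    FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
    # ...run the code under test, then call self.report_benchmark(...)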