"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "6246c70d2150dcc58415facfe5c199f49b4d2af1"
Unverified commit f2ea2f53, authored by Toby Boyd, committed by GitHub

Transformer v2 benchmark (#6860)

* Moved common Keras code to utils.

* Initial 1 GPU benchmark

- Aligned flags with the ResNet example
- Removed code/features that are not especially useful
- Run eval as part of train if a BLEU source/ref is provided
- Added an exp_per_second hook

* Rename benchmark classes, pass batch-size and log_steps.

* Fix docstring.

* Predict done with checkpoints inline.

- PerfZero base class.

* Use steps, not epochs, with a smoother training loop.

* Do not initialize history outside the loop.

* 5000 steps between evals, not 500.

* Convert Estimator to Keras.

* Remove the epochs var.

* Use range, not xrange.

* 200K steps for 1 GPU.

* Fix global step.
parent 49eaaaf2
@@ -18,7 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# pylint: disable=g-bad-import-order
from absl import flags
import tensorflow as tf

# TODO(tianlin) Import internal library. Remove this when some functions for
# different TF versions are fixed.

@@ -26,11 +28,14 @@ from tensorflow.python import tf2 as tf2_internal
from official.transformer.model import model_params
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils

FLAGS = flags.FLAGS

PARAMS_MAP = {
    'tiny': model_params.TINY_PARAMS,
    'base': model_params.BASE_PARAMS,
    'big': model_params.BIG_PARAMS,
}

@@ -42,12 +47,12 @@ def is_v2():
def get_model_params(param_set, num_gpus):
  """Gets predefined model params."""
  if num_gpus > 1:
    if param_set == 'big':
      return model_params.BIG_MULTI_GPU_PARAMS.copy()
    elif param_set == 'base':
      return model_params.BASE_MULTI_GPU_PARAMS.copy()
    else:
      raise ValueError('Not valid params: param_set={} num_gpus={}'.format(
          param_set, num_gpus))

  return PARAMS_MAP[param_set].copy()
@@ -69,113 +74,175 @@ def define_transformer_flags():
  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  flags.DEFINE_integer(
      name='train_steps', short_name='ts', default=None,
      help=flags_core.help_wrap('The number of steps used to train.'))
  flags.DEFINE_integer(
      name='steps_between_evals', short_name='sbe', default=1000,
      help=flags_core.help_wrap(
          'The number of training steps to run between evaluations. This is '
          'used if --train_steps is defined.'))
  flags.DEFINE_boolean(
      name='enable_tensorboard', default=False,
      help='Whether to enable the TensorBoard callback.')
  flags.DEFINE_string(
      name='profile_steps', default=None,
      help='Save profiling data to model dir at given range of steps. The '
      'value must be a comma separated pair of positive integers, specifying '
      'the first and last step to profile. For example, "--profile_steps=2,4" '
      'triggers the profiler to process 3 steps, starting from the 2nd step. '
      'Note that profiler has a non-trivial performance overhead, and the '
      'output file can be gigantic if profiling many steps.')

  # Set flags from the flags_core module as 'key flags' so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name='param_set', short_name='mp', default='big',
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          'Parameter set to use when creating and training the model. The '
          'parameters define the input shape (batch size and max length), '
          'model configuration (size of embedding, # of hidden layers, etc.), '
          'and various other settings. The big parameter set increases the '
          'default batch size, embedding/hidden size, and filter size. For a '
          'complete list of parameters, please see model/model_params.py.'))

  flags.DEFINE_bool(
      name='static_batch', default=False,
      help=flags_core.help_wrap(
          'Whether the batches in the dataset should have static shapes. In '
          'general, this setting should be False. Dynamic shapes allow the '
          'inputs to be grouped so that the number of padding tokens is '
          'minimized, and helps model training. In cases where the input '
          'shape must be static (e.g. running on TPU), this setting will be '
          'ignored and static batching will always be used.'))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name='validation_steps', short_name='vs', default=64,
      help=flags_core.help_wrap('The number of steps used in validation.'))

  # BLEU score computation
  flags.DEFINE_string(
      name='bleu_source', short_name='bls', default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text to translate when calculating '
          'the official BLEU score. Both --bleu_source and --bleu_ref must be '
          'set. Use the flag --stop_threshold to stop the script based on the '
          'uncased BLEU score.'))
  flags.DEFINE_string(
      name='bleu_ref', short_name='blr', default=None,
      help=flags_core.help_wrap(
          'Path to file containing the reference translations used when '
          'calculating the official BLEU score. Both --bleu_source and '
          '--bleu_ref must be set. Use the flag --stop_threshold to stop the '
          'script based on the uncased BLEU score.'))
  flags.DEFINE_string(
      name='vocab_file', short_name='vf', default=None,
      help=flags_core.help_wrap(
          'Path to subtoken vocabulary file. If data_download.py was used to '
          'download and encode the training data, look in the data_dir to '
          'find the vocab file.'))
  flags.DEFINE_string(
      name='mode', default='train',
      help=flags_core.help_wrap('mode: train, eval, or predict'))

  flags_core.set_defaults(data_dir='/tmp/translate_ende',
                          model_dir='/tmp/transformer_model',
                          batch_size=None,
                          train_epochs=10)

  # pylint: disable=unused-variable
  @flags.multi_flags_validator(
      ['mode', 'train_epochs'],
      message='--train_epochs must be defined in train mode')
  def _check_train_limits(flag_dict):
    if flag_dict['mode'] == 'train':
      return flag_dict['train_epochs'] is not None
    return True

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref'],
      message='Both or neither --bleu_source and --bleu_ref must be defined.')
  def _check_bleu_files(flags_dict):
    return (flags_dict['bleu_source'] is None) == (
        flags_dict['bleu_ref'] is None)

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref', 'vocab_file'],
      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
      'are defined.')
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
      return flags_dict['vocab_file'] is not None
    return True

  @flags.multi_flags_validator(
      ['export_dir', 'vocab_file'],
      message='--vocab_file must be defined if --export_dir is set.')
  def _check_export_vocab_file(flags_dict):
    if flags_dict['export_dir']:
      return flags_dict['vocab_file'] is not None
    return True
  # pylint: enable=unused-variable

  flags_core.require_cloud_storage(['data_dir', 'model_dir', 'export_dir'])
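The flags above are meant to be used together; as a rough illustration of how they and their validators fit (the entry-point name and data paths here are hypothetical, and the benchmarks below set these flags programmatically instead):

argv = [
    'transformer_main.py',                   # hypothetical entry point
    '--param_set=base',
    '--train_steps=200000',
    '--steps_between_evals=5000',
    '--bleu_source=/data/newstest2014.en',   # must be set together with --bleu_ref
    '--bleu_ref=/data/newstest2014.de',
    '--vocab_file=/data/vocab.ende.32768',   # required once bleu_source/bleu_ref are set
]
flags.FLAGS(argv)  # parsing runs the multi_flags_validators defined above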
def get_callbacks():
"""Returns common callbacks."""
callbacks = []
time_callback = keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)
callbacks.append(time_callback)
if FLAGS.enable_tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=FLAGS.model_dir)
callbacks.append(tensorboard_callback)
if FLAGS.profile_steps:
profiler_callback = keras_utils.get_profiler_callback(
FLAGS.model_dir,
FLAGS.profile_steps,
FLAGS.enable_tensorboard)
callbacks.append(profiler_callback)
return callbacks
def build_stats(history, callbacks):
"""Normalizes and returns dictionary of stats.
Args:
history: Results of the training step.
callbacks: a list of callbacks which might include a time history callback
used during keras.fit.
Returns:
Dictionary of normalized results.
"""
stats = {}
if history and history.history:
train_hist = history.history
# Gets final loss from training.
stats['loss'] = train_hist['loss'][-1].item()
if not callbacks:
return stats
# Look for the time history callback which was used during keras.fit
for callback in callbacks:
if isinstance(callback, keras_utils.TimeHistory):
timestamp_log = callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
if len(timestamp_log) > 1:
stats['avg_exp_per_second'] = (
callback.batch_size * callback.log_steps *
(len(callback.timestamp_log)-1) /
(timestamp_log[-1].timestamp - timestamp_log[0].timestamp))
return stats
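As a quick check of the throughput arithmetic above (the numbers are made up): with batch_size=4096, log_steps=100, and 21 entries in timestamp_log spanning 100 seconds, avg_exp_per_second = 4096 * 100 * 20 / 100 = 81,920 examples per second.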
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Keras benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
from official.transformer.v2 import misc
from official.transformer.v2 import transformer_main as transformer_main
from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
FLAGS = flags.FLAGS
class TransformerBenchmark(PerfZeroBenchmark):
"""Methods common to executing transformer w/keras tests.
The code under test for the Transformer Keras models reports the same data
and requires the same FLAG setup.
"""
def __init__(self, output_dir=None, default_flags=None, root_data_dir=None,
flag_methods=None):
self.train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
self.vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
self.bleu_source = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.en')
self.bleu_ref = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.de')
super(TransformerBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods)
def _run_and_report_benchmark(self,
bleu_max=None,
bleu_min=None,
log_steps=None,
total_batch_size=None,
warmup=1):
"""Report benchmark results by writing to local protobuf file.
Args:
bleu_max: highest passing level for bleu score.
bleu_min: lowest passing level for bleu score.
log_steps: How often the log was created for stats['step_timestamp_log'].
total_batch_size: Global batch-size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
"""
start_time_sec = time.time()
task = transformer_main.TransformerTask(FLAGS)
stats = task.train()
wall_time_sec = time.time() - start_time_sec
metrics = []
if 'bleu_uncased' in stats:
metrics.append({'name': 'bleu_uncased',
'value': stats['bleu_uncased'],
'min_value': bleu_min,
'max_value': bleu_max})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup):
# The first entry in the time_log is the start of step 1. The remaining
# entries mark the end of each recorded step.
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second',
'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']})
self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=metrics)
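To make the warmup handling concrete (hypothetical numbers): with total_batch_size=4096, log_steps=10, warmup=1, and 12 entries in stats['step_timestamp_log'], the elapsed time is measured from entry 1 (the first post-warmup entry) to the last entry, and num_examples = 4096 * 10 * (12 - 1 - 1) = 409,600; exp_per_second is that count divided by the elapsed seconds.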
class TransformerBaseKerasAccuracy(TransformerBenchmark):
"""Benchmark accuracy tests for Transformer Base model w/ Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for Transformer Base model w/ Keras.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [misc.define_transformer_flags]
super(TransformerBaseKerasAccuracy, self).__init__(
output_dir=output_dir, root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu.
The paper uses 8 GPUs and a much larger effective batch size; this run will
not converge to the 27.3 BLEU (uncased) SOTA.
"""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
# Sets values directly to avoid validation check.
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096
FLAGS.train_steps = 200000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
# These BLEU scores are based on test runs at this limited number of steps
# and batch size, after verifying SOTA at 8x V100s.
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
bleu_min=25.3,
bleu_max=26)
class TransformerKerasBenchmark(TransformerBenchmark):
"""Benchmarks for Transformer (Base and Big) using Keras."""
def __init__(self, output_dir=None, default_flags=None,
root_data_dir=None, batch_per_gpu=4096):
"""Initialize.
Args:
output_dir: Base directory for saving artifacts, e.g. checkpoints.
default_flags: default flags to use for all tests.
root_data_dir: root directory for data, e.g. training.
batch_per_gpu: batch size to use per gpu.
"""
flag_methods = [misc.define_transformer_flags]
self.batch_per_gpu = batch_per_gpu
super(TransformerKerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
root_data_dir=root_data_dir,
flag_methods=flag_methods)
def benchmark_1_gpu(self):
"""Benchmark 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
def_flags = {}
def_flags['param_set'] = 'base'
def_flags['vocab_file'] = vocab_file
def_flags['data_dir'] = train_data_dir
def_flags['train_steps'] = 200
def_flags['log_steps'] = 10
super(TransformerBaseKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=4096)
class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
def_flags = {}
def_flags['param_set'] = 'big'
def_flags['vocab_file'] = vocab_file
def_flags['data_dir'] = train_data_dir
def_flags['train_steps'] = 200
def_flags['log_steps'] = 10
super(TransformerBigKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags,
root_data_dir=root_data_dir, batch_per_gpu=3072)
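For orientation, this is roughly how a PerfZero-style harness would drive one of these classes; the paths below are hypothetical and are normally supplied by the harness:

benchmark = TransformerBaseKerasBenchmarkReal(
    output_dir='/tmp/perfzero',
    root_data_dir='/data/transformer')
benchmark.benchmark_1_gpu()  # resets flags via _setup(), then trains 200 steps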
@@ -22,15 +22,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tempfile

from absl import app as absl_app
from absl import flags
import tensorflow as tf

# pylint: disable=g-bad-import-order
from official.transformer import compute_bleu
from official.transformer.utils import tokenizer
from official.transformer.v2 import data_pipeline

@@ -89,6 +88,7 @@ class TransformerTask(object):
      flags_obj: Object containing parsed flag values, i.e., FLAGS.
    """
    self.flags_obj = flags_obj
    self.predict_model = None

    # Add flag-defined parameters to params object
    num_gpus = flags_core.get_num_gpus(flags_obj)
@@ -106,52 +106,62 @@ class TransformerTask(object):
  def train(self):
    """Trains the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, True
    _ensure_dir(flags_obj.model_dir)
    model = transformer.create_model(params, is_train)
    opt = self._create_optimizer()
    model.compile(opt, target_tensors=[])
    model.summary()

    map_data_fn = data_pipeline.map_data_for_transformer_fn
    train_ds = data_pipeline.train_input_fn(params)
    train_ds = train_ds.map(
        map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
      flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    for i in range(1, iterations + 1):
      print("Start train iteration:{}/{}".format(i, iterations))
      history = model.fit(
          train_ds,
          initial_epoch=i-1,
          epochs=i,
          steps_per_epoch=flags_obj.steps_between_evals,
          callbacks=callbacks,
          verbose=2)
      print("End train iteration:{}/{} global step:{}".format(
          i,
          iterations,
          i*flags_obj.steps_between_evals))
      tf.compat.v1.logging.info("Train history: {}".format(history.history))
      stats = misc.build_stats(history, callbacks)

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()

    stats = misc.build_stats(history, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
    return stats

  def eval(self):
    """Evaluates the model."""
    if not self.predict_model:
      self.predict_model = transformer.create_model(self.params, False)
    self._load_weights_if_possible(
        self.predict_model,
        tf.train.latest_checkpoint(self.flags_obj.model_dir))
    self.predict_model.summary()
    return evaluate_and_log_bleu(self.predict_model,
                                 self.flags_obj.bleu_source,
                                 self.flags_obj.bleu_ref,
                                 self.flags_obj.vocab_file)

  def predict(self):
    """Predicts result from the model."""
@@ -177,23 +187,20 @@
        params["hidden_size"],
        params["learning_rate_warmup_steps"])
    scheduler_callback = optimizer.LearningRateScheduler(sfunc, init_steps)
    callbacks = misc.get_callbacks()
    callbacks.append(scheduler_callback)
    ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                                        save_weights_only=True))
    return callbacks

  def _load_weights_if_possible(self, model, init_weight_path=None):
    """Loads model weights when it is provided."""
    if init_weight_path:
      tf.compat.v1.logging.info("Load weights: {}".format(init_weight_path))
      model.load_weights(init_weight_path)
    else:
      print("Weights not loaded from path:{}".format(init_weight_path))

  def _create_optimizer(self):
    """Creates optimizer."""

@@ -206,15 +213,6 @@
    return opt

def _ensure_dir(log_dir):
  """Makes the log dir if it does not exist."""
  if not os.path.exists(log_dir):

@@ -238,4 +236,4 @@ def main(_):
if __name__ == "__main__":
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  misc.define_transformer_flags()
  absl_app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for creating PerfZero benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf # pylint: disable=g-bad-import-order
FLAGS = flags.FLAGS
class PerfZeroBenchmark(tf.test.Benchmark):
"""Common methods used in PerfZero Benchmarks.
Handles resetting flags between tests, loading default_flags, and
overriding defaults. PerfZero (OSS) runs each test in a separate process,
which reduces some of the need to reset the flags.
"""
local_flags = None
def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
default_flags: Mapping of flag names to default values applied before each test.
flag_methods: Functions that define the flags required by the tests.
"""
if not output_dir:
output_dir = '/tmp'
self.output_dir = output_dir
self.default_flags = default_flags or {}
self.flag_methods = flag_methods or {}
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def _setup(self):
"""Sets up and resets flags before each test."""
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
if PerfZeroBenchmark.local_flags is None:
for flag_method in self.flag_methods:
flag_method()
# Loads flags to get defaults to then override. List cannot be empty.
flags.FLAGS(['foo'])
# Overrides flag values with defaults for the class of tests.
for k, v in self.default_flags.items():
setattr(FLAGS, k, v)
saved_flag_values = flagsaver.save_flag_values()
PerfZeroBenchmark.local_flags = saved_flag_values
else:
flagsaver.restore_flag_values(PerfZeroBenchmark.local_flags)
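A minimal subclass sketch to show how flag_methods and default_flags are intended to be used; my_define_flags and the train_steps flag are placeholders, not part of this change:

class MyBenchmark(PerfZeroBenchmark):
  """Hypothetical benchmark illustrating the PerfZeroBenchmark contract."""

  def __init__(self, output_dir=None, **kwargs):
    super(MyBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags={'train_steps': 10},  # re-applied by _setup()
        flag_methods=[my_define_flags])     # placeholder flag-defining function

  def benchmark_cpu(self):
    self._setup()  # resets flags and applies default_flags
    FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
    # ...run the code under test, then call self.report_benchmark(...)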