Unverified commit 1e2ceffd authored by Ayushman Kumar, committed by GitHub

Merge pull request #4 from tensorflow/master

Updating 
parents 51e60bab c7adbbe4
@@ -44,6 +44,11 @@ class BenchmarkTimerCallback(tf.keras.callbacks.Callback):
self.batch_start_times[batch] = time.time()
def on_batch_end(self, batch, logs=None):
# If there are multiple steps_per_loop, the end batch index will not be the
# same as the starting index. Use the last starting index instead.
if batch not in self.batch_start_times:
batch = max(self.batch_start_times.keys())
self.batch_stop_times[batch] = time.time()
def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
......
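The body of get_examples_per_sec is truncated in this hunk. As a rough sketch of how the recorded per-batch timestamps could be turned into throughput (an assumption for illustration, not the code in this commit):

def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
  # Sketch: skip warm-up batches, then divide examples seen by elapsed time.
  # Iterates over stop times, since with steps_per_loop > 1 only the last
  # start index of each loop receives a stop timestamp.
  batches = sorted(self.batch_stop_times)[num_batches_to_skip:]
  total_time = sum(self.batch_stop_times[b] - self.batch_start_times[b]
                   for b in batches)
  return batch_size * len(batches) / total_time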
@@ -1305,7 +1305,6 @@ class Resnet50KerasPruningAccuracy(KerasPruningAccuracyBase):
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.0000039,
'use_tf_keras_layers': True,
'pretrained_filepath': tf.train.latest_checkpoint(
os.path.join(root_data_dir, 'resnet50')),
'pruning_begin_step': 0,
@@ -1369,7 +1368,6 @@ class Resnet50KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
default_flags = {
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
'use_tf_keras_layers': True,
}
super(Resnet50KerasPruningBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
......
@@ -18,9 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from absl import app as absl_app
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
from official.benchmark.models import resnet_cifar_model
from official.utils.flags import core as flags_core
@@ -174,7 +174,6 @@ def run(flags_obj):
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
num_epochs=flags_obj.train_epochs,
parse_record_fn=cifar_preprocessing.parse_record,
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
@@ -189,7 +188,6 @@ def run(flags_obj):
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
num_epochs=flags_obj.train_epochs,
parse_record_fn=cifar_preprocessing.parse_record)
steps_per_epoch = (
@@ -284,4 +282,4 @@ def main(_):
if __name__ == '__main__':
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
define_cifar_flags()
absl_app.run(main)
app.run(main)
@@ -281,6 +281,7 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_eager')
FLAGS.batch_size = 128
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
@@ -294,6 +295,7 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS.batch_size = 250
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.single_l2_loss_op = True
self._run_and_report_benchmark()
@@ -324,6 +326,7 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS.num_gpus = 8
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager')
FLAGS.batch_size = 128
@@ -336,6 +339,7 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.use_tf_function = False
FLAGS.use_tf_while_loop = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_eager_fp16')
FLAGS.batch_size = 128
@@ -392,8 +396,7 @@ class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['data_dir'] = ('/readahead/200M/placer/prod/home/distbelief/'
'imagenet-tensorflow/imagenet-2012-tfrecord')
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 110
def_flags['steps_per_loop'] = 20
def_flags['log_steps'] = 10
......
@@ -24,7 +24,6 @@ import copy
import functools
from typing import Any, List, Mapping, Optional, Type
from absl import logging
import dataclasses
import tensorflow as tf
import yaml
@@ -74,8 +73,8 @@ class Config(params_dict.ParamsDict):
"""Returns v with dicts converted to Configs, recursively."""
if not issubclass(subconfig_type, params_dict.ParamsDict):
raise TypeError(
'Subconfig_type should be subclass of ParamsDict, found %r',
subconfig_type)
'Subconfig_type should be subclass of ParamsDict, found {!r}'.format(
subconfig_type))
if isinstance(v, cls.IMMUTABLE_TYPES):
return v
elif isinstance(v, cls.SEQUENCE_TYPES):
@@ -95,7 +94,7 @@ class Config(params_dict.ParamsDict):
elif isinstance(v, dict):
return subconfig_type(v)
else:
raise TypeError('Unknown type: %r' % type(v))
raise TypeError('Unknown type: {!r}'.format(type(v)))
@classmethod
def _export_config(cls, v):
@@ -162,7 +161,9 @@ class Config(params_dict.ParamsDict):
"""
subconfig_type = self._get_subconfig_type(k)
if isinstance(v, dict):
if k not in self.__dict__:
if k not in self.__dict__ or not self.__dict__[k]:
# If the key does not exist or the value is None, a new Config-family object
# should be created for the key.
self.__dict__[k] = subconfig_type(v)
else:
self.__dict__[k].override(v)
@@ -193,15 +194,16 @@ class Config(params_dict.ParamsDict):
'Can not be overridden.'.format(k))
if k not in self.__dict__:
if is_strict:
raise KeyError('The key {!r} does not exist. '
raise KeyError('The key {!r} does not exist in {!r}. '
'To extend the existing keys, use '
'`override` with `is_strict` = False.'.format(k))
'`override` with `is_strict` = False.'.format(
k, type(self)))
else:
self._set(k, v)
else:
if isinstance(v, dict):
if isinstance(v, dict) and self.__dict__[k]:
self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access
elif isinstance(v, params_dict.ParamsDict):
elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]:
self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access
else:
self._set(k, v)
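To make the override semantics above concrete, a hypothetical sketch (this subclass and its fields are made up for illustration and are not part of the diff):

# Hypothetical Config subclass; field names are invented.
@dataclasses.dataclass
class TrainerConfig(Config):
  steps: int = 100
  optimizer: str = 'sgd'

cfg = TrainerConfig()
cfg.override({'steps': 500})                   # Known key: value is replaced.
cfg.override({'warmup': 10}, is_strict=False)  # Unknown key: allowed only when not strict.
# cfg.override({'warmupp': 10}, is_strict=True)  # Strict mode raises the KeyError above.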
@@ -268,6 +270,8 @@ class RuntimeConfig(Config):
multi-worker models with DistributionStrategy.
task_index: If multi-worker training, the task index of this worker.
all_reduce_alg: Defines the algorithm for performing all-reduce.
num_packs: Sets `num_packs` in the cross-device ops used by
MirroredStrategy. For details, see tf.distribute.NcclAllReduce.
"""
distribution_strategy: str = 'mirrored'
enable_eager: bool = False
@@ -281,6 +285,7 @@ class RuntimeConfig(Config):
worker_hosts: Optional[str] = None
task_index: int = -1
all_reduce_alg: Optional[str] = None
num_packs: int = 1
@dataclasses.dataclass
@@ -311,4 +316,3 @@ class CallbacksConfig(Config):
"""
enable_checkpoint_and_export: bool = True
enable_tensorboard: bool = True
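For reference, a sketch of where the new num_packs field typically lands; the wiring below is illustrative, not code from this diff, and `runtime_config` is a hypothetical RuntimeConfig instance:

# Illustrative: num_packs controls how many fused buffers the per-step
# gradients are packed into before the NCCL all-reduce.
cross_device_ops = tf.distribute.NcclAllReduce(num_packs=runtime_config.num_packs)
strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_device_ops)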
@@ -20,6 +20,7 @@ from __future__ import print_function
import json
import os
import tempfile
from absl import logging
import tensorflow as tf
@@ -30,12 +31,29 @@ _SUMMARY_TXT = 'training_summary.txt'
_MIN_SUMMARY_STEPS = 10
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
def _should_export_checkpoint(strategy):
return (not strategy) or strategy.extended.should_checkpoint
def _should_export_summary(strategy):
return (not strategy) or strategy.extended.should_save_summary
def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix):
"""Saves model to with provided checkpoint prefix."""
checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
saved_path = checkpoint.save(checkpoint_path)
logging.info('Saving model as TF checkpoint: %s', saved_path)
if _should_export_checkpoint(strategy):
checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
saved_path = checkpoint.save(checkpoint_path)
logging.info('Saving model as TF checkpoint: %s', saved_path)
else:
# In multi-worker training we need every worker to save a checkpoint, because
# reading variables can trigger cross-worker synchronization, and that
# synchronization needs all workers to participate. To avoid workers
# overwriting each other's files, non-chief workers save to a temporary
# directory.
tmp_dir = tempfile.mkdtemp()
checkpoint.save(os.path.join(tmp_dir, 'ckpt'))
tf.io.gfile.rmtree(tmp_dir)
return
@@ -242,7 +260,13 @@ def run_customized_training_loop(
]
# Create summary writers
summary_dir = os.path.join(model_dir, 'summaries')
if _should_export_summary(strategy):
summary_dir = os.path.join(model_dir, 'summaries')
else:
# In multi-worker training we need every worker to write summaries, because
# reading variables can trigger cross-worker synchronization, and that
# synchronization needs all workers to participate.
summary_dir = tempfile.mkdtemp()
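# The throwaway directory is deleted once training finishes; see the
# tf.io.gfile.rmtree(summary_dir) cleanup near the end of this function.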
eval_summary_writer = tf.summary.create_file_writer(
os.path.join(summary_dir, 'eval'))
if steps_per_loop >= _MIN_SUMMARY_STEPS:
@@ -395,8 +419,8 @@ def run_customized_training_loop(
train_steps(train_iterator,
tf.convert_to_tensor(steps, dtype=tf.int32))
train_loss = _float_metric_value(train_loss_metric)
_run_callbacks_on_batch_end(current_step, {'loss': train_loss})
current_step += steps
_run_callbacks_on_batch_end(current_step - 1, {'loss': train_loss})
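# Reports the last completed step index; BenchmarkTimerCallback.on_batch_end
# falls back to the latest recorded start index when a loop spans many steps.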
# Updates training logging.
training_status = 'Train Step: %d/%d / loss = %s' % (
@@ -418,11 +442,11 @@ def run_customized_training_loop(
# To avoid repeated model saving, we do not save after the last
# step of training.
if current_step < total_training_steps:
_save_checkpoint(checkpoint, model_dir,
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if sub_model_export_name:
_save_checkpoint(
sub_model_checkpoint, model_dir,
strategy, sub_model_checkpoint, model_dir,
'%s_step_%d.ckpt' % (sub_model_export_name, current_step))
if eval_input_fn:
logging.info('Running evaluation after step: %s.', current_step)
@@ -432,10 +456,10 @@ def run_customized_training_loop(
for metric in eval_metrics + model.metrics:
metric.reset_states()
_save_checkpoint(checkpoint, model_dir,
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if sub_model_export_name:
_save_checkpoint(sub_model_checkpoint, model_dir,
_save_checkpoint(strategy, sub_model_checkpoint, model_dir,
'%s.ckpt' % sub_model_export_name)
if eval_input_fn:
@@ -455,4 +479,7 @@ def run_customized_training_loop(
write_txt_summary(training_summary, summary_dir)
if not _should_export_summary(strategy):
tf.io.gfile.rmtree(summary_dir)
return model
@@ -21,6 +21,7 @@ from __future__ import print_function
import os
from absl.testing import parameterized
from absl.testing.absltest import mock
import numpy as np
import tensorflow as tf
@@ -208,6 +209,27 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
check_eventfile_for_keyword('mean_input',
os.path.join(model_dir, 'summaries/eval')))
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.one_device_strategy_gpu,
],
mode='eager',
))
def test_train_check_artifacts_non_chief(self, distribution):
# We shouldn't export artifacts on non-chief workers. Since there's no easy
# way to test with a real MultiWorkerMirroredStrategy, we patch the strategy
# so that it behaves as MultiWorkerMirroredStrategy does on non-chief workers.
extended = distribution.extended
with mock.patch.object(extended.__class__, 'should_checkpoint',
new_callable=mock.PropertyMock, return_value=False), \
mock.patch.object(extended.__class__, 'should_save_summary',
new_callable=mock.PropertyMock, return_value=False):
model_dir = self.get_temp_dir()
self.run_training(
distribution, model_dir, steps_per_loop=10, run_eagerly=False)
self.assertEmpty(tf.io.gfile.listdir(model_dir))
if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
......
@@ -78,7 +78,6 @@ def export_albert_tfhub(albert_config: configs.AlbertConfig,
def main(_):
assert tf.version.VERSION.startswith('2.')
albert_config = configs.AlbertConfig.from_json_file(
FLAGS.albert_config_file)
export_albert_tfhub(albert_config, FLAGS.model_checkpoint_path,
......
@@ -86,5 +86,4 @@ class ExportAlbertTfhubTest(tf.test.TestCase):
if __name__ == "__main__":
assert tf.version.VERSION.startswith('2.')
tf.test.main()
@@ -33,7 +33,6 @@ FLAGS = flags.FLAGS
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
......
@@ -80,7 +80,6 @@ def export_squad(model_export_path, input_meta_data):
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
......
@@ -122,7 +122,6 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
def main(_):
assert tf.version.VERSION.startswith('2.')
output_path = FLAGS.converted_checkpoint_path
v1_checkpoint = FLAGS.checkpoint_to_convert
albert_config = configs.AlbertConfig.from_json_file(FLAGS.albert_config_file)
......
@@ -77,6 +77,8 @@ def define_common_bert_flags():
flags.DEFINE_bool('hub_module_trainable', True,
'True to make keras layers in the hub module trainable.')
flags_core.define_log_steps()
# Adds flags for mixed precision and multi-worker training.
flags_core.define_performance(
num_parallel_calls=False,
......
@@ -77,7 +77,6 @@ def export_bert_tfhub(bert_config: configs.BertConfig,
def main(_):
assert tf.version.VERSION.startswith('2.')
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, FLAGS.export_path,
FLAGS.vocab_file)
......
@@ -84,5 +84,4 @@ class ExportTfhubTest(tf.test.TestCase):
if __name__ == "__main__":
assert tf.version.VERSION.startswith('2.')
tf.test.main()
@@ -169,7 +169,7 @@ def run_bert_classifier(strategy,
epochs,
steps_per_epoch,
eval_steps,
custom_callbacks=None)
custom_callbacks=custom_callbacks)
# Use user-defined loop to start training.
logging.info('Training using customized training loop TF 2.0 with '
@@ -363,6 +363,15 @@ def run_bert(strategy,
if not strategy:
raise ValueError('Distribution strategy has not been specified.')
if FLAGS.log_steps:
custom_callbacks = [keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir,
)]
else:
custom_callbacks = None
trained_model = run_bert_classifier(
strategy,
model_config,
@@ -378,7 +387,8 @@ def run_bert(strategy,
train_input_fn,
eval_input_fn,
run_eagerly=FLAGS.run_eagerly,
use_keras_compile_fit=FLAGS.use_keras_compile_fit)
use_keras_compile_fit=FLAGS.use_keras_compile_fit,
custom_callbacks=custom_callbacks)
if FLAGS.model_export_path:
# As Keras ModelCheckpoint callback used with Keras compile/fit() API
@@ -393,7 +403,6 @@ def run_bert(strategy,
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
......
@@ -159,7 +159,6 @@ def run_bert_pretrain(strategy):
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
if not FLAGS.model_dir:
FLAGS.model_dir = '/tmp/bert20/'
......
@@ -29,6 +29,7 @@ from official.nlp.bert import run_squad_helper
from official.nlp.bert import tokenization
from official.nlp.data import squad_lib as squad_lib_wp
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
flags.DEFINE_string('vocab_file', None,
@@ -75,7 +76,6 @@ def export_squad(model_export_path, input_meta_data):
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
@@ -94,7 +94,21 @@ def main(_):
all_reduce_alg=FLAGS.all_reduce_alg,
tpu_address=FLAGS.tpu)
if FLAGS.mode in ('train', 'train_and_predict'):
train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
if FLAGS.log_steps:
custom_callbacks = [keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir,
)]
else:
custom_callbacks = None
train_squad(
strategy,
input_meta_data,
custom_callbacks=custom_callbacks,
run_eagerly=FLAGS.run_eagerly,
)
if FLAGS.mode in ('predict', 'train_and_predict'):
predict_squad(strategy, input_meta_data)
......
@@ -98,7 +98,7 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
def main(_):
assert tf.version.VERSION.startswith('2.')
tf.enable_v2_behavior()
output_path = FLAGS.converted_checkpoint_path
v1_checkpoint = FLAGS.checkpoint_to_convert
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
......
@@ -6,7 +6,10 @@ assemble new layers, networks, or models.
logic required to generate the einsum expression for the given initialization
parameters.
* [Attention](attention.py) implements an optionally masked attention between two tensors, from_tensor and to_tensor, as described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If `from_tensor` and `to_tensor` are the same, then this is self-attention.
* [MultiHeadAttention](attention.py) implements an optionally masked attention
between two tensors, from_tensor and to_tensor, as described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
If `from_tensor` and `to_tensor` are the same, then this is self-attention.
* [CachedAttention](attention.py) implements an attention layer with cache used
for auto-regressive decoding.
......
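For reference, the core computation the attention layers above implement: a minimal sketch of scaled dot-product attention (the layers in attention.py add multi-head projections, masking options, and caching on top of this):

import tensorflow as tf

def scaled_dot_product_attention(query, key, value, mask=None):
  # softmax(Q K^T / sqrt(d_k)) V, as in "Attention Is All You Need".
  logits = tf.matmul(query, key, transpose_b=True)
  logits /= tf.math.sqrt(tf.cast(tf.shape(key)[-1], logits.dtype))
  if mask is not None:
    # Push masked positions toward -inf so softmax assigns them ~0 weight.
    logits += (1.0 - tf.cast(mask, logits.dtype)) * -1e9
  return tf.matmul(tf.nn.softmax(logits, axis=-1), value)

Self-attention is the special case where query, key, and value are all derived from the same tensor (from_tensor == to_tensor).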