Unverified commit 55bf4b80 authored by Hongkun Yu, committed by GitHub

Merge branch 'master' into absl

parents 15e0057f 2416dd9c
@@ -2,12 +2,10 @@
 This repository contains a number of different models implemented in [TensorFlow](https://www.tensorflow.org):
-The [official models](official) are a collection of example models that use TensorFlow's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend newer TensorFlow users to start here.
+The [official models](official) are a collection of example models that use TensorFlow 2's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend newer TensorFlow users to start here.
 The [research models](https://github.com/tensorflow/models/tree/master/research) are a large collection of models implemented in TensorFlow by researchers. They are not officially supported or available in release branches; it is up to the individual researchers to maintain the models and/or provide support on issues and pull requests.
-The [tutorials folder](tutorials) is a collection of models described in the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
 ## Contribution guidelines
 If you want to contribute to models, be sure to review the [contribution guidelines](CONTRIBUTING.md).
...
 # TensorFlow Official Models
-The TensorFlow official models are a collection of example models that use
+The TensorFlow official models are a collection of models that use
 TensorFlow's high-level APIs. They are intended to be well-maintained, tested,
 and kept up to date with the latest TensorFlow API. They should also be
 reasonably optimized for fast performance while still being easy to read.
@@ -83,7 +83,7 @@ installable Official Models package. This is being tracked in
 * [bert](nlp/bert): A powerful pre-trained language representation model:
   BERT, which stands for Bidirectional Encoder Representations from
   Transformers.
-* [transformer](transformer): A transformer model to translate the WMT English
+* [transformer](nlp/transformer): A transformer model to translate the WMT English
   to German dataset.
 * [xlnet](nlp/xlnet): XLNet: Generalized Autoregressive Pretraining for
   Language Understanding.
...
@@ -23,7 +23,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.benchmark import keras_benchmark
 from official.utils.testing import benchmark_wrappers
-from official.vision.image_classification import resnet_imagenet_main
+from official.vision.image_classification.resnet import resnet_imagenet_main
 MIN_TOP_1_ACCURACY = 0.76
 MAX_TOP_1_ACCURACY = 0.77
@@ -61,18 +61,6 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     super(Resnet50KerasAccuracy, self).__init__(
         output_dir=output_dir, flag_methods=flag_methods)
-  def benchmark_graph_8_gpu(self):
-    """Test Keras model with Keras fit/dist_strat and 8 GPUs."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
   def benchmark_8_gpu(self):
     """Test Keras model with eager, dist_strat and 8 GPUs."""
     self._setup()
@@ -135,30 +123,6 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
     self._run_and_report_benchmark()
-  def benchmark_8_gpu_mlperf_like(self):
-    """Test similar to the rules for MLPerf 0.5.
-    Listed below are reasons this comparison is not to the MLSpec, but this is
-    still a decent directional measurement:
-    - Eval is every 4 epochs and again at the end. ~2 extra times.
-    - Learning rate is not tuned to hit 75%, but we know the model is correct.
-    - We measure total time and MLPerf 0.5 excluded some startup time.
-    - Eval is not on the total set, need to set eval batch_size where
-      8*batch_size/50K is even. 250 is a good number.
-    - Not sure if we are doing any extra or too few steps due to epoch bleed.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_epochs = 61
-    FLAGS.epochs_between_evals = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mlperf_like')
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(top_1_min=0.736)
   def benchmark_xla_8_gpu_fp16_dynamic(self):
     """Test Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
     self._setup()
@@ -921,129 +885,353 @@ class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
     # Cache dataset so performance is stable after the first epoch.
     def_flags['training_dataset_cache'] = True
     def_flags['log_steps'] = 100
+    # Note that for single GPU and pure eager tests which are less likely to be
+    # input bound and more stable, these tests will run for shorter time by
+    # overriding FLAGS.train_epochs, train_steps, log_steps in benchmark
+    # methods, and skip_steps in _run_and_report_benchmark().
     super(Resnet50KerasBenchmarkRemoteData, self).__init__(
         output_dir=output_dir, default_flags=def_flags)
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    # skip the first epoch for performance measurement.
-    super(Resnet50KerasBenchmarkRemoteData,
-          self)._run_and_report_benchmark(skip_steps=600)
-class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
-  """Trivial model with real data benchmark tests."""
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-    def_flags = {}
-    def_flags['use_trivial_model'] = True
-    def_flags['skip_eval'] = True
-    def_flags['report_accuracy_metrics'] = False
-    def_flags['dtype'] = 'fp16'
-    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-    def_flags['train_steps'] = 600
-    def_flags['log_steps'] = 100
-    def_flags['distribution_strategy'] = 'mirrored'
-    super(TrivialKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=def_flags)
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-  def benchmark_8_gpu_warmup(self):
-    """Dummy test that runs over an epoch to warmup the machine."""
+  def _override_flags_to_run_test_shorter(self):
+    FLAGS.train_epochs = 1
+    FLAGS.train_steps = 300
+    FLAGS.log_steps = 10
+  def benchmark_1_gpu_no_dist_strat(self):
+    """Test Keras model with 1 GPU, no distribution strategy."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_steps = 700
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_1_gpu_no_dist_strat(self):
+    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 96  # BatchNorm is less efficient in legacy graph mode
+                           # due to its reliance on v1 cond.
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
   def benchmark_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+    """Test Keras model with 1 GPU."""
     self._setup()
     FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
     FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_amp(self):
+    """Test Keras model with 1 GPU with automatic mixed precision."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+  def benchmark_xla_1_gpu(self):
+    """Test Keras model with XLA and 1 GPU."""
     self._setup()
     FLAGS.num_gpus = 1
-    FLAGS.enable_eager = False
+    FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_xla_1_gpu_amp(self):
+    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_8_gpu(self):
-    """Test trivial Keras model (input pipeline) with 8 GPUs."""
+  def benchmark_1_gpu_fp16(self):
+    """Test Keras model with 1 GPU and fp16."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_1_gpu_fp16_dynamic(self):
+    """Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_xla_1_gpu_fp16(self):
+    """Test Keras model with XLA, 1 GPU and fp16."""
+    self._setup()
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_8_gpu_tweaked(self):
-    """Test trivial Keras model with tuning and 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_8_gpu(self):
-    """Test trivial Keras model in legacy graph mode with 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_dynamic(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_1_gpu(self):
+    """Test Keras model in legacy graph mode with 1 GPU."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu(self):
+    """Test Keras model in legacy graph mode with XLA and 1 GPU."""
+    self._setup()
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
-  def benchmark_graph_8_gpu_tweaked(self):
-    """Test trivial Keras model in legacy graph mode with tuning and 8 GPUs."""
+  def benchmark_graph_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU and fp16."""
     self._setup()
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU, fp16 and XLA."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  def benchmark_graph_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model in legacy graph with 1 GPU, fp16, XLA, and tuning."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
+      # For single GPU and pure eager tests which are less likely to be input
+      # bound and more stable, run for shorter time and use the default
+      # skip_steps.
+      skip_steps = None
+    else:
+      # skip the first epoch for performance measurement.
+      skip_steps = 600
+    super(Resnet50KerasBenchmarkRemoteData,
+          self)._run_and_report_benchmark(skip_steps=skip_steps)
+class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
+  """Trivial model with real data benchmark tests."""
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
+    def_flags = {}
+    def_flags['use_trivial_model'] = True
+    def_flags['skip_eval'] = True
+    def_flags['report_accuracy_metrics'] = False
+    def_flags['dtype'] = 'fp16'
+    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
+    def_flags['train_steps'] = 600
+    def_flags['log_steps'] = 100
+    def_flags['distribution_strategy'] = 'mirrored'
+    super(TrivialKerasBenchmarkReal, self).__init__(
+        output_dir=output_dir,
+        flag_methods=flag_methods,
+        default_flags=def_flags)
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = resnet_imagenet_main.run(FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
+        stats,
+        wall_time_sec,
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+  def benchmark_8_gpu_warmup(self):
+    """Dummy test that runs over an epoch to warmup the machine."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_eager = True
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
+    FLAGS.batch_size = 256 * 8
+    FLAGS.train_steps = 700
     self._run_and_report_benchmark()
   def fill_report_object(self, stats):
...
@@ -28,8 +28,8 @@ from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
-from official.vision.image_classification import cifar_preprocessing
-from official.vision.image_classification import common
+from official.vision.image_classification.resnet import cifar_preprocessing
+from official.vision.image_classification.resnet import common
 LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
...
@@ -27,7 +27,7 @@ from tensorflow.python.platform import googletest
 from official.benchmark.models import resnet_cifar_main
 from official.utils.misc import keras_utils
 from official.utils.testing import integration
-from official.vision.image_classification import cifar_preprocessing
+from official.vision.image_classification.resnet import cifar_preprocessing
 class KerasCifarTest(googletest.TestCase):
...
@@ -22,8 +22,8 @@ import time
 from absl import flags
 import tensorflow as tf
-from official.vision.image_classification import common
-from official.vision.image_classification import resnet_ctl_imagenet_main
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import resnet_ctl_imagenet_main
 from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
 from official.utils.testing import benchmark_wrappers
 from official.utils.flags import core as flags_core
@@ -87,10 +87,9 @@ class CtlBenchmark(PerfZeroBenchmark):
       # first entry in the time_log is start of step 0. The rest of the
       # entries are the end of each step recorded
       time_log = stats['step_timestamp_log']
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      examples_per_sec = num_examples / elapsed
+      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
+      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
+      examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
       metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
     if 'avg_exp_per_second' in stats:
...
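Editor's note on the throughput change above: the old `exp_per_second` value assumed exactly `log_steps` steps between timestamp entries, while the new code derives the elapsed step count from the recorded batch indices, so the result stays correct even if the logging interval varies. A rough, self-contained sketch of the new calculation follows; the `BatchTimestamp` record name is an assumption, only the `batch_index`/`timestamp` fields appear in the diff.

```python
# Illustrative sketch only -- mirrors the new examples-per-second math above.
from collections import namedtuple

# Assumed stand-in for the entries stored in stats['step_timestamp_log'].
BatchTimestamp = namedtuple('BatchTimestamp', ['batch_index', 'timestamp'])

def examples_per_second(time_log, total_batch_size, warmup=1):
  """Throughput between the first post-warmup entry and the last entry."""
  steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
  time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
  return total_batch_size * (steps_elapsed / time_elapsed)

# Example: entries logged every 100 steps with a global batch size of 1024.
log = [BatchTimestamp(0, 0.0), BatchTimestamp(100, 50.0), BatchTimestamp(200, 98.0)]
print(examples_per_second(log, total_batch_size=1024))  # 1024 * 100 / 48, about 2133 examples/sec
```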
@@ -41,8 +41,14 @@ class TfHubMemoryUsageBenchmark(PerfZeroBenchmark):
         output_dir=output_dir, default_flags=default_flags, **kwargs)
     if hub_model_handle_list:
       for hub_model_handle in hub_model_handle_list.split(';'):
+        # Converts a model handle of the form
+        # https://tfhub.dev/google/nnlm-en-dim128/1 to a valid python method
+        # name like google_nnlm_en_dim128_1.
+        hub_model_method_name = hub_model_handle.replace(
+            'https://tfhub.dev',
+            '').replace('/', '_').replace('-', '_').strip('_')
         setattr(
-            self, 'benchmark_' + hub_model_handle,
+            self, 'benchmark_' + hub_model_method_name,
             functools.partial(self.benchmark_memory_usage, hub_model_handle))
   def benchmark_memory_usage(
...
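The added comment above documents the handle-to-method-name conversion; benchmark methods must be valid Python attribute names, which a raw tfhub.dev URL is not. A small standalone illustration of the same string transformation:

```python
# Illustrative sketch of the conversion used when registering benchmark methods.
def handle_to_method_name(hub_model_handle):
  """E.g. 'https://tfhub.dev/google/nnlm-en-dim128/1' -> 'google_nnlm_en_dim128_1'."""
  return hub_model_handle.replace('https://tfhub.dev', '').replace(
      '/', '_').replace('-', '_').strip('_')

assert handle_to_method_name(
    'https://tfhub.dev/google/nnlm-en-dim128/1') == 'google_nnlm_en_dim128_1'
```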
@@ -102,6 +102,7 @@ def run_customized_training_loop(
       strategy=None,
       model_fn=None,
       loss_fn=None,
+      scale_loss=True,
       model_dir=None,
       train_input_fn=None,
       steps_per_epoch=None,
@@ -129,6 +130,8 @@ def run_customized_training_loop(
         to be used for initial checkpoint -- if provided.
       loss_fn: Function with signature func(labels, logits) and returns a loss
         tensor.
+      scale_loss: Whether to divide the raw loss by number of replicas before
+        gradients calculation.
       model_dir: Model directory used during training for restoring/saving model
         weights.
       train_input_fn: Function that returns a tf.data.Dataset used for training.
@@ -211,7 +214,7 @@ def run_customized_training_loop(
   if run_eagerly:
     if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
       raise ValueError(
-          'TPUStrategy should not run eagerly as it heavily replies on graph'
+          'TPUStrategy should not run eagerly as it heavily relies on graph'
           ' optimization for the distributed system.')
   if eval_input_fn and (eval_steps is None or metric_fn is None):
@@ -223,9 +226,6 @@ def run_customized_training_loop(
         'if `metric_fn` is specified, metric_fn must be a callable.')
   total_training_steps = steps_per_epoch * epochs
-  # To reduce unnecessary send/receive input pipeline operation, we place input
-  # pipeline ops in worker task.
   train_iterator = _get_input_iterator(train_input_fn, strategy)
   with distribution_utils.get_strategy_scope(strategy):
@@ -287,6 +287,12 @@ def run_customized_training_loop(
       with tf.GradientTape() as tape:
         model_outputs = model(inputs, training=True)
         loss = loss_fn(labels, model_outputs)
+        # Raw loss is used for reporting in metrics/logs.
+        raw_loss = loss
+        if scale_loss:
+          # Scales down the loss for gradients to be invariant from replicas.
+          loss = loss / strategy.num_replicas_in_sync
       if explicit_allreduce:
         grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                      training_vars,
@@ -303,7 +309,7 @@ def run_customized_training_loop(
         grads = tape.gradient(loss, training_vars)
         optimizer.apply_gradients(zip(grads, training_vars))
       # For reporting, the metric takes the mean of losses.
-      train_loss_metric.update_state(loss)
+      train_loss_metric.update_state(raw_loss)
       for metric in train_metrics:
         metric.update_state(labels, model_outputs)
@@ -324,7 +330,7 @@ def run_customized_training_loop(
                          'retracing.')
       for _ in tf.range(steps):
-        strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+        strategy.run(_replicated_step, args=(next(iterator),))
     def train_single_step(iterator):
       """Performs a distributed training step.
@@ -335,7 +341,7 @@ def run_customized_training_loop(
       Raises:
         ValueError: Any of the arguments or tensor shapes are invalid.
       """
-      strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+      strategy.run(_replicated_step, args=(next(iterator),))
     def test_step(iterator):
       """Calculates evaluation metrics on distributed devices."""
@@ -348,7 +354,7 @@ def run_customized_training_loop(
         for metric in eval_metrics:
           metric.update_state(labels, model_outputs)
-      strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      strategy.run(_test_step_fn, args=(next(iterator),))
   if not run_eagerly:
     train_single_step = tf.function(train_single_step)
...
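The new `scale_loss` option above divides the per-replica loss by `strategy.num_replicas_in_sync` before gradients are taken, while the unscaled `raw_loss` is still what gets reported. The reason is that a synchronous strategy sums gradients across replicas, so dividing the loss keeps the effective update equivalent to training on the full global batch. A minimal sketch of the idea under a generic MirroredStrategy setup; the names here are illustrative and this is not the training loop above.

```python
# Minimal sketch of per-replica loss scaling; assumes a generic tf.distribute setup.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  optimizer = tf.keras.optimizers.SGD(0.1)

def replica_step(features, labels, scale_loss=True):
  with tf.GradientTape() as tape:
    predictions = model(features, training=True)
    loss = tf.reduce_mean(tf.square(labels - predictions))  # mean over the local batch
    raw_loss = loss  # unscaled value, suitable for metrics/logging
    if scale_loss:
      # Gradients are summed across replicas by the strategy, so dividing the
      # per-replica loss keeps the update invariant to the replica count.
      loss = loss / strategy.num_replicas_in_sync
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return raw_loss

# Invoked once per replica, e.g. via strategy.run(replica_step, args=(x, y)).
```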
@@ -233,5 +233,4 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
 if __name__ == '__main__':
-  assert tf.version.VERSION.startswith('2.')
   tf.test.main()
@@ -243,10 +243,10 @@ class DistributedExecutor(object):
         raise ValueError('steps should be a Tensor. Python object may cause '
                          'retracing.')
-      per_replica_losses = strategy.experimental_run_v2(
+      per_replica_losses = strategy.run(
           _replicated_step, args=(next(iterator),))
       for _ in tf.range(num_steps - 1):
-        per_replica_losses = strategy.experimental_run_v2(
+        per_replica_losses = strategy.run(
             _replicated_step, args=(next(iterator),))
       # For reporting, we return the mean of losses.
@@ -278,7 +278,7 @@ class DistributedExecutor(object):
           metric.update_state(labels, model_outputs)
         return labels, model_outputs
-      return strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      return strategy.run(_test_step_fn, args=(next(iterator),))
     return test_step
...
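Several files in this commit replace `strategy.experimental_run_v2(...)` with `strategy.run(...)`. The semantics are unchanged, the given step function is executed once on every replica and the per-replica results are returned; only the method name changed as the API left experimental status. A short, self-contained sketch of the call pattern used throughout; the dataset and step function here are illustrative.

```python
# Sketch of the strategy.run() call pattern that replaces experimental_run_v2.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices(tf.ones([8, 2])).batch(4)
dist_iterator = iter(strategy.experimental_distribute_dataset(dataset))

def step_fn(batch):
  # Runs once per replica; each replica sees its shard of the global batch.
  return tf.reduce_sum(batch)

# Previously: strategy.experimental_run_v2(step_fn, args=(next(dist_iterator),))
per_replica = strategy.run(step_fn, args=(next(dist_iterator),))
total = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)
print(total)  # sum over the full global batch
```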
@@ -61,7 +61,7 @@ common_flags.define_common_bert_flags()
 FLAGS = flags.FLAGS
-def get_loss_fn(num_classes, loss_factor=1.0):
+def get_loss_fn(num_classes):
   """Gets the classification loss function."""
   def classification_loss_fn(labels, logits):
@@ -72,9 +72,7 @@ def get_loss_fn(num_classes, loss_factor=1.0):
         tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
     per_example_loss = -tf.reduce_sum(
         tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
-    loss = tf.reduce_mean(per_example_loss)
-    loss *= loss_factor
-    return loss
+    return tf.reduce_mean(per_example_loss)
   return classification_loss_fn
@@ -135,17 +133,7 @@ def run_bert_classifier(strategy,
         use_graph_rewrite=common_flags.use_graph_rewrite())
     return classifier_model, core_model
-  # During distributed training, loss used for gradient computation is
-  # summed over from all replicas. When Keras compile/fit() API is used,
-  # the fit() API internally normalizes the loss by dividing the loss by
-  # the number of replicas used for computation. However, when custom
-  # training loop is used this is not done automatically and should be
-  # done manually by the end user.
-  loss_multiplier = 1.0
-  if FLAGS.scale_loss and not use_keras_compile_fit:
-    loss_multiplier = 1.0 / strategy.num_replicas_in_sync
-  loss_fn = get_loss_fn(num_classes, loss_factor=loss_multiplier)
+  loss_fn = get_loss_fn(num_classes)
   # Defines evaluation metrics function, which will create metrics in the
   # correct device and strategy scope.
@@ -267,7 +255,7 @@ def get_predictions_and_labels(strategy, trained_model, eval_input_fn,
       model_outputs = trained_model(inputs, training=False)
       return model_outputs, labels
-    outputs, labels = strategy.experimental_run_v2(
+    outputs, labels = strategy.run(
         _test_step_fn, args=(next(iterator),))
     # outputs: current batch logits as a tuple of shard logits
     outputs = tf.nest.map_structure(strategy.experimental_local_results,
...
@@ -74,11 +74,11 @@ def get_pretrain_dataset_fn(input_file_pattern, seq_length,
   return _dataset_fn
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Returns loss function for BERT pretraining."""
   def _bert_pretrain_loss_fn(unused_labels, losses, **unused_args):
-    return tf.reduce_mean(losses) * loss_factor
+    return tf.reduce_mean(losses)
   return _bert_pretrain_loss_fn
@@ -116,9 +116,8 @@ def run_customized_training(strategy,
   trained_model = model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_pretrain_model,
-      loss_fn=get_loss_fn(
-          loss_factor=1.0 /
-          strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0),
+      loss_fn=get_loss_fn(),
+      scale_loss=FLAGS.scale_loss,
       model_dir=model_dir,
       train_input_fn=train_input_fn,
       steps_per_epoch=steps_per_epoch,
...
@@ -24,6 +24,7 @@ from absl import logging
 import tensorflow as tf
 from official.modeling import model_training_utils
+from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
 from official.nlp.bert import common_flags
@@ -89,8 +90,7 @@ FLAGS = flags.FLAGS
 def squad_loss_fn(start_positions,
                   end_positions,
                   start_logits,
-                  end_logits,
-                  loss_factor=1.0):
+                  end_logits):
   """Returns sparse categorical crossentropy for start/end logits."""
   start_loss = tf.keras.losses.sparse_categorical_crossentropy(
       start_positions, start_logits, from_logits=True)
@@ -98,11 +98,10 @@ def squad_loss_fn(start_positions,
       end_positions, end_logits, from_logits=True)
   total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
-  total_loss *= loss_factor
   return total_loss
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Gets a loss function for squad task."""
   def _loss_fn(labels, model_outputs):
@@ -113,8 +112,7 @@ def get_loss_fn(loss_factor=1.0):
         start_positions,
         end_positions,
         start_logits,
-        end_logits,
-        loss_factor=loss_factor)
+        end_logits)
   return _loss_fn
@@ -194,8 +192,7 @@ def predict_squad_customized(strategy, input_meta_data, bert_config,
           start_logits=start_logits,
           end_logits=end_logits)
-    outputs = strategy.experimental_run_v2(
-        _replicated_step, args=(next(iterator),))
+    outputs = strategy.run(_replicated_step, args=(next(iterator),))
     return tf.nest.map_structure(strategy.experimental_local_results, outputs)
   all_results = []
@@ -219,10 +216,7 @@ def train_squad(strategy,
                     ' strategy.')
   # Enables XLA in Session Config. Should not be set for TPU.
   keras_utils.set_config_v2(FLAGS.enable_xla)
-  use_float16 = common_flags.use_float16()
-  if use_float16:
-    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+  performance.set_mixed_precision_policy(common_flags.dtype())
   epochs = FLAGS.num_train_epochs
   num_train_examples = input_meta_data['train_data_size']
@@ -242,33 +236,16 @@ def train_squad(strategy,
         max_seq_length,
         hub_module_url=FLAGS.hub_module_url,
         hub_module_trainable=FLAGS.hub_module_trainable)
-    squad_model.optimizer = optimization.create_optimizer(
-        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
-    if use_float16:
-      # Wraps optimizer with a LossScaleOptimizer. This is done automatically
-      # in compile() with the "mixed_float16" policy, but since we do not call
-      # compile(), we must wrap the optimizer manually.
-      squad_model.optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              squad_model.optimizer, loss_scale=common_flags.get_loss_scale()))
-    if FLAGS.fp16_implementation == 'graph_rewrite':
-      # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
-      # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
-      # which will ensure tf.compat.v2.keras.mixed_precision and
-      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
-      # up.
-      squad_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-          squad_model.optimizer)
+    optimizer = optimization.create_optimizer(FLAGS.learning_rate,
+                                              steps_per_epoch * epochs,
+                                              warmup_steps)
+    squad_model.optimizer = performance.configure_optimizer(
+        optimizer,
+        use_float16=common_flags.use_float16(),
+        use_graph_rewrite=common_flags.use_graph_rewrite())
     return squad_model, core_model
-  # The original BERT model does not scale the loss by
-  # 1/num_replicas_in_sync. It could be an accident. So, in order to use
-  # the same hyper parameter, we do the same thing here by keeping each
-  # replica loss as it is.
-  loss_fn = get_loss_fn(
-      loss_factor=1.0 /
-      strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)
   # If explicit_allreduce = True, apply_gradients() no longer implicitly
   # allreduce gradients, users manually allreduce gradient and pass the
   # allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will be
@@ -281,7 +258,7 @@ def train_squad(strategy,
   model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_squad_model,
-      loss_fn=loss_fn,
+      loss_fn=get_loss_fn(),
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
...
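Here the hand-rolled mixed-precision setup is replaced by `performance.set_mixed_precision_policy()` and `performance.configure_optimizer()`. The diff does not show those helpers' bodies; as a rough sketch reconstructed purely from the block they replace, such a helper presumably looks something like the following. This is an assumption, not the actual `official.modeling.performance` implementation.

```python
# Hypothetical reconstruction based on the removed code above; the real helper
# in official/modeling/performance.py may differ.
import tensorflow as tf

def configure_optimizer(optimizer, use_float16=False, use_graph_rewrite=False,
                        loss_scale='dynamic'):
  if use_float16:
    # compile() would wrap the optimizer automatically under the
    # 'mixed_float16' policy; a custom training loop has to do it manually.
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale=loss_scale)
  if use_graph_rewrite:
    # The graph rewrite path expects a float32 model, so the two mechanisms
    # are not applied to the same graph twice.
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer)
  return optimizer
```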
@@ -98,7 +98,6 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
 def main(_):
-  tf.enable_v2_behavior()
   output_path = FLAGS.converted_checkpoint_path
   v1_checkpoint = FLAGS.checkpoint_to_convert
   bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
...
@@ -24,6 +24,7 @@ import os
 from absl import logging
 import tensorflow as tf
+import tensorflow_datasets as tfds
 from official.nlp.bert import tokenization
@@ -386,6 +387,99 @@ class QnliProcessor(DataProcessor):
     return examples
+class TfdsProcessor(DataProcessor):
+  """Processor for generic text classification TFDS data set.
+  The TFDS parameters are expected to be provided in the tfds_params string, in
+  a comma-separated list of parameter assignments.
+  Examples:
+    tfds_params="dataset=scicite,text_key=string"
+    tfds_params="dataset=imdb_reviews,test_split=,dev_split=test"
+    tfds_params="dataset=glue/cola,text_key=sentence"
+    tfds_params="dataset=glue/sst2,text_key=sentence"
+    tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
+    tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
+  Possible parameters (please refer to the documentation of TensorFlow Datasets
+  (TFDS) for the meaning of individual parameters):
+    dataset: Required dataset name (potentially with subset and version number).
+    data_dir: Optional TFDS source root directory.
+    train_split: Name of the train split (defaults to `train`).
+    dev_split: Name of the dev split (defaults to `validation`).
+    test_split: Name of the test split (defaults to `test`).
+    text_key: Key of the text_a feature (defaults to `text`).
+    text_b_key: Key of the second text feature if available.
+    label_key: Key of the label feature (defaults to `label`).
+    test_text_key: Key of the text feature to use in test set.
+    test_text_b_key: Key of the second text feature to use in test set.
+    test_label: String to be used as the label for all test examples.
+  """
+  def __init__(self, tfds_params,
+               process_text_fn=tokenization.convert_to_unicode):
+    super(TfdsProcessor, self).__init__(process_text_fn)
+    self._process_tfds_params_str(tfds_params)
+    self.dataset, info = tfds.load(self.dataset_name, data_dir=self.data_dir,
+                                   with_info=True)
+    self._labels = list(range(info.features[self.label_key].num_classes))
+  def _process_tfds_params_str(self, params_str):
+    """Extracts TFDS parameters from a comma-separated assignments string."""
+    tuples = [x.split("=") for x in params_str.split(",")]
+    d = {k.strip(): v.strip() for k, v in tuples}
+    self.dataset_name = d["dataset"]  # Required.
+    self.data_dir = d.get("data_dir", None)
+    self.train_split = d.get("train_split", "train")
+    self.dev_split = d.get("dev_split", "validation")
+    self.test_split = d.get("test_split", "test")
+    self.text_key = d.get("text_key", "text")
+    self.text_b_key = d.get("text_b_key", None)
+    self.label_key = d.get("label_key", "label")
+    self.test_text_key = d.get("test_text_key", self.text_key)
+    self.test_text_b_key = d.get("test_text_b_key", self.text_b_key)
+    self.test_label = d.get("test_label", "test_example")
+  def get_train_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.train_split, "train")
+  def get_dev_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.dev_split, "dev")
+  def get_test_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.test_split, "test")
+  def get_labels(self):
+    return self._labels
+  def get_processor_name(self):
+    return "TFDS_" + self.dataset_name
+  def _create_examples(self, split_name, set_type):
+    """Creates examples for the training and dev sets."""
+    if split_name not in self.dataset:
+      raise ValueError("Split {} not available.".format(split_name))
+    dataset = self.dataset[split_name].as_numpy_iterator()
+    examples = []
+    text_b = None
+    for i, example in enumerate(dataset):
+      guid = "%s-%s" % (set_type, i)
+      if set_type == "test":
+        text_a = self.process_text_fn(example[self.test_text_key])
+        if self.test_text_b_key:
+          text_b = self.process_text_fn(example[self.test_text_b_key])
+        label = self.test_label
+      else:
+        text_a = self.process_text_fn(example[self.text_key])
+        if self.text_b_key:
+          text_b = self.process_text_fn(example[self.text_b_key])
+        label = int(example[self.label_key])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
 def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
...
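For reference, the `tfds_params` string accepted by the new `TfdsProcessor` is an ordinary comma-separated list of key=value pairs; anything not supplied falls back to the defaults listed in the docstring. A small illustration of how one of the example strings above is parsed, mirroring `_process_tfds_params_str`:

```python
# Illustrative parse of a tfds_params string from the docstring above.
params_str = "dataset=glue/qnli,text_key=question,text_b_key=sentence"
d = {k.strip(): v.strip() for k, v in (kv.split("=") for kv in params_str.split(","))}

assert d["dataset"] == "glue/qnli"
assert d["text_b_key"] == "sentence"
# Unspecified keys use the documented defaults, e.g. label_key -> "label",
# train_split -> "train", dev_split -> "validation".
```

A processor built this way, `classifier_data_lib.TfdsProcessor(tfds_params=..., process_text_fn=...)`, then exposes the same `get_train_examples` / `get_dev_examples` / `get_test_examples` interface as the existing GLUE processors.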
@@ -104,22 +104,16 @@ flags.DEFINE_enum(
     "or sentence_piece tokenizer. Canonical BERT uses word_piece tokenizer, "
     "while ALBERT uses sentence_piece tokenizer.")
+flags.DEFINE_string("tfds_params", "",
+                    "Comma-separated list of TFDS parameter assignments for "
+                    "generic classification data import (for more details "
+                    "see the TfdsProcessor class documentation).")
 def generate_classifier_dataset():
   """Generates classifier dataset and returns input meta data."""
-  assert FLAGS.input_data_dir and FLAGS.classification_task_name
-  processors = {
-      "cola": classifier_data_lib.ColaProcessor,
-      "mnli": classifier_data_lib.MnliProcessor,
-      "mrpc": classifier_data_lib.MrpcProcessor,
-      "qnli": classifier_data_lib.QnliProcessor,
-      "sst-2": classifier_data_lib.SstProcessor,
-      "xnli": classifier_data_lib.XnliProcessor,
-  }
-  task_name = FLAGS.classification_task_name.lower()
-  if task_name not in processors:
-    raise ValueError("Task not found: %s" % (task_name))
+  assert (FLAGS.input_data_dir and FLAGS.classification_task_name
+          or FLAGS.tfds_params)
   if FLAGS.tokenizer_impl == "word_piece":
     tokenizer = tokenization.FullTokenizer(
@@ -131,14 +125,38 @@ def generate_classifier_dataset():
     processor_text_fn = functools.partial(
         tokenization.preprocess_text, lower=FLAGS.do_lower_case)
-  processor = processors[task_name](processor_text_fn)
-  return classifier_data_lib.generate_tf_record_from_data_file(
-      processor,
-      FLAGS.input_data_dir,
-      tokenizer,
-      train_data_output_path=FLAGS.train_data_output_path,
-      eval_data_output_path=FLAGS.eval_data_output_path,
-      max_seq_length=FLAGS.max_seq_length)
+  if FLAGS.tfds_params:
+    processor = classifier_data_lib.TfdsProcessor(
+        tfds_params=FLAGS.tfds_params,
+        process_text_fn=processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        None,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
+  else:
+    processors = {
+        "cola": classifier_data_lib.ColaProcessor,
+        "mnli": classifier_data_lib.MnliProcessor,
+        "mrpc": classifier_data_lib.MrpcProcessor,
+        "qnli": classifier_data_lib.QnliProcessor,
+        "sst-2": classifier_data_lib.SstProcessor,
+        "xnli": classifier_data_lib.XnliProcessor,
+    }
+    task_name = FLAGS.classification_task_name.lower()
+    if task_name not in processors:
+      raise ValueError("Task not found: %s" % (task_name))
+    processor = processors[task_name](processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        FLAGS.input_data_dir,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
 def generate_squad_dataset():
...
@@ -47,8 +47,10 @@ class UnicodeRegex(object):
     self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
   def property_chars(self, prefix):
-    return "".join(six.unichr(x) for x in range(sys.maxunicode)
-                   if unicodedata.category(six.unichr(x)).startswith(prefix))
+    return "".join(
+        six.unichr(x)
+        for x in range(sys.maxunicode)
+        if unicodedata.category(six.unichr(x)).startswith(prefix))
 uregex = UnicodeRegex()
@@ -92,9 +94,10 @@ def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
       tf.io.gfile.GFile(hyp_filename).read()).strip().splitlines()
   if len(ref_lines) != len(hyp_lines):
-    raise ValueError("Reference and translation files have different number of "
-                     "lines. If training only a few steps (100-200), the "
-                     "translation may be empty.")
+    raise ValueError(
+        "Reference and translation files have different number of "
+        "lines (%d VS %d). If training only a few steps (100-200), the "
+        "translation may be empty." % (len(ref_lines), len(hyp_lines)))
   if not case_sensitive:
     ref_lines = [x.lower() for x in ref_lines]
     hyp_lines = [x.lower() for x in hyp_lines]
@@ -116,18 +119,23 @@ def main(unused_argv):
 def define_compute_bleu_flags():
   """Add flags for computing BLEU score."""
   flags.DEFINE_string(
-      name="translation", default=None,
+      name="translation",
+      default=None,
       help=flags_core.help_wrap("File containing translated text."))
   flags.mark_flag_as_required("translation")
   flags.DEFINE_string(
-      name="reference", default=None,
+      name="reference",
+      default=None,
       help=flags_core.help_wrap("File containing reference translation."))
   flags.mark_flag_as_required("reference")
   flags.DEFINE_enum(
-      name="bleu_variant", short_name="bv", default="both",
-      enum_values=["both", "uncased", "cased"], case_sensitive=False,
+      name="bleu_variant",
+      short_name="bv",
+      default="both",
+      enum_values=["both", "uncased", "cased"],
+      case_sensitive=False,
       help=flags_core.help_wrap(
          "Specify one or more BLEU variants to calculate. Variants: \"cased\""
          ", \"uncased\", or \"both\"."))
...
@@ -280,7 +280,7 @@ class TransformerTask(object):
       for _ in tf.range(steps):
         train_loss_metric.reset_states()
-        self.distribution_strategy.experimental_run_v2(
+        self.distribution_strategy.run(
             _step_fn, args=(next(iterator),))
     cased_score, uncased_score = None, None
...
@@ -132,7 +132,7 @@ def translate_file(model,
       val_outputs, _ = model([val_inputs], training=False)
       return tag, val_outputs
-    return distribution_strategy.experimental_run_v2(_step_fn, args=(inputs,))
+    return distribution_strategy.run(_step_fn, args=(inputs,))
   translations = []
   if distribution_strategy:
@@ -151,7 +151,7 @@ def translate_file(model,
         replica_id = replica_context.replica_id_in_sync_group
         return replica_id, text[replica_id]
-      text = distribution_strategy.experimental_run_v2(text_as_per_replica)
+      text = distribution_strategy.run(text_as_per_replica)
       outputs = distribution_strategy.experimental_local_results(
           predict_step(text))
       tags, unordered_val_outputs = outputs[0]
...
...@@ -29,6 +29,8 @@ import six ...@@ -29,6 +29,8 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf import tensorflow as tf
# pylint: disable=g-complex-comprehension
PAD = "<pad>" PAD = "<pad>"
PAD_ID = 0 PAD_ID = 0
EOS = "<EOS>" EOS = "<EOS>"
...@@ -46,27 +48,36 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") ...@@ -46,27 +48,36 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
_UNDEFINED_UNICODE = u"\u3013" _UNDEFINED_UNICODE = u"\u3013"
def alphanumeric_char_set():
return set(
six.unichr(i)
for i in xrange(sys.maxunicode)
if (unicodedata.category(six.unichr(i)).startswith("L") or
unicodedata.category(six.unichr(i)).startswith("N")))
# Set contains all letter and number characters. # Set contains all letter and number characters.
_ALPHANUMERIC_CHAR_SET = set( _ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
six.unichr(i) for i in xrange(sys.maxunicode)
if (unicodedata.category(six.unichr(i)).startswith("L") or
unicodedata.category(six.unichr(i)).startswith("N")))
# min_count is the minimum number of times a subtoken must appear in the data # min_count is the minimum number of times a subtoken must appear in the data
# before before it is added to the vocabulary. The value is found using binary # before before it is added to the vocabulary. The value is found using binary
# search to obtain the target vocabulary size. # search to obtain the target vocabulary size.
_MIN_MIN_COUNT = 1 # min value to use when binary searching for min_count _MIN_MIN_COUNT = 1 # min value to use when binary searching for min_count
_MAX_MIN_COUNT = 1000 # max value to use when binary searching for min_count _MAX_MIN_COUNT = 1000 # max value to use when binary searching for min_count
class Subtokenizer(object): class Subtokenizer(object):
"""Encodes and decodes strings to/from integer IDs.""" """Encodes and decodes strings to/from integer IDs."""
def __init__(self, vocab_file, reserved_tokens=None): def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
"""Initializes class, creating a vocab file if data_files is provided.""" """Initializes class, creating a vocab file if data_files is provided."""
logging.info("Initializing Subtokenizer from file %s." % logging.info("Initializing Subtokenizer from file %s." %
vocab_file) vocab_file)
if master_char_set is None:
master_char_set = _ALPHANUMERIC_CHAR_SET
if reserved_tokens is None: if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS reserved_tokens = RESERVED_TOKENS
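A hedged usage sketch of the new `master_char_set` hook introduced above; the import path and vocab file name are assumptions, not part of this diff. By default behaviour is unchanged, since `master_char_set` falls back to `_ALPHANUMERIC_CHAR_SET` inside `__init__`.

```python
# Import path assumed; adjust to wherever this module lives in your checkout.
from official.nlp.transformer.utils.tokenizer import (Subtokenizer,
                                                      alphanumeric_char_set)

# Default behaviour: letters and numbers are the "word" characters.
default_tokenizer = Subtokenizer("vocab.ende.32768")  # hypothetical vocab file

# Callers can widen the word-character set, e.g. to keep apostrophes attached
# to alphanumeric runs when text is split into tokens.
custom_chars = alphanumeric_char_set() | {u"'"}
custom_tokenizer = Subtokenizer(
    "vocab.ende.32768", master_char_set=custom_chars)
```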
@@ -79,13 +90,20 @@ class Subtokenizer(object):
       self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
 
     # Create cache to speed up subtokenization
-    self._cache_size = 2 ** 20
+    self._cache_size = 2**20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
 
   @staticmethod
-  def init_from_files(
-      vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+  def init_from_files(vocab_file,
+                      files,
+                      target_vocab_size,
+                      threshold,
+                      min_count=None,
+                      file_byte_limit=1e6,
+                      reserved_tokens=None,
+                      correct_strip=True,
+                      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.
 
     Args:
@@ -102,10 +120,13 @@ class Subtokenizer(object):
       reserved_tokens: List of string tokens that are guaranteed to be at the
         beginning of the subtoken vocabulary list.
       correct_strip: Whether to convert text to unicode before strip.
+      master_char_set: the char set.
 
     Returns:
       Subtokenizer object
     """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
@@ -113,7 +134,8 @@ class Subtokenizer(object):
       logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
+                                   master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -121,15 +143,18 @@ class Subtokenizer(object):
       logging.info("Generated vocabulary with %d subtokens." %
                    len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)
 
   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(
+        native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
      ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
+      assert EOS in self.subtoken_list, \
+          "Can't append 'EOS' because it is not in list of known subtokens."
      ret.append(EOS_ID)
    return ret
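A hedged end-to-end sketch of the `init_from_files` / `encode` path touched above; the import path, file names, and vocab-size values are illustrative assumptions. With `add_eos=True`, `encode()` now asserts that `EOS` is actually present in the loaded vocabulary before appending `EOS_ID`.

```python
# Import path assumed, as in the earlier sketch.
from official.nlp.transformer.utils import tokenizer

# Builds (or reloads) a subword vocabulary from hypothetical training files.
subtokenizer = tokenizer.Subtokenizer.init_from_files(
    vocab_file="vocab.ende.32768",
    files=["train.en", "train.de"],
    target_vocab_size=32768,
    threshold=327)

ids = subtokenizer.encode("Hello world", add_eos=True)
assert ids[-1] == tokenizer.EOS_ID  # would raise earlier if EOS were missing
print(subtokenizer.decode(ids[:-1]))  # expected to round-trip: "Hello world"
```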
@@ -162,13 +187,14 @@ class Subtokenizer(object):
           "Subtokens argument passed into decode() must be a list of integers.")
 
     return _unicode_to_native(
-        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+        _join_tokens_to_string(
+            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
 
   def _subtoken_ids_to_tokens(self, subtokens):
     """Convert list of int subtoken ids to a list of string tokens."""
     escaped_tokens = "".join([
-        self.subtoken_list[s] for s in subtokens
-        if s < len(self.subtoken_list)])
+        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
+    ])
     escaped_tokens = escaped_tokens.split("_")
 
     # All tokens in the vocabulary list have been escaped (see _escape_token())
@@ -205,7 +231,7 @@ def _load_vocab_file(vocab_file, reserved_tokens=None):
 def native_to_unicode(s):
   """Convert string to unicode (required in Python 2)."""
   try:  # Python 2
     return s if isinstance(s, unicode) else s.decode("utf-8")
   except NameError:  # Python 3
     return s
@@ -213,22 +239,22 @@ def native_to_unicode(s):
 def _unicode_to_native(s):
   """Convert string from unicode to native format (required in Python 2)."""
   try:  # Python 2
     return s.encode("utf-8") if isinstance(s, unicode) else s
   except NameError:  # Python 3
     return s
 
 
-def _split_string_to_tokens(text):
+def _split_string_to_tokens(text, master_char_set):
   """Splits text to a list of string tokens."""
   if not text:
     return []
   ret = []
   token_start = 0
   # Classify each character in the input string
-  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  is_master = [c in master_char_set for c in text]
   for pos in xrange(1, len(text)):
-    if is_alnum[pos] != is_alnum[pos - 1]:
+    if is_master[pos] != is_master[pos - 1]:
       token = text[token_start:pos]
       if token != u" " or token_start == 0:
         ret.append(token)
@@ -238,12 +264,12 @@ def _split_string_to_tokens(text):
   return ret
 
 
-def _join_tokens_to_string(tokens):
+def _join_tokens_to_string(tokens, master_char_set):
   """Join a list of string tokens into a single string."""
-  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  token_is_master = [t[0] in master_char_set for t in tokens]
   ret = []
   for i, token in enumerate(tokens):
-    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
       ret.append(u" ")
     ret.append(token)
   return "".join(ret)
@@ -325,7 +351,10 @@ def _unescape_token(token):
   return _UNESCAPE_REGEX.sub(match, token)
 
 
-def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
+def _count_tokens(files,
+                  file_byte_limit=1e6,
+                  correct_strip=True,
+                  master_char_set=None):
   """Return token counts of words in the files.
 
   Samples file_byte_limit bytes from each file, and counts the words that appear
@@ -338,11 +367,15 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
       vocabulary generation for PY2. Sets correct_strip to False in PY2 to
       reproduce previous common public result. Sets correct_strip to True will
       let PY2 and PY3 get a consistent vocabulary.
+    master_char_set: the char set.
 
   Returns:
     Dictionary mapping tokens to the number of times they appear in the sampled
     lines from the files.
   """
+  if master_char_set is None:
+    master_char_set = _ALPHANUMERIC_CHAR_SET
+
   token_counts = collections.defaultdict(int)
 
   for filepath in files:
@@ -363,7 +396,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
           counter = 0
 
           # Add words to token counts
-          for token in _split_string_to_tokens(native_to_unicode(line)):
+          for token in _split_string_to_tokens(
+              native_to_unicode(line), master_char_set):
             token_counts[token] += 1
   return token_counts
 
 
@@ -395,9 +429,12 @@ def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
   return ret
 
 
-def _generate_subtokens_with_target_vocab_size(
-    token_counts, alphabet, target_size, threshold, min_count=None,
-    reserved_tokens=None):
+def _generate_subtokens_with_target_vocab_size(token_counts,
+                                               alphabet,
+                                               target_size,
+                                               threshold,
+                                               min_count=None,
+                                               reserved_tokens=None):
   """Generate subtoken vocabulary close to the target size."""
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
@@ -450,8 +487,8 @@ def _generate_alphabet_dict(iterable, reserved_tokens=None):
   return alphabet
 
 
-def _count_and_gen_subtokens(
-    token_counts, alphabet, subtoken_dict, max_subtoken_length):
+def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
+                             max_subtoken_length):
   """Count number of times subtokens appear, and generate new subtokens.
 
   Args:
@@ -469,8 +506,8 @@ def _count_and_gen_subtokens(
   subtoken_counts = collections.defaultdict(int)
   for token, count in six.iteritems(token_counts):
     token = _escape_token(token, alphabet)
-    subtokens = _split_token_to_subtokens(
-        token, subtoken_dict, max_subtoken_length)
+    subtokens = _split_token_to_subtokens(token, subtoken_dict,
+                                          max_subtoken_length)
 
     # Generate new subtokens by taking substrings from token.
     start = 0
@@ -504,8 +541,10 @@ def _filter_and_bucket_subtokens(subtoken_counts, min_count):
   return subtoken_buckets
 
 
-def _gen_new_subtoken_list(
-    subtoken_counts, min_count, alphabet, reserved_tokens=None):
+def _gen_new_subtoken_list(subtoken_counts,
+                           min_count,
+                           alphabet,
+                           reserved_tokens=None):
   """Generate candidate subtokens ordered by count, and new max subtoken length.
 
   Add subtokens to the candiate list in order of length (longest subtokens
@@ -576,9 +615,11 @@ def _gen_new_subtoken_list(
   return subtoken_list, max_subtoken_length
 
 
-def _generate_subtokens(
-    token_counts, alphabet, min_count, num_iterations=4,
-    reserved_tokens=None):
+def _generate_subtokens(token_counts,
+                        alphabet,
+                        min_count,
+                        num_iterations=4,
+                        reserved_tokens=None):
   """Create a list of subtokens in decreasing order of frequency.
 
   Args:
@@ -610,8 +651,9 @@ def _generate_subtokens(
     # Create dict mapping subtoken->count, with additional subtokens created
     # from substrings taken from the tokens.
-    subtoken_counts = _count_and_gen_subtokens(
-        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    subtoken_counts = _count_and_gen_subtokens(token_counts, alphabet,
+                                               subtoken_dict,
+                                               max_subtoken_length)
 
     # Generate new list of subtokens sorted by subtoken count.
     subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
...