"doc/git@developer.sourcefind.cn:wqshmzh/ktransformers.git" did not exist on "4f4ed36442894fa8a0b2f228c8ec89453c740c8d"
Unverified commit a35e09d2, authored by Vinh Nguyen and committed by GitHub

Merge branch 'master' into amp_resnet50

parents d5722dcd 1f5a5e9d
@@ -21,6 +21,7 @@ import os
import time
from absl import flags
+import tensorflow as tf
from official.transformer.v2 import misc
from official.transformer.v2 import transformer_main as transformer_main
@@ -30,6 +31,7 @@ from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
TRANSFORMER_EN2DE_DATA_DIR_NAME = 'wmt32k-en2de-official'
EN2DE_2014_BLEU_DATA_DIR_NAME = 'newstest2014'
FLAGS = flags.FLAGS
+TMP_DIR = os.getenv('TMPDIR')
class TransformerBenchmark(PerfZeroBenchmark):
@@ -56,6 +58,11 @@ class TransformerBenchmark(PerfZeroBenchmark):
                                      EN2DE_2014_BLEU_DATA_DIR_NAME,
                                      'newstest2014.de')
+    if default_flags is None:
+      default_flags = {}
+    default_flags['data_dir'] = self.train_data_dir
+    default_flags['vocab_file'] = self.vocab_file
    super(TransformerBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
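The five added lines above fill benchmark-wide defaults into whatever `default_flags` dictionary a subclass passes up. A minimal, hypothetical stand-in for that pattern (plain Python, not the real PerfZeroBenchmark subclass; the attribute names only mirror the diff):

```python
# Hypothetical, simplified stand-in for the default_flags merge added above.
class BenchmarkBase:
  def __init__(self, train_data_dir, vocab_file, default_flags=None):
    if default_flags is None:          # callers may pass nothing at all
      default_flags = {}
    # Fill benchmark-wide defaults before handing the dict to the base class.
    default_flags['data_dir'] = train_data_dir
    default_flags['vocab_file'] = vocab_file
    self.default_flags = default_flags

bench = BenchmarkBase('/data/wmt32k-en2de-official',
                      '/data/wmt32k-en2de-official/vocab.ende.32768',
                      default_flags={'param_set': 'big'})
print(bench.default_flags)  # caller's flags plus data_dir and vocab_file
```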
@@ -280,8 +287,8 @@ class TransformerBigKerasAccuracy(TransformerBenchmark):
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
                                   log_steps=FLAGS.log_steps,
-                                   bleu_min=28,
-                                   bleu_max=29)
+                                   bleu_min=27.9,
+                                   bleu_max=29.2)
  def benchmark_8_gpu_static_batch(self):
    """Benchmark 8 gpu.
@@ -305,12 +312,19 @@ class TransformerBigKerasAccuracy(TransformerBenchmark):
    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
                                   log_steps=FLAGS.log_steps,
                                   bleu_min=28,
-                                   bleu_max=29)
+                                   bleu_max=29.2)
  def benchmark_8_gpu_fp16(self):
    """Benchmark 8 gpu with dynamic batch and fp16.
-    Should converge to 28.4 BLEU (uncased). This has not be verified yet.
+    Over 6 runs with eval every 20K steps the average highest value was 28.247
+    (bleu uncased). 28.424 was the highest and 28.09 the lowest. The values are
+    the highest value seen during a run and occurred at a median of iteration
+    11. While this could be interpreted as worse than FP32, if looking at the
+    first iteration at which 28 is passed FP16 performs equal and possibly
+    better. Although not part of the initial test runs, the highest value
+    recorded with the arguments below was 28.9 at iteration 12. Iterations are
+    not epochs, an iteration is a number of steps between evals.
    """
    self._setup()
    FLAGS.num_gpus = 8
@@ -328,7 +342,7 @@ class TransformerBigKerasAccuracy(TransformerBenchmark):
    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
                                   log_steps=FLAGS.log_steps,
                                   bleu_min=28,
-                                   bleu_max=29)
+                                   bleu_max=29.2)
  def benchmark_8_gpu_fp16_amp(self):
    """Benchmark 8 gpu with dynamic batch and fp16 with automatic mixed precision.
@@ -377,7 +391,7 @@ class TransformerBigKerasAccuracy(TransformerBenchmark):
    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
                                   log_steps=FLAGS.log_steps,
                                   bleu_min=28,
-                                   bleu_max=29)
+                                   bleu_max=29.2)
  def benchmark_xla_8_gpu_static_batch_fp16(self):
    """Benchmark 8 gpu with static batch, XLA, and FP16.
@@ -404,7 +418,7 @@ class TransformerBigKerasAccuracy(TransformerBenchmark):
    self._run_and_report_benchmark(total_batch_size=FLAGS.batch_size,
                                   log_steps=FLAGS.log_steps,
                                   bleu_min=28,
-                                   bleu_max=29)
+                                   bleu_max=29.2)
class TransformerKerasBenchmark(TransformerBenchmark):
@@ -635,17 +649,9 @@ class TransformerKerasBenchmark(TransformerBenchmark):
class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
  """Transformer based version real data benchmark tests."""
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    train_data_dir = os.path.join(root_data_dir,
-                                  TRANSFORMER_EN2DE_DATA_DIR_NAME)
-    vocab_file = os.path.join(root_data_dir,
-                              TRANSFORMER_EN2DE_DATA_DIR_NAME,
-                              'vocab.ende.32768')
+  def __init__(self, output_dir=TMP_DIR, root_data_dir=None, **kwargs):
    def_flags = {}
    def_flags['param_set'] = 'base'
-    def_flags['vocab_file'] = vocab_file
-    def_flags['data_dir'] = train_data_dir
    def_flags['train_steps'] = 200
    def_flags['log_steps'] = 10
@@ -657,20 +663,16 @@ class TransformerBaseKerasBenchmarkReal(TransformerKerasBenchmark):
class TransformerBigKerasBenchmarkReal(TransformerKerasBenchmark):
  """Transformer based version real data benchmark tests."""
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    train_data_dir = os.path.join(root_data_dir,
-                                  TRANSFORMER_EN2DE_DATA_DIR_NAME)
-    vocab_file = os.path.join(root_data_dir,
-                              TRANSFORMER_EN2DE_DATA_DIR_NAME,
-                              'vocab.ende.32768')
+  def __init__(self, output_dir=TMP_DIR, root_data_dir=None, **kwargs):
    def_flags = {}
    def_flags['param_set'] = 'big'
-    def_flags['vocab_file'] = vocab_file
-    def_flags['data_dir'] = train_data_dir
    def_flags['train_steps'] = 200
    def_flags['log_steps'] = 10
    super(TransformerBigKerasBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags,
        root_data_dir=root_data_dir, batch_per_gpu=3072)
+if __name__ == '__main__':
+  tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for layers in Transformer.""" """Tests for layers in Transformer."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -79,4 +93,5 @@ class TransformerLayersTest(tf.test.TestCase): ...@@ -79,4 +93,5 @@ class TransformerLayersTest(tf.test.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
tf.compat.v1.enable_v2_behavior()
tf.test.main() tf.test.main()
@@ -52,18 +52,40 @@ BLEU_DIR = "bleu"
_SINGLE_SAMPLE = 1
-def translate_and_compute_bleu(model, subtokenizer, bleu_source, bleu_ref):
-  """Translate file and report the cased and uncased bleu scores."""
+def translate_and_compute_bleu(model,
+                               params,
+                               subtokenizer,
+                               bleu_source,
+                               bleu_ref,
+                               distribution_strategy=None):
+  """Translate file and report the cased and uncased bleu scores.
+  Args:
+    model: A Keras model, used to generate the translations.
+    params: A dictionary, containing the translation related parameters.
+    subtokenizer: A subtokenizer object, used for encoding and decoding source
+      and translated lines.
+    bleu_source: A file containing source sentences for translation.
+    bleu_ref: A file containing the reference for the translated sentences.
+    distribution_strategy: A platform distribution strategy, used for TPU based
+      translation.
+  Returns:
+    uncased_score: A float, the case insensitive BLEU score.
+    cased_score: A float, the case sensitive BLEU score.
+  """
  # Create temporary file to store translation.
  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp_filename = tmp.name
  translate.translate_file(
      model,
+      params,
      subtokenizer,
      bleu_source,
      output_file=tmp_filename,
-      print_all_translations=False)
+      print_all_translations=False,
+      distribution_strategy=distribution_strategy)
  # Compute uncased and cased bleu scores.
  uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
@@ -72,12 +94,31 @@ def translate_and_compute_bleu(model, subtokenizer, bleu_source, bleu_ref):
  return uncased_score, cased_score
-def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
-  """Calculate and record the BLEU score."""
+def evaluate_and_log_bleu(model,
+                          params,
+                          bleu_source,
+                          bleu_ref,
+                          vocab_file,
+                          distribution_strategy=None):
+  """Calculate and record the BLEU score.
+  Args:
+    model: A Keras model, used to generate the translations.
+    params: A dictionary, containing the translation related parameters.
+    bleu_source: A file containing source sentences for translation.
+    bleu_ref: A file containing the reference for the translated sentences.
+    vocab_file: A file containing the vocabulary for translation.
+    distribution_strategy: A platform distribution strategy, used for TPU based
+      translation.
+  Returns:
+    uncased_score: A float, the case insensitive BLEU score.
+    cased_score: A float, the case sensitive BLEU score.
+  """
  subtokenizer = tokenizer.Subtokenizer(vocab_file)
  uncased_score, cased_score = translate_and_compute_bleu(
-      model, subtokenizer, bleu_source, bleu_ref)
+      model, params, subtokenizer, bleu_source, bleu_ref, distribution_strategy)
  logging.info("Bleu score (uncased): %s", uncased_score)
  logging.info("Bleu score (cased): %s", cased_score)
@@ -110,6 +151,9 @@ class TransformerTask(object):
    params["model_dir"] = flags_obj.model_dir
    params["static_batch"] = flags_obj.static_batch
    params["max_length"] = flags_obj.max_length
+    params["decode_batch_size"] = flags_obj.decode_batch_size
+    params["decode_max_length"] = flags_obj.decode_max_length
+    params["padded_decode"] = flags_obj.padded_decode
    params["num_parallel_calls"] = (
        flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE)
@@ -124,8 +168,10 @@ class TransformerTask(object):
      # like this. What if multiple instances of TransformerTask are created?
      # We should have a better way in the tf.keras.mixed_precision API of doing
      # this.
+      loss_scale = flags_core.get_loss_scale(flags_obj,
+                                             default_for_fp16="dynamic")
      policy = tf.keras.mixed_precision.experimental.Policy(
-          "infer_float32_vars")
+          "mixed_float16", loss_scale=loss_scale)
      tf.keras.mixed_precision.experimental.set_policy(policy)
    self.distribution_strategy = distribution_utils.get_distribution_strategy(
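The hunk above replaces the old "infer_float32_vars" policy with an explicit mixed_float16 policy that carries a loss scale. A minimal sketch of the same setup in isolation, assuming the experimental mixed-precision API of the TensorFlow version this code targets; the flag plumbing is replaced by a literal value:

```python
import tensorflow as tf

# Sketch of the mixed-precision setup this hunk switches to ("dynamic" is the
# fp16 default used above; real code reads it from flags).
loss_scale = "dynamic"
policy = tf.keras.mixed_precision.experimental.Policy(
    "mixed_float16", loss_scale=loss_scale)
tf.keras.mixed_precision.experimental.set_policy(policy)

# Layers built after this point compute in float16 but keep float32 variables;
# the dynamic loss scale guards the fp16 gradients against underflow.
print(tf.keras.mixed_precision.experimental.global_policy())
```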
@@ -133,6 +179,7 @@ class TransformerTask(object):
        num_gpus=num_gpus,
        tpu_address=flags_obj.tpu or "")
    if self.use_tpu:
+      params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync
      if not params["static_batch"]:
        raise ValueError("TPU requires static batch for input data.")
    else:
@@ -306,10 +353,10 @@ class TransformerTask(object):
        self.predict_model,
        tf.train.latest_checkpoint(self.flags_obj.model_dir))
    self.predict_model.summary()
-    return evaluate_and_log_bleu(self.predict_model,
-                                 self.flags_obj.bleu_source,
-                                 self.flags_obj.bleu_ref,
-                                 self.flags_obj.vocab_file)
+    return evaluate_and_log_bleu(
+        self.predict_model, self.params, self.flags_obj.bleu_source,
+        self.flags_obj.bleu_ref, self.flags_obj.vocab_file,
+        self.distribution_strategy if self.use_tpu else None)
  def predict(self):
    """Predicts result from the model."""
@@ -372,6 +419,7 @@ class TransformerTask(object):
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])
    if params["dtype"] == tf.float16:
      opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          opt, loss_scale=flags_core.get_loss_scale(self.flags_obj,
......
@@ -20,6 +20,7 @@ from __future__ import print_function
import os
import re
+import sys
import unittest
from absl import flags
@@ -178,10 +179,13 @@ class TransformerTaskTest(tf.test.TestCase):
  def test_eval(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    if 'test_xla' in sys.argv[0]:
+      self.skipTest('TODO(xla): Make this test faster under XLA.')
    self._prepare_files_and_flags()
    t = tm.TransformerTask(FLAGS)
    t.eval()
if __name__ == '__main__':
+  tf.compat.v1.enable_v2_behavior()
  tf.test.main()
@@ -65,4 +65,5 @@ class TransformerV2Test(tf.test.TestCase):
if __name__ == "__main__":
+  tf.compat.v1.enable_v2_behavior()
  tf.test.main()
@@ -18,11 +18,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import numpy as np
import tensorflow as tf
+from tensorflow.python.distribute import values
from official.transformer.utils import tokenizer
-_DECODE_BATCH_SIZE = 32
_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6
@@ -68,23 +69,31 @@ def _trim_and_decode(ids, subtokenizer):
  return subtokenizer.decode(ids)
-def translate_file(
-    model, subtokenizer, input_file, output_file=None,
-    print_all_translations=True):
+def translate_file(model,
+                   params,
+                   subtokenizer,
+                   input_file,
+                   output_file=None,
+                   print_all_translations=True,
+                   distribution_strategy=None):
  """Translate lines in file, and save to output file if specified.
  Args:
-    model: Keras model used to generate the translations.
-    subtokenizer: Subtokenizer object for encoding and decoding source and
-      translated lines.
-    input_file: file containing lines to translate
-    output_file: file that stores the generated translations.
-    print_all_translations: If true, all translations are printed to stdout.
+    model: A Keras model, used to generate the translations.
+    params: A dictionary, containing the translation related parameters.
+    subtokenizer: A subtokenizer object, used for encoding and decoding source
+      and translated lines.
+    input_file: A file containing lines to translate.
+    output_file: A file that stores the generated translations.
+    print_all_translations: A bool. If true, all translations are printed to
+      stdout.
+    distribution_strategy: A distribution strategy, used to perform inference
+      directly with tf.function instead of Keras model.predict().
  Raises:
    ValueError: if output file is invalid.
  """
-  batch_size = _DECODE_BATCH_SIZE
+  batch_size = params["decode_batch_size"]
  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
@@ -101,24 +110,59 @@ def translate_file(
          if j + i * batch_size < total_samples
      ]
      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
+      if distribution_strategy:
+        for j in range(batch_size - len(lines)):
+          lines.append([tokenizer.EOS_ID])
      batch = tf.keras.preprocessing.sequence.pad_sequences(
-          lines, dtype="int64", padding="post")
+          lines,
+          maxlen=params["decode_max_length"],
+          dtype="int32",
+          padding="post")
      tf.compat.v1.logging.info("Decoding batch %d out of %d.", i,
                                num_decode_batches)
      yield batch
+  @tf.function
+  def predict_step(inputs):
+    """Decoding step function for TPU runs."""
+    def _step_fn(inputs):
+      """Per replica step function."""
+      val_outputs, _ = model([inputs], training=False)
+      return val_outputs
+    return distribution_strategy.experimental_run_v2(_step_fn, args=(inputs,))
  translations = []
+  if distribution_strategy:
+    num_replicas = distribution_strategy.num_replicas_in_sync
+    local_batch_size = params["decode_batch_size"] // num_replicas
  for i, text in enumerate(input_generator()):
-    val_outputs, _ = model.predict(text)
+    if distribution_strategy:
+      text = np.reshape(text, [num_replicas, local_batch_size, -1])
+      text = [
+          tf.convert_to_tensor(per_replica_text) for per_replica_text in text
+      ]
+      # pylint: disable=protected-access
+      text = values.PerReplica(distribution_strategy.extended._device_map, text)
+      # pylint: enable=protected-access
+      val_outputs = distribution_strategy.experimental_local_results(
+          predict_step(text))
+      val_outputs = np.reshape(
+          [val_output.numpy() for val_output in val_outputs],
+          [params["decode_batch_size"], -1])
+    else:
+      val_outputs, _ = model.predict(text)
    length = len(val_outputs)
    for j in range(length):
-      translation = _trim_and_decode(val_outputs[j], subtokenizer)
-      translations.append(translation)
-      if print_all_translations:
-        tf.compat.v1.logging.info(
-            "Translating:\n\tInput: %s\n\tOutput: %s" %
-            (sorted_inputs[j + i * batch_size], translation))
+      if j + i * batch_size < total_samples:
+        translation = _trim_and_decode(val_outputs[j], subtokenizer)
+        translations.append(translation)
+        if print_all_translations:
+          tf.compat.v1.logging.info(
+              "Translating:\n\tInput: %s\n\tOutput: %s" %
+              (sorted_inputs[j + i * batch_size], translation))
  # Write translations in the order they appeared in the original file.
  if output_file is not None:
......
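The distribution-strategy branch added to translate_file above pads the final batch up to the full `decode_batch_size` and then splits it evenly across replicas before calling the tf.function decode step. A small, self-contained sketch of just that reshape arithmetic (pure NumPy, with made-up sizes; the real code feeds the per-replica slices into the strategy's run call):

```python
import numpy as np

# Toy illustration of the per-replica split used in translate_file above.
# Sizes are placeholders: 8 decoded sequences, padded to length 6, 4 replicas.
decode_batch_size, decode_max_length, num_replicas = 8, 6, 4
local_batch_size = decode_batch_size // num_replicas  # 2 sequences per replica

batch = np.zeros([decode_batch_size, decode_max_length], dtype=np.int32)
per_replica = np.reshape(batch, [num_replicas, local_batch_size, -1])
print(per_replica.shape)  # (4, 2, 6): one [local_batch, length] slice per replica

# After decoding, the per-replica outputs are flattened back into one batch:
merged = np.reshape(per_replica, [decode_batch_size, -1])
print(merged.shape)  # (8, 6)
```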
@@ -53,9 +53,9 @@ def get_loss_scale(flags_obj, default_for_fp16):
  return default_for_fp16
-def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
-                       synthetic_data=True, max_train_steps=False, dtype=True,
-                       all_reduce_alg=True, num_packs=True,
+def define_performance(num_parallel_calls=False, inter_op=False, intra_op=False,
+                       synthetic_data=False, max_train_steps=False, dtype=False,
+                       all_reduce_alg=False, num_packs=False,
                       tf_gpu_thread_mode=False,
                       datasets_num_private_threads=False,
                       datasets_num_parallel_batches=False,
......
@@ -23,7 +23,10 @@ from official.utils.flags import core as flags_core  # pylint: disable=g-bad-imp
def define_flags():
  flags_core.define_base(num_gpu=False)
-  flags_core.define_performance(dynamic_loss_scale=True, loss_scale=True)
+  flags_core.define_performance(
+      num_parallel_calls=True, inter_op=True, intra_op=True,
+      dynamic_loss_scale=True, loss_scale=True, synthetic_data=True,
+      dtype=True)
  flags_core.define_image()
  flags_core.define_benchmark()
......
[MESSAGES CONTROL]
-disable=R,W,bad-option-value,trailing-newlines
+disable=R,W,bad-option-value,trailing-newlines,no-name-in-module
[REPORTS]
# Tells whether to display a full report or only the messages
......
@@ -18,20 +18,20 @@ official.resnet`.
Download and extract the CIFAR-10 data. You can use the following script:
```bash
-python cifar10_download_and_extract.py
+python ../../r1/resnet/cifar10_download_and_extract.py
```
After you download the data, you can run the program by:
```bash
-python keras_cifar_main.py
+python resnet_cifar_main.py
```
If you did not use the default directory to download the data, specify the
location with the `--data_dir` flag, like:
```bash
-python keras_cifar_main.py --data_dir=/path/to/cifar
+python resnet_cifar_main.py --data_dir=/path/to/cifar
```
## ImageNet
@@ -44,14 +44,14 @@ provide a few options.
Once your dataset is ready, you can begin training the model as follows:
```bash
-python keras_imagenet_main.py
+python resnet_imagenet_main.py
```
Again, if you did not download the data to the default directory, specify the
location with the `--data_dir` flag:
```bash
-python keras_imagenet_main.py --data_dir=/path/to/imagenet
+python resnet_imagenet_main.py --data_dir=/path/to/imagenet
```
There are more flag options you can specify. Here are some examples:
@@ -70,16 +70,16 @@ For example, this is a typical command line to run with ImageNet data with
batch size 128 per GPU:
```bash
-python -m keras_imagenet_main \
+python -m resnet_imagenet_main \
    --model_dir=/tmp/model_dir/something \
    --num_gpus=2 \
    --batch_size=128 \
    --train_epochs=90 \
    --train_steps=10 \
    --use_synthetic_data=false
```
-See [`keras_common.py`](keras_common.py) for full list of options.
+See [`common.py`](common.py) for full list of options.
## Using multiple GPUs
You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
......
@@ -249,6 +249,10 @@ def define_keras_flags(dynamic_loss_scale=True):
  """Define flags for Keras models."""
  flags_core.define_base(run_eagerly=True)
  flags_core.define_performance(num_parallel_calls=False,
+                                synthetic_data=True,
+                                dtype=True,
+                                all_reduce_alg=True,
+                                num_packs=True,
                                tf_gpu_thread_mode=True,
                                datasets_num_private_threads=True,
                                dynamic_loss_scale=dynamic_loss_scale,
......
@@ -31,7 +31,7 @@ from official.utils.misc import model_helpers
from official.vision.image_classification import common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_model
-from official.vision.image_classification import trivial_model
+from official.benchmark.models import trivial_model
LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
@@ -184,6 +184,7 @@ def run(flags_obj):
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale=flags_core.get_loss_scale(flags_obj,
                                                        default_for_fp16=128))
  if flags_obj.fp16_implementation == "graph_rewrite":
    # Note: when flags_obj.fp16_implementation == "graph_rewrite",
    # dtype as determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
@@ -191,6 +192,7 @@ def run(flags_obj):
    # do not double up.
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
+  # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
  if flags_obj.use_trivial_model:
    model = trivial_model.trivial_model(
        imagenet_preprocessing.NUM_CLASSES, dtype)
......
@@ -28,7 +28,7 @@ from __future__ import division
from __future__ import print_function
from tensorflow.python.keras import backend
from tensorflow.python.keras import initializers
from tensorflow.python.keras import layers
from tensorflow.python.keras import models
from tensorflow.python.keras import regularizers
@@ -39,7 +39,16 @@ BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
-def identity_block(input_tensor, kernel_size, filters, stage, block):
+def _gen_l2_regularizer(use_l2_regularizer=True):
+  return regularizers.l2(L2_WEIGHT_DECAY) if use_l2_regularizer else None
+def identity_block(input_tensor,
+                   kernel_size,
+                   filters,
+                   stage,
+                   block,
+                   use_l2_regularizer=True):
  """The identity block is the block that has no conv layer at shortcut.
  Args:
@@ -48,6 +57,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
    filters: list of integers, the filters of 3 conv layer at main path
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
  Returns:
    Output tensor for the block.
@@ -60,35 +70,51 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
  conv_name_base = 'res' + str(stage) + block + '_branch'
  bn_name_base = 'bn' + str(stage) + block + '_branch'
-  x = layers.Conv2D(filters1, (1, 1), use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2a')(input_tensor)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2a')(x)
+  x = layers.Conv2D(
+      filters1, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2a')(
+          input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2a')(
+          x)
  x = layers.Activation('relu')(x)
-  x = layers.Conv2D(filters2, kernel_size,
-                    padding='same', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2b')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2b')(x)
+  x = layers.Conv2D(
+      filters2,
+      kernel_size,
+      padding='same',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2b')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2b')(
+          x)
  x = layers.Activation('relu')(x)
-  x = layers.Conv2D(filters3, (1, 1), use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2c')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2c')(x)
+  x = layers.Conv2D(
+      filters3, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2c')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2c')(
+          x)
  x = layers.add([x, input_tensor])
  x = layers.Activation('relu')(x)
@@ -100,7 +126,8 @@ def conv_block(input_tensor,
               filters,
               stage,
               block,
-               strides=(2, 2)):
+               strides=(2, 2),
+               use_l2_regularizer=True):
  """A block that has a conv layer at shortcut.
  Note that from stage 3,
@@ -114,6 +141,7 @@ def conv_block(input_tensor,
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
    strides: Strides for the second conv layer in the block.
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
  Returns:
    Output tensor for the block.
@@ -126,114 +154,231 @@ def conv_block(input_tensor,
  conv_name_base = 'res' + str(stage) + block + '_branch'
  bn_name_base = 'bn' + str(stage) + block + '_branch'
-  x = layers.Conv2D(filters1, (1, 1), use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2a')(input_tensor)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2a')(x)
+  x = layers.Conv2D(
+      filters1, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2a')(
+          input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2a')(
+          x)
  x = layers.Activation('relu')(x)
-  x = layers.Conv2D(filters2, kernel_size, strides=strides, padding='same',
-                    use_bias=False, kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2b')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2b')(x)
+  x = layers.Conv2D(
+      filters2,
+      kernel_size,
+      strides=strides,
+      padding='same',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2b')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2b')(
+          x)
  x = layers.Activation('relu')(x)
-  x = layers.Conv2D(filters3, (1, 1), use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name=conv_name_base + '2c')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name=bn_name_base + '2c')(x)
-  shortcut = layers.Conv2D(filters3, (1, 1), strides=strides, use_bias=False,
-                           kernel_initializer='he_normal',
-                           kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                           name=conv_name_base + '1')(input_tensor)
-  shortcut = layers.BatchNormalization(axis=bn_axis,
-                                       momentum=BATCH_NORM_DECAY,
-                                       epsilon=BATCH_NORM_EPSILON,
-                                       name=bn_name_base + '1')(shortcut)
+  x = layers.Conv2D(
+      filters3, (1, 1),
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '2c')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2c')(
+          x)
+  shortcut = layers.Conv2D(
+      filters3, (1, 1),
+      strides=strides,
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name=conv_name_base + '1')(
+          input_tensor)
+  shortcut = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '1')(
+          shortcut)
  x = layers.add([x, shortcut])
  x = layers.Activation('relu')(x)
  return x
-def resnet50(num_classes, dtype='float32', batch_size=None):
+def resnet50(num_classes,
+             dtype='float32',
+             batch_size=None,
+             use_l2_regularizer=True):
  """Instantiates the ResNet50 architecture.
  Args:
    num_classes: `int` number of classes for image classification.
    dtype: dtype to use float32 or float16 are most common.
    batch_size: Size of the batches for each step.
+    use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
  Returns:
    A Keras model instance.
  """
  input_shape = (224, 224, 3)
-  img_input = layers.Input(shape=input_shape, dtype=dtype,
-                           batch_size=batch_size)
+  img_input = layers.Input(
+      shape=input_shape, dtype=dtype, batch_size=batch_size)
  if backend.image_data_format() == 'channels_first':
-    x = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
-                      name='transpose')(img_input)
+    x = layers.Lambda(
+        lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
+        name='transpose')(
+            img_input)
    bn_axis = 1
  else:  # channels_last
    x = img_input
    bn_axis = 3
  x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
-  x = layers.Conv2D(64, (7, 7),
-                    strides=(2, 2),
-                    padding='valid', use_bias=False,
-                    kernel_initializer='he_normal',
-                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-                    name='conv1')(x)
-  x = layers.BatchNormalization(axis=bn_axis,
-                                momentum=BATCH_NORM_DECAY,
-                                epsilon=BATCH_NORM_EPSILON,
-                                name='bn_conv1')(x)
+  x = layers.Conv2D(
+      64, (7, 7),
+      strides=(2, 2),
+      padding='valid',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='conv1')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name='bn_conv1')(
+          x)
  x = layers.Activation('relu')(x)
  x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
-  x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
-  x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
-  x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
-  x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='b')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='c')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='d')
-  x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f')
-  x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
-  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
-  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
+  x = conv_block(
+      x,
+      3, [64, 64, 256],
+      stage=2,
+      block='a',
+      strides=(1, 1),
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [64, 64, 256],
+      stage=2,
+      block='b',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [64, 64, 256],
+      stage=2,
+      block='c',
+      use_l2_regularizer=use_l2_regularizer)
+  x = conv_block(
+      x,
+      3, [128, 128, 512],
+      stage=3,
+      block='a',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [128, 128, 512],
+      stage=3,
+      block='b',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [128, 128, 512],
+      stage=3,
+      block='c',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [128, 128, 512],
+      stage=3,
+      block='d',
+      use_l2_regularizer=use_l2_regularizer)
+  x = conv_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='a',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='b',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='c',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='d',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='e',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [256, 256, 1024],
+      stage=4,
+      block='f',
+      use_l2_regularizer=use_l2_regularizer)
+  x = conv_block(
+      x,
+      3, [512, 512, 2048],
+      stage=5,
+      block='a',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [512, 512, 2048],
+      stage=5,
+      block='b',
+      use_l2_regularizer=use_l2_regularizer)
+  x = identity_block(
+      x,
+      3, [512, 512, 2048],
+      stage=5,
+      block='c',
+      use_l2_regularizer=use_l2_regularizer)
  rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
  x = layers.Dense(
      num_classes,
      kernel_initializer=initializers.RandomNormal(stddev=0.01),
-      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-      bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
-      name='fc1000')(x)
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='fc1000')(
+          x)
  # TODO(reedwm): Remove manual casts once mixed precision can be enabled with a
  # single line of code.
......
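The resnet_model.py change above threads a single switch, `use_l2_regularizer`, through every layer so that L2 weight decay can be turned off wholesale via the `_gen_l2_regularizer` helper. A small, self-contained sketch of that toggle pattern (generic Keras Dense layer and a placeholder weight-decay constant; only the helper's shape mirrors the diff):

```python
import tensorflow as tf
from tensorflow.keras import layers, regularizers

L2_WEIGHT_DECAY = 1e-4  # placeholder; the real constant lives in resnet_model.py

def _gen_l2_regularizer(use_l2_regularizer=True):
  # Return an L2 regularizer, or None to disable weight decay entirely.
  return regularizers.l2(L2_WEIGHT_DECAY) if use_l2_regularizer else None

def make_head(num_classes, use_l2_regularizer=True):
  # Every layer consults the same helper, so one flag flips regularization
  # for the whole model.
  return layers.Dense(
      num_classes,
      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
      bias_regularizer=_gen_l2_regularizer(use_l2_regularizer))

print(make_head(10, use_l2_regularizer=False).kernel_regularizer)  # None
```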
@@ -90,6 +90,7 @@ http_archive(
    sha256 = "79d102c61e2a479a0b7e5fc167bcfaa4832a0c6aad4a75fa7da0480564931bcc",
)
+# Needed by TensorFlow
http_archive(
    name = "io_bazel_rules_closure",
......