Unverified Commit 51e60bab authored by Ayushman Kumar, committed by GitHub

Merge pull request #3 from tensorflow/master

Updated
parents 7653185e 7d86c317
@@ -6,8 +6,6 @@ The [official models](official) are a collection of example models that use Tens
The [research models](https://github.com/tensorflow/models/tree/master/research) are a large collection of models implemented in TensorFlow by researchers. They are not officially supported or available in release branches; it is up to the individual researchers to maintain the models and/or provide support on issues and pull requests.
The [samples folder](samples) contains code snippets and smaller models that demonstrate features of TensorFlow, including code presented in various blog posts.
The [tutorials folder](tutorials) is a collection of models described in the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
## Contribution guidelines
......
@@ -163,21 +163,6 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path(self):
"""No dist strat forced v1 execution path."""
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
@@ -261,17 +246,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_force_v1_path(self):
"""Test 1 gpu using forced v1 execution path."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_force_v1_path')
FLAGS.batch_size = 128
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Test 1 gpu graph."""
self._setup()
@@ -316,33 +290,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path(self):
"""No dist strat but forced v1 execution path."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly(self):
"""Forced v1 execution path and forced eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test 2 gpu."""
self._setup()
@@ -409,19 +356,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_force_v1_path(self):
"""Test cpu without dist strat and force v1 in model.compile."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_force_v1_path')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_graph_cpu_no_dist_strat(self):
"""Test cpu graph mode without distribution strategies."""
self._setup()
......
@@ -205,24 +205,12 @@ def run(flags_obj):
with strategy_scope:
optimizer = common.get_optimizer(lr_schedule)
model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
# TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
# a valid arg for this model. Also remove as a valid flag.
if flags_obj.force_v2_in_keras_compile is not None:
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optimizer,
      metrics=(['sparse_categorical_accuracy']
               if flags_obj.report_accuracy_metrics else None),
      run_eagerly=flags_obj.run_eagerly,
      experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
else:
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optimizer,
      metrics=(['sparse_categorical_accuracy']
               if flags_obj.report_accuracy_metrics else None),
      run_eagerly=flags_obj.run_eagerly)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=(['sparse_categorical_accuracy']
             if flags_obj.report_accuracy_metrics else None),
    run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
......
@@ -142,25 +142,12 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_force_v1_path_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
@@ -174,13 +161,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_force_v1_path_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.enable_xla = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
@@ -233,14 +213,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_force_v1_path_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
@@ -353,20 +325,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_force_v1_path_mlperf_like(self):
"""8 GPU using keras fit/compile v1 codepath."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
......
@@ -150,7 +150,9 @@ def run_customized_training_loop(
and model variables pairs as input, manipulate them, and returns a new
gradients and model variables pairs. The callback functions will be
invoked in the list order and before gradients are allreduced.
With mixed precision training, the pre_allreduce_callbacks will be
applied on scaled_gradients. Default is no callbacks.
Only used when explicit_allreduce=True.
post_allreduce_callbacks: A list of callback functions that takes
gradients and model variables pairs as input, manipulate them, and
returns a new gradients and model variables pairs. The callback
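For reference, each callback in these lists is just a function that maps a list of (gradient, variable) pairs to a new list of the same shape. A minimal, self-contained sketch modeled on the `clip_by_global_norm_callback` used later in this change (the toy tensors are illustrative only):

```python
import tensorflow as tf

def clip_by_global_norm_callback(grads_and_vars):
  """Illustrative callback: clip gradients by global norm, keep variables paired."""
  grads, variables = zip(*grads_and_vars)
  (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
  return list(zip(clipped_grads, variables))

# Toy usage: two (gradient, variable) pairs.
v1, v2 = tf.Variable([1.0, 2.0]), tf.Variable([3.0])
g1, g2 = tf.constant([10.0, 10.0]), tf.constant([10.0])
clipped = clip_by_global_norm_callback([(g1, v1), (g2, v2)])
```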
@@ -393,8 +395,8 @@ def run_customized_training_loop(
train_steps(train_iterator,
            tf.convert_to_tensor(steps, dtype=tf.int32))
train_loss = _float_metric_value(train_loss_metric)
_run_callbacks_on_batch_end(current_step, {'loss': train_loss})
current_step += steps
_run_callbacks_on_batch_end(current_step - 1, {'loss': train_loss})
# Updates training logging.
training_status = 'Train Step: %d/%d / loss = %s' % (
......
# TensorFlow Natural Language Processing Modelling Toolkit
tensorflow/models/official/nlp provides a [modeling library](modeling) for constructing
NLP model architectures, as well as TF2 reference implementations for
state-of-the-art models.
The repository contains the following models, with implementations, pre-trained
model weights, usage scripts and conversion utilities:
* [Bert](bert)
* [Albert](albert)
* [XLNet](xlnet)
* [Transformer for translation](transformer)
Additional features:
* Distributed training on both multi-GPU and TPU
* End-to-end training for custom models, including both pretraining and finetuning.
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gin
import tensorflow as tf
import tensorflow_hub as hub
@@ -85,16 +86,46 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
return final_loss
@gin.configurable
def get_transformer_encoder(bert_config,
                            sequence_length,
                            transformer_encoder_cls=None):
"""Gets a 'TransformerEncoder' object.
Args:
bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
sequence_length: Maximum sequence length of the training data.
transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the
default BERT encoder implementation.
Returns:
A networks.TransformerEncoder object.
"""
if transformer_encoder_cls is not None:
# TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
embedding_cfg = dict(
vocab_size=bert_config.vocab_size,
type_vocab_size=bert_config.type_vocab_size,
hidden_size=bert_config.hidden_size,
seq_length=sequence_length,
max_seq_length=bert_config.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
dropout_rate=bert_config.hidden_dropout_prob,
)
hidden_cfg = dict(
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
)
kwargs = dict(embedding_cfg=embedding_cfg, hidden_cfg=hidden_cfg,
num_hidden_instances=bert_config.num_hidden_layers,)
# Relies on gin configuration to define the Transformer encoder arguments.
return transformer_encoder_cls(**kwargs)
kwargs = dict(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
......
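Because `get_transformer_encoder` is now gin-configurable, the encoder class can be swapped from a gin binding instead of a code change. A hedged sketch (the configurable names and scoping here are assumptions; they depend on how gin registers `EncoderScaffold`, which this change also decorates with `@gin.configurable`):

```python
import gin

# Illustrative binding: route get_transformer_encoder to EncoderScaffold
# instead of the default BERT encoder. The remaining EncoderScaffold
# arguments are expected to come from gin as well, as the TODO above notes.
gin.parse_config("""
get_transformer_encoder.transformer_encoder_cls = @EncoderScaffold
""")
```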
@@ -20,6 +20,14 @@ import tensorflow as tf
from official.utils.flags import core as flags_core
def define_gin_flags():
"""Define common gin configurable flags."""
flags.DEFINE_multi_string('gin_file', None,
'List of paths to the config files.')
flags.DEFINE_multi_string(
'gin_param', None, 'Newline separated list of Gin parameter bindings.')
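A hedged usage sketch of these two flags, mirroring the call added to run_pretraining.py later in this change (the command-line paths and binding string are illustrative, not canonical):

```python
# Typical invocation (illustrative):
#   python run_pretraining.py \
#     --gin_file=/path/to/experiment.gin \
#     --gin_param="get_transformer_encoder.transformer_encoder_cls = @EncoderScaffold"
import gin
from absl import flags

FLAGS = flags.FLAGS  # assumes define_gin_flags() ran and absl parsed argv

def apply_gin_flags():
  # Mirrors the call added in run_pretraining.py in this change.
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
```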
def define_common_bert_flags():
"""Define common flags for BERT tasks."""
flags_core.define_base(
@@ -69,8 +77,6 @@ def define_common_bert_flags():
flags.DEFINE_bool('hub_module_trainable', True,
'True to make keras layers in the hub module trainable.')
flags_core.define_log_steps()
# Adds flags for mixed precision and multi-worker training.
flags_core.define_performance(
num_parallel_calls=False,
......
@@ -169,7 +169,7 @@ def run_bert_classifier(strategy,
epochs,
steps_per_epoch,
eval_steps,
custom_callbacks=None)
# Use user-defined loop to start training.
logging.info('Training using customized training loop TF 2.0 with '
@@ -363,15 +363,6 @@ def run_bert(strategy,
if not strategy:
raise ValueError('Distribution strategy has not been specified.')
if FLAGS.log_steps:
custom_callbacks = [keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir,
)]
else:
custom_callbacks = None
trained_model = run_bert_classifier(
strategy,
model_config,
@@ -387,8 +378,7 @@ def run_bert(strategy,
train_input_fn,
eval_input_fn,
run_eagerly=FLAGS.run_eagerly,
use_keras_compile_fit=FLAGS.use_keras_compile_fit)
custom_callbacks=custom_callbacks)
if FLAGS.model_export_path:
# As Keras ModelCheckpoint callback used with Keras compile/fit() API
......
@@ -20,6 +20,7 @@ from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from official.modeling import model_training_utils
@@ -49,6 +50,7 @@ flags.DEFINE_float('warmup_steps', 10000,
'Warmup steps for Adam weight decay optimizer.')
common_flags.define_common_bert_flags()
common_flags.define_gin_flags()
FLAGS = flags.FLAGS
@@ -158,7 +160,7 @@ def run_bert_pretrain(strategy):
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
if not FLAGS.model_dir:
FLAGS.model_dir = '/tmp/bert20/'
strategy = distribution_utils.get_distribution_strategy(
......
@@ -29,7 +29,6 @@ from official.nlp.bert import run_squad_helper
from official.nlp.bert import tokenization
from official.nlp.data import squad_lib as squad_lib_wp
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
flags.DEFINE_string('vocab_file', None,
@@ -95,21 +94,7 @@ def main(_):
all_reduce_alg=FLAGS.all_reduce_alg,
tpu_address=FLAGS.tpu)
if FLAGS.mode in ('train', 'train_and_predict'):
if FLAGS.log_steps:
  custom_callbacks = [keras_utils.TimeHistory(
      batch_size=FLAGS.train_batch_size,
      log_steps=FLAGS.log_steps,
      logdir=FLAGS.model_dir,
  )]
else:
  custom_callbacks = None
train_squad(
    strategy,
    input_meta_data,
    custom_callbacks=custom_callbacks,
    run_eagerly=FLAGS.run_eagerly,
)
train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
if FLAGS.mode in ('predict', 'train_and_predict'):
predict_squad(strategy, input_meta_data)
......
@@ -269,11 +269,10 @@ def train_squad(strategy,
loss_factor=1.0 /
strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)
# If explicit_allreduce = True, apply_gradients() no longer implicitly
# allreduces gradients; users manually allreduce gradients and pass the
# allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will
# be applied to the allreduced gradients.
def clip_by_global_norm_callback(grads_and_vars):
grads, variables = zip(*grads_and_vars)
(clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
@@ -291,8 +290,8 @@ def train_squad(strategy,
init_checkpoint=FLAGS.init_checkpoint,
run_eagerly=run_eagerly,
custom_callbacks=custom_callbacks,
explicit_allreduce=False,
post_allreduce_callbacks=[clip_by_global_norm_callback])
def predict_squad(strategy, input_meta_data, tokenizer, bert_config, squad_lib):
......
# NLP Modeling Library
This library provides a set of Keras primitives (Layers, Networks, and Models)
that can be assembled into transformer-based models. They are
flexible, validated, interoperable, and both TF1 and TF2 compatible.
* [`layers`](layers) are the fundamental building blocks for NLP models.
They can be used to assemble new layers, networks, or models.
* [`networks`](networks) are combinations of layers (and possibly other networks). They are sub-units of models that would not be trained alone. They
encapsulate common network structures like a classification head
or a transformer encoder into an easily handled object with a
standardized configuration.
* [`models`](models) are combinations of layers and networks that would be trained. Pre-built canned models are provided as both convenience functions and canonical examples.
* [`losses`](losses) contains common loss computation used in NLP tasks.
Besides the pre-defined primitives, it also provides scaffold classes to allow
easy experimentation with novel architectures, e.g., you don’t need to fork a whole Transformer object to try a different kind of attention primitive.
* [`TransformerScaffold`](layers/transformer_scaffold.py) implements the
Transformer from ["Attention Is All You Need"]
(https://arxiv.org/abs/1706.03762), with a customizable attention layer
option. Users can pass a class to `attention_cls` and associated config to
`attention_cfg`, in which case the scaffold will instantiate the class with
the config, or pass a class instance to `attention_cls`.
* [`EncoderScaffold`](networks/encoder_scaffold.py) implements the transformer
encoder from ["BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding"](https://arxiv.org/abs/1810.04805), with customizable
embedding subnetwork (which will replace the standard embedding logic) and/or a
custom hidden layer (which will replace the Transformer instantiation in the
encoder).
BERT and ALBERT models in this repo are implemented using this library. Code examples can be found in the corresponding model folder.
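The scaffold injection pattern described above is simple: you hand the scaffold either a layer class plus a config dictionary, or an already-constructed layer instance. A self-contained sketch of that dispatch logic, mirroring the `hidden_cls` handling shown in `encoder_scaffold.py` later in this change (the toy layer is illustrative, not part of the library):

```python
import inspect
import tensorflow as tf

class ToyBlock(tf.keras.layers.Layer):
  """Stand-in for a custom Transformer-style block (illustrative only)."""

  def __init__(self, units=4, **kwargs):
    super(ToyBlock, self).__init__(**kwargs)
    self.dense = tf.keras.layers.Dense(units)

  def call(self, inputs):
    return self.dense(inputs)

def build_hidden_layer(hidden_cls, hidden_cfg=None):
  """Accept either a layer class (plus optional config) or a layer instance."""
  if inspect.isclass(hidden_cls):
    return hidden_cls(**hidden_cfg) if hidden_cfg else hidden_cls()
  return hidden_cls  # already an instance

layer = build_hidden_layer(ToyBlock, {'units': 8})
print(layer(tf.ones([2, 3])).shape)  # (2, 8)
```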
# Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models.
* [DenseEinsum](dense_einsum.py) implements a feedforward network using tf.einsum. This layer contains the einsum op, the associated weight, and the
logic required to generate the einsum expression for the given initialization
parameters.
* [Attention](attention.py) implements an optionally masked attention between two tensors, from_tensor and to_tensor, as described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If `from_tensor` and `to_tensor` are the same, then this is self-attention.
* [CachedAttention](attention.py) implements an attention layer with cache used
for auto-regressive decoding.
* [Transformer](transformer.py) implements an optionally masked transformer as
described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](on_device_embedding.py) implements efficient embedding lookups designed for TPU-based models.
* [PositionalEmbedding](position_embedding.py) creates a positional embedding
as described in ["BERT: Pre-training
of Deep Bidirectional Transformers for Language Understanding"]
(https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](self_attention_mask.py) creates a 3D attention mask from a 2D tensor mask.
* [MaskedSoftmax](masked_softmax.py) implements a softmax with an optional masking input. If no mask is provided to this layer, it performs a standard softmax; however, if a mask tensor is applied (which should be 1 in positions where the data should be allowed through, and 0 where the data should be masked), the output will have masked positions set to approximately zero.
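To make the masking behavior concrete, here is a small conceptual sketch of the computation MaskedSoftmax performs, written with plain TensorFlow ops rather than the library layer (an illustration of the idea, not the layer's actual implementation or API):

```python
import tensorflow as tf

def masked_softmax(scores, mask=None):
  """Softmax over the last axis; masked positions (mask == 0) get ~zero weight."""
  if mask is not None:
    # Push masked positions toward -inf before the softmax.
    adder = (1.0 - tf.cast(mask, scores.dtype)) * -10000.0
    scores = scores + adder
  return tf.nn.softmax(scores, axis=-1)

scores = tf.random.normal([2, 4, 4])          # e.g. attention scores
mask = tf.constant([[[1, 1, 0, 0]] * 4] * 2)  # 1 = keep, 0 = mask out
weights = masked_softmax(scores, mask)
```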
@@ -19,6 +19,7 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import gin
import tensorflow as tf
from official.nlp.modeling.layers import attention
@@ -26,6 +27,7 @@ from official.nlp.modeling.layers import dense_einsum
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
class TransformerScaffold(tf.keras.layers.Layer):
"""Transformer scaffold layer.
......
# Losses
Losses contains common loss computation used in NLP tasks.
* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse
categorical crossentropy loss.
* `weighted_sparse_categorical_crossentropy_per_example_loss` computes
per-example sparse categorical crossentropy loss.
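A conceptual sketch of the per-example computation these losses perform, using stock Keras ops rather than the library's own functions (whose exact signatures may differ); the tensors and weights below are illustrative:

```python
import tensorflow as tf

labels = tf.constant([2, 0])           # sparse integer labels
logits = tf.random.normal([2, 5])      # per-example class logits
weights = tf.constant([1.0, 0.5])      # per-example weights

per_example = tf.keras.losses.sparse_categorical_crossentropy(
    labels, logits, from_logits=True)  # shape [2]
batch_loss = tf.reduce_sum(per_example * weights) / tf.reduce_sum(weights)
```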
# Models
Models are combinations of layers and networks that would be trained.
Several pre-built canned models are provided to train encoder networks. These
models are intended as both convenience functions and canonical examples.
* [`BertClassifier`](bert_classifier.py) implements a simple classification
model containing a single classification head using the Classification network.
* [`BertSpanLabeler`](bert_span_labeler.py) implements a simple single-span
start-end predictor (that is, a model that predicts two values: a start token
index and an end token index), suitable for SQuAD-style tasks.
* [`BertPretrainer`](bert_pretrainer.py) implements a masked LM and a
classification head using the Masked LM and Classification networks,
respectively.
# Networks
Networks are combinations of layers (and possibly other networks). They are sub-units of models that would not be trained alone. They
encapsulate common network structures like a classification head
or a transformer encoder into an easily handled object with a
standardized configuration.
* [`TransformerEncoder`](transformer_encoder.py) implements a bi-directional
Transformer-based encoder as described in ["BERT: Pre-training of Deep
Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding lookups,
transformer layers and pooling layer.
* [`AlbertTransformerEncoder`](albert_transformer_encoder.py) implements a
Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942).
Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT factorizes embedding parameters
into two smaller matrices and shares parameters across layers.
* [`MaskedLM`](masked_lm.py) implements a masked language model for BERT pretraining. It assumes that the network being passed has a `get_embedding_table()` method.
* [`Classification`](classification.py) contains a single hidden layer, and is intended for use as a classification head.
* [`SpanLabeling`](span_labeling.py) implements a single-span labeler (that is, a prediction head that can predict one start and end index per batch item) based on a single dense hidden layer. It can be used in the SQuAD task.
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +21,8 @@ from __future__ import division
from __future__ import print_function
import inspect
import gin
import tensorflow as tf
from tensorflow.python.keras.engine import network  # pylint: disable=g-direct-tensorflow-import
@@ -27,6 +30,7 @@ from official.nlp.modeling import layers
@tf.keras.utils.register_keras_serializable(package='Text')
@gin.configurable
class EncoderScaffold(network.Network):
"""Bi-directional Transformer-based encoder network scaffold.
@@ -96,7 +100,6 @@ class EncoderScaffold(network.Network):
hidden_cls=layers.Transformer,
hidden_cfg=None,
**kwargs):
print(embedding_cfg)
self._self_setattr_tracking = False
self._hidden_cls = hidden_cls
self._hidden_cfg = hidden_cfg
@@ -171,7 +174,8 @@ class EncoderScaffold(network.Network):
for _ in range(num_hidden_instances):
if inspect.isclass(hidden_cls):
layer = self._hidden_cls(
    **hidden_cfg) if hidden_cfg else self._hidden_cls()
else:
layer = self._hidden_cls
data = layer([data, attention_mask])
......
@@ -23,16 +23,18 @@ import random
import tarfile
# pylint: disable=g-bad-import-order
import six
from six.moves import urllib
from absl import app as absl_app
from absl import flags
from absl import logging
import six
from six.moves import range
from six.moves import urllib
from six.moves import zip
import tensorflow.compat.v1 as tf
# pylint: enable=g-bad-import-order
from official.nlp.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
# pylint: enable=g-bad-import-order
# Data sources for training/evaluating the transformer translation model.
# If any of the training sources are changed, then either:
@@ -148,7 +150,7 @@ def download_report_hook(count, block_size, total_size):
total_size: total size
"""
percent = int(count * block_size * 100 / total_size)
print(six.ensure_str("\r%d%%" % percent) + " completed", end="\r")
def download_from_url(path, url):
@@ -161,12 +163,12 @@ def download_from_url(path, url):
Returns:
Full path to downloaded file
"""
filename = six.ensure_str(url).split("/")[-1]
found_file = find_file(path, filename, max_depth=0)
if found_file is None:
filename = os.path.join(path, filename)
logging.info("Downloading from %s to %s." % (url, filename))
inprogress_filepath = six.ensure_str(filename) + ".incomplete"
inprogress_filepath, _ = urllib.request.urlretrieve(
url, inprogress_filepath, reporthook=download_report_hook)
# Print newline to clear the carriage return from the download progress.
@@ -242,8 +244,10 @@ def compile_files(raw_dir, raw_files, tag):
"""
logging.info("Compiling files with tag %s." % tag)
filename = "%s-%s" % (_PREFIX, tag)
input_compiled_file = os.path.join(raw_dir,
                                   six.ensure_str(filename) + ".lang1")
target_compiled_file = os.path.join(raw_dir,
                                    six.ensure_str(filename) + ".lang2")
with tf.io.gfile.GFile(input_compiled_file, mode="w") as input_writer:
with tf.io.gfile.GFile(target_compiled_file, mode="w") as target_writer:
@@ -295,7 +299,7 @@ def encode_and_save_files(
target_file = raw_files[1]
# Write examples to each shard in round robin order.
tmp_filepaths = [six.ensure_str(fname) + ".incomplete" for fname in filepaths]
writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
counter, shard = 0, 0
for counter, (input_line, target_line) in enumerate(zip(
@@ -328,7 +332,7 @@ def shuffle_records(fname):
logging.info("Shuffling records in file %s" % fname)
# Rename file prior to shuffling
tmp_fname = six.ensure_str(fname) + ".unshuffled"
tf.gfile.Rename(fname, tmp_fname)
reader = tf.io.tf_record_iterator(tmp_fname)
......