Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

09d9656f · Srihari Humbarwadi · GitHub · ac671306 · 49a5706c · 09d9656f
Unverified Commit 09d9656f authored Jan 13, 2022 by Srihari Humbarwadi Committed by GitHub Jan 13, 2022
20 changed files
--- a/official/modeling/performance.py
+++ b/official/modeling/performance.py
@@ -14,14 +14,19 @@

 """Functions and classes related to training performance."""

+from absl import logging
 import tensorflow as tf


 def configure_optimizer(optimizer,
                        use_float16=False,
-                        use_graph_rewrite=False,
-                        loss_scale=None):
+                        loss_scale=None,
+                        use_graph_rewrite=None):
  """Configures optimizer object with performance options."""
+  if use_graph_rewrite is not None:
+    logging.warning('`use_graph_rewrite` is deprecated inside '
+                    '`configure_optimizer`. Please remove the usage.')
+  del use_graph_rewrite
  if use_float16:
    if loss_scale in (None, 'dynamic'):
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
@@ -29,13 +34,6 @@ def configure_optimizer(optimizer,
      # loss_scale is a number. We interpret that as a fixed loss scale.
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
          optimizer, dynamic=False, initial_scale=loss_scale)
-  if use_graph_rewrite:
-    # Note: the model dtype must be 'float32', which will ensure
-    # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not
-    # double up.
-    optimizer = (
-        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
-            optimizer))
  return optimizer



--- a/official/modeling/tf_utils.py
+++ b/official/modeling/tf_utils.py
@@ -110,6 +110,8 @@ def get_activation(identifier, use_keras_layer=False):
          "swish": "swish",
          "sigmoid": "sigmoid",
          "relu6": tf.nn.relu6,
+          "hard_swish": activations.hard_swish,
+          "hard_sigmoid": activations.hard_sigmoid,
      }
      if identifier in keras_layer_allowlist:
        return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
@@ -199,3 +201,74 @@ def safe_mean(losses):
  total = tf.reduce_sum(losses)
  num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
  return tf.math.divide_no_nan(total, num_elements)
+
+
+def get_replica_id():
+  """Gets replica id depending on the environment."""
+  context = tf.distribute.get_replica_context()
+  if context is not None:
+    return context.replica_id_in_sync_group
+  else:
+    raise RuntimeError("Unknown replica context. The `get_replica_id` method "
+                       "relies on TF 2.x tf.distribute API.")
+
+
+def cross_replica_concat(value, axis, name="cross_replica_concat"):
+  """Concatenates the given `value` across (GPU/TPU) cores, along `axis`.
+
+  In general, each core ("replica") will pass a
+  replica-specific value as `value` (corresponding to some element of a
+  data-parallel computation taking place across replicas).
+
+  The resulting concatenated `Tensor` will have the same shape as `value` for
+  all dimensions except `axis`, where it will be larger by a factor of the
+  number of replicas. It will also have the same `dtype` as `value`.
+
+  The position of a given replica's `value` within the resulting concatenation
+  is determined by that replica's replica ID. For
+  example:
+
+  With `value` for replica 0 given as
+
+      0 0 0
+      0 0 0
+
+  and `value` for replica 1 given as
+
+      1 1 1
+      1 1 1
+
+  the resulting concatenation along axis 0 will be
+
+      0 0 0
+      0 0 0
+      1 1 1
+      1 1 1
+
+  and this result will be identical across all replicas.
+
+  Note that this API only works in TF2 with `tf.distribute`.
+
+  Args:
+    value: The `Tensor` to concatenate across replicas. Each replica will have a
+      different value for this `Tensor`, and these replica-specific values will
+      be concatenated.
+    axis: The axis along which to perform the concatenation as a Python integer
+      (not a `Tensor`). E.g., `axis=0` to concatenate along the batch dimension.
+    name: A name for the operation (used to create a name scope).
+
+  Returns:
+    The result of concatenating `value` along `axis` across replicas.
+
+  Raises:
+    RuntimeError: when the batch (0-th) dimension is None.
+  """
+  with tf.name_scope(name):
+    context = tf.distribute.get_replica_context()
+    # Typically this could be hit only if the tensor is derived from a
+    # dataset with finite epochs and drop_remainder=False, where the last
+    # batch could be of a different batch size and then the dim-0 is of
+    # dynamic shape.
+    if value.shape.as_list()[0] is None:
+      raise RuntimeError(f"{value} has unknown batch.")
+    return context.all_gather(value, axis=axis)
--- a/official/modeling/tf_utils_test.py
+++ b/official/modeling/tf_utils_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tf_utils."""
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.modeling import tf_utils
+
+
+def all_strategy_combinations():
+  return combinations.combine(
+      strategy=[
+          strategy_combinations.cloud_tpu_strategy,
+          strategy_combinations.mirrored_strategy_with_two_gpus,
+      ],
+      mode='eager',
+  )
+
+
+class TFUtilsTest(tf.test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations())
+  def test_cross_replica_concat(self, strategy):
+    num_cores = strategy.num_replicas_in_sync
+
+    shape = (2, 3, 4)
+
+    def concat(axis):
+
+      @tf.function
+      def function():
+        replica_value = tf.fill(shape, tf_utils.get_replica_id())
+        return tf_utils.cross_replica_concat(replica_value, axis=axis)
+
+      return function
+
+    def expected(axis):
+      values = [np.full(shape, i) for i in range(num_cores)]
+      return np.concatenate(values, axis=axis)
+
+    per_replica_results = strategy.run(concat(axis=0))
+    replica_0_result = per_replica_results.values[0].numpy()
+    for value in per_replica_results.values[1:]:
+      self.assertAllClose(value.numpy(), replica_0_result)
+    self.assertAllClose(replica_0_result, expected(axis=0))
+
+    replica_0_result = strategy.run(concat(axis=1)).values[0].numpy()
+    self.assertAllClose(replica_0_result, expected(axis=1))
+
+    replica_0_result = strategy.run(concat(axis=2)).values[0].numpy()
+    self.assertAllClose(replica_0_result, expected(axis=2))
+
+  @combinations.generate(all_strategy_combinations())
+  def test_cross_replica_concat_gradient(self, strategy):
+    num_cores = strategy.num_replicas_in_sync
+
+    shape = (10, 5)
+
+    @tf.function
+    def function():
+      replica_value = tf.random.normal(shape)
+      with tf.GradientTape() as tape:
+        tape.watch(replica_value)
+        concat_value = tf_utils.cross_replica_concat(replica_value, axis=0)
+        output = tf.reduce_sum(concat_value)
+      return tape.gradient(output, replica_value)
+
+    per_replica_gradients = strategy.run(function)
+    for gradient in per_replica_gradients.values:
+      self.assertAllClose(gradient, num_cores * tf.ones(shape))
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/nlp/MODEL_GARDEN.md
+++ b/official/nlp/MODEL_GARDEN.md
+# TF-NLP Model Garden
+
+## Introduction
+
+The TF-NLP library provides a collection of scripts for training and
+evaluating transformer-based models, on various tasks such as sentence
+classification, question answering, and translation. Additionally, we provide
+checkpoints of pretrained models which can be finetuned on downstream tasks.
+
+### How to Train Models
+
+Model Garden can be easily installed with
+`pip install tf-models-nightly`. After installation, check out
+[this instruction](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
+on how to train models with this codebase.
+
+
+By default, the experiment runs on GPUs. To run on TPUs, one should overwrite
+`runtime.distribution_strategy` and set the tpu address. See [RuntimeConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) for details.
+
+In general, the experiments can run with the following command by setting the
+corresponding `${EXPERIMENT}`, `${TASK_CONFIG}`, `${MODEL_CONFIG}`.
+```
+EXPERIMENT=???
+TASK_CONFIG=???
+MODEL_CONFIG=???
+EXTRA_PARAMS=???
+MODEL_DIR=???  # a-folder-to-hold-checkpoints-and-logs
+python3 train.py \
+  --experiment=${EXPERIMENT} \
+  --mode=train_and_eval \
+  --model_dir=${MODEL_DIR} \
+  --config_file=${TASK_CONFIG} \
+  --config_file=${MODEL_CONFIG} \
+  --params_override=${EXTRA_PARAMS}
+``` 
+
+* `EXPERIMENT` can be found under `configs/`
+* `TASK_CONFIG` can be found under `configs/experiments/`
+* `MODEL_CONFIG` can be found under `configs/models/`
+
+#### Order of params override:
+1. `train.py` looks up the registered `ExperimentConfig` with `${EXPERIMENT}`
+2. Overrides params in `TaskConfig` in `${TASK_CONFIG}`
+3. Overrides params `model` in `TaskConfig` with `${MODEL_CONFIG}`
+4. Overrides any params in `ExperimentConfig` with `${EXTRA_PARAMS}`
+
+Note that:
+1. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, `${EXTRA_PARAMS}` are optional when the `${EXPERIMENT}` defaults are sufficient.
+2. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, `${EXTRA_PARAMS}` are only guaranteed to be compatible with the `${EXPERIMENT}` that defines them.
+
+## Experiments
+
+| NAME          | EXPERIMENT                     | TASK_CONFIG  | MODEL_CONFIG | EXTRA_PARAMS |
+| ----------------- | ------------------------ | ------- | -------- | ----------- |
+| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
+| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base ckpt init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.init_checkpoint=gs://tf_model_garden/nlp/bert/uncased_L-12_H-768_A-12/bert_model.ckpt </details> |
+| BERT-base SQuAD v1.1 finetune        | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py)               | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
+|ALBERT-base SQuAD v1.1 finetune | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py)   | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [albert_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/albert_base.yaml)| <details> <summary>data and albert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/3 </details>|
+| Transformer-large WMT14/en-de scratch |[wmt_transformer/large](https://github.com/tensorflow/models/blob/master/official/nlp/configs/wmt_transformer_experiments.py)|  | | <details> <summary>ende-32k sentencepiece</summary>task.sentencepiece_model_path='gs://tf_model_garden/nlp/transformer_wmt/ende_bpe_32k.model'</details> |
+
+
+## Useful links
+
+[How to Train Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
+
+[List of Pretrained Models for finetuning](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md)
+
+[How to Publish Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/tfhub.md)
+
+[TensorFlow blog on Model Garden](https://blog.tensorflow.org/2020/03/introducing-model-garden-for-tensorflow-2.html).
--- a/official/nlp/README.md
+++ b/official/nlp/README.md
@@ -32,10 +32,10 @@ We provide SoTA model implementations, pre-trained models, training and
 evaluation examples, and command lines. Detail instructions can be found in the
 READMEs for specific papers.

-1.  [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for
+1.  [BERT](MODEL_GARDEN.md#experiments): [BERT: Pre-training of Deep Bidirectional Transformers for
    Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al.,
    2018
-2.  [ALBERT](albert):
+2.  [ALBERT](MODEL_GARDEN.md#experiments):
    [A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
    by Lan et al., 2019
 3.  [XLNet](xlnet):

--- a/official/nlp/albert/README.md
+++ b/official/nlp/albert/README.md
-# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)
-
-**WARNING**: We are on the way to deprecate this directory.
-We will add documentation in `nlp/docs` to use the new code in `nlp/modeling`.
-
-The academic paper which describes ALBERT in detail and provides full results on
-a number of tasks can be found here: https://arxiv.org/abs/1909.11942.
-
-This repository contains TensorFlow 2.x implementation for ALBERT.
-
-## Contents
-  * [Contents](#contents)
-  * [Pre-trained Models](#pre-trained-models)
-    * [Restoring from Checkpoints](#restoring-from-checkpoints)
-  * [Set Up](#set-up)
-  * [Process Datasets](#process-datasets)
-  * [Fine-tuning with BERT](#fine-tuning-with-bert)
-    * [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
-    * [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
-    * [SQuAD 1.1](#squad-1.1)
-
-
-## Pre-trained Models
-
-We released both checkpoints and tf.hub modules as the pretrained models for
-fine-tuning. They are TF 2.x compatible and are converted from the ALBERT v2
-checkpoints released in TF 1.x official ALBERT repository
-[google-research/albert](https://github.com/google-research/albert)
-in order to keep consistent with ALBERT paper.
-
-Our current released checkpoints are exactly the same as TF 1.x official ALBERT
-repository.
-
-### Access to Pretrained Checkpoints
-
-Pretrained checkpoints can be found in the following links:
-
-**Note: We implemented ALBERT using Keras functional-style networks in [nlp/modeling](../modeling).
-ALBERT V2 models compatible with TF 2.x checkpoints are:**
-
-*   **[`ALBERT V2 Base`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base.tar.gz)**:
-    12-layer, 768-hidden, 12-heads, 12M parameters
-*   **[`ALBERT V2 Large`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_large.tar.gz)**:
-    24-layer, 1024-hidden, 16-heads, 18M parameters
-*   **[`ALBERT V2 XLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xlarge.tar.gz)**:
-    24-layer, 2048-hidden, 32-heads, 60M parameters
-*   **[`ALBERT V2 XXLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xxlarge.tar.gz)**:
-    12-layer, 4096-hidden, 64-heads, 235M parameters
-
-We recommend to host checkpoints on Google Cloud storage buckets when you use
-Cloud GPU/TPU.
-
-### Restoring from Checkpoints
-
-`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
-weights from provided pre-trained checkpoints, you can use the following code:
-
-```python
-init_checkpoint='the pretrained model checkpoint path.'
-model=tf.keras.Model() # Bert pre-trained model as feature extractor.
-checkpoint = tf.train.Checkpoint(model=model)
-checkpoint.restore(init_checkpoint)
-```
-
-Checkpoints featuring native serialized Keras models
-(i.e. model.load()/load_weights()) will be available soon.
-
-### Access to Pretrained hub modules.
-
-Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
-following links:
-
-*   **[`ALBERT V2 Base`](https://tfhub.dev/tensorflow/albert_en_base/1)**:
-    12-layer, 768-hidden, 12-heads, 12M parameters
-*   **[`ALBERT V2 Large`](https://tfhub.dev/tensorflow/albert_en_large/1)**:
-    24-layer, 1024-hidden, 16-heads, 18M parameters
-*   **[`ALBERT V2 XLarge`](https://tfhub.dev/tensorflow/albert_en_xlarge/1)**:
-    24-layer, 2048-hidden, 32-heads, 60M parameters
-*   **[`ALBERT V2 XXLarge`](https://tfhub.dev/tensorflow/albert_en_xxlarge/1)**:
-    12-layer, 4096-hidden, 64-heads, 235M parameters
-
-## Set Up
-
-```shell
-export PYTHONPATH="$PYTHONPATH:/path/to/models"
-```
-
-Install `tf-nightly` to get latest updates:
-
-```shell
-pip install tf-nightly-gpu
-```
-
-With TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
-TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
-
-```shell
-ctpu up -name <instance name> --tf-version=”nightly”
-```
-
-Second, you need to install TF 2 `tf-nightly` on your VM:
-
-```shell
-pip install tf-nightly
-```
-
-Warning: More details TPU-specific set-up instructions and tutorial should come
-along with official TF 2.x release for TPU. Note that this repo is not
-officially supported by Google Cloud TPU team yet until TF 2.1 released.
-
-## Process Datasets
-
-### Pre-training
-
-Pre-train ALBERT using TF2.x will come soon.
-For now, please use [ALBERT research repo](https://github.com/google-research/ALBERT)
-to pretrain the model and convert the checkpoint to TF2.x compatible ones using
-[tf2_albert_encoder_checkpoint_converter.py](tf2_albert_encoder_checkpoint_converter.py).
-
-
-
-### Fine-tuning
-
-To prepare the fine-tuning data for final model training, use the
-[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
-Note that different from BERT models that use word piece tokenzer,
-ALBERT models employ sentence piece tokenizer. So the FLAG tokenizer_impl has
-to be set to 'sentence_piece'.
-Resulting datasets in `tf_record` format and training meta data should be later
-passed to training or evaluation scripts. The task-specific arguments are
-described in following sections:
-
-* GLUE
-
-Users can download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```shell
-export GLUE_DIR=~/glue
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-
-export TASK_NAME=MNLI
-export OUTPUT_DIR=gs://some_bucket/datasets
-python ../data/create_finetuning_data.py \
- --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
- --sp_model_file=${ALBERT_DIR}/30k-clean.model \
- --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
- --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
- --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
- --fine_tuning_task_type=classification --max_seq_length=128 \
- --classification_task_name=${TASK_NAME} \
- --tokenization=SentencePiece
-```
-
-* SQUAD
-
-The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
-detailed information about the SQuAD datasets and evaluation.
-
-The necessary files can be found here:
-
-*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-*   [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
-*   [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
-*   [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```shell
-export SQUAD_DIR=~/squad
-export SQUAD_VERSION=v1.1
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export OUTPUT_DIR=gs://some_bucket/datasets
-
-python ../data/create_finetuning_data.py \
- --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
- --sp_model_file=${ALBERT_DIR}/30k-clean.model \
- --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
- --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
- --fine_tuning_task_type=squad --max_seq_length=384 \
- --tokenization=SentencePiece
-```
-
-## Fine-tuning with ALBERT
-
-### Cloud GPUs and TPUs
-
-* Cloud Storage
-
-The unzipped pre-trained model files can also be found in the Google Cloud
-Storage folder `gs://cloud-tpu-checkpoints/albert/checkpoints`. For example:
-
-```shell
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export MODEL_DIR=gs://some_bucket/my_output_dir
-```
-
-Currently, users are able to access to `tf-nightly` TPUs and the following TPU
-script should run with `tf-nightly`.
-
-* GPU -> TPU
-
-Just add the following flags to `run_classifier.py` or `run_squad.py`:
-
-```shell
-  --distribution_strategy=tpu
-  --tpu=grpc://${TPU_IP_ADDRESS}:8470
-```
-
-### Sentence and Sentence-pair Classification Tasks
-
-This example code fine-tunes `albert_v2_base` on the Microsoft Research
-Paraphrase Corpus (MRPC) corpus, which only contains 3,600 examples and can
-fine-tune in a few minutes on most GPUs.
-
-We use the `albert_v2_base` as an example throughout the
-workflow.
-
-
-```shell
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export GLUE_DIR=gs://some_bucket/datasets
-export TASK=MRPC
-
-python run_classifier.py \
-  --mode='train_and_eval' \
-  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
-  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
-  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
-  --bert_config_file=${ALBERT_DIR}/albert_config.json \
-  --init_checkpoint=${ALBERT_DIR}/bert_model.ckpt \
-  --train_batch_size=4 \
-  --eval_batch_size=4 \
-  --steps_per_loop=1 \
-  --learning_rate=2e-5 \
-  --num_train_epochs=3 \
-  --model_dir=${MODEL_DIR} \
-  --distribution_strategy=mirrored
-```
-
-Alternatively, instead of specifying `init_checkpoint`, you can specify
-`hub_module_url` to employ a pretraind BERT hub module, e.g.,
-` --hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/1`.
-
-To use TPU, you only need to switch distribution strategy type to `tpu` with TPU
-information and use remote storage for model checkpoints.
-
-```shell
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export TPU_IP_ADDRESS='???'
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export GLUE_DIR=gs://some_bucket/datasets
-
-python run_classifier.py \
-  --mode='train_and_eval' \
-  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
-  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
-  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
-  --bert_config_file=$ALBERT_DIR/albert_config.json \
-  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
-  --train_batch_size=32 \
-  --eval_batch_size=32 \
-  --learning_rate=2e-5 \
-  --num_train_epochs=3 \
-  --model_dir=${MODEL_DIR} \
-  --distribution_strategy=tpu \
-  --tpu=grpc://${TPU_IP_ADDRESS}:8470
-```
-
-### SQuAD 1.1
-
-The Stanford Question Answering Dataset (SQuAD) is a popular question answering
-benchmark dataset. See more in [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).
-
-We use the `albert_v2_base` as an example throughout the
-workflow.
-
-```shell
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export SQUAD_DIR=gs://some_bucket/datasets
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export SQUAD_VERSION=v1.1
-
-python run_squad.py \
-  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
-  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
-  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
-  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
-  --bert_config_file=$ALBERT_DIR/albert_config.json \
-  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
-  --train_batch_size=4 \
-  --predict_batch_size=4 \
-  --learning_rate=8e-5 \
-  --num_train_epochs=2 \
-  --model_dir=${MODEL_DIR} \
-  --distribution_strategy=mirrored
-```
-
-Similarily, you can replace `init_checkpoint` FLAGS with `hub_module_url` to
-specify a hub module path.
-
-To use TPU, you need switch distribution strategy type to `tpu` with TPU
-information.
-
-```shell
-export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
-export TPU_IP_ADDRESS='???'
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export SQUAD_DIR=gs://some_bucket/datasets
-export SQUAD_VERSION=v1.1
-
-python run_squad.py \
-  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
-  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
-  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
-  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
-  --bert_config_file=$ALBERT_DIR/albert_config.json \
-  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
-  --train_batch_size=32 \
-  --learning_rate=8e-5 \
-  --num_train_epochs=2 \
-  --model_dir=${MODEL_DIR} \
-  --distribution_strategy=tpu \
-  --tpu=grpc://${TPU_IP_ADDRESS}:8470
-```
-
-The dev set predictions will be saved into a file called predictions.json in the
-model_dir:
-
-```shell
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
-```
--- a/official/nlp/albert/run_classifier.py
+++ b/official/nlp/albert/run_classifier.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""ALBERT classification finetuning runner in tf2.x."""
-
-import json
-import os
-# Import libraries
-from absl import app
-from absl import flags
-from absl import logging
-import tensorflow as tf
-from official.common import distribute_utils
-from official.nlp.albert import configs as albert_configs
-from official.nlp.bert import bert_models
-from official.nlp.bert import run_classifier as run_classifier_bert
-
-
-FLAGS = flags.FLAGS
-
-
-def predict(strategy, albert_config, input_meta_data, predict_input_fn):
-  """Function outputs both the ground truth predictions as .tsv files."""
-  with strategy.scope():
-    classifier_model = bert_models.classifier_model(
-        albert_config, input_meta_data['num_labels'])[0]
-    checkpoint = tf.train.Checkpoint(model=classifier_model)
-    latest_checkpoint_file = (
-        FLAGS.predict_checkpoint_path or
-        tf.train.latest_checkpoint(FLAGS.model_dir))
-    assert latest_checkpoint_file
-    logging.info('Checkpoint file %s found and restoring from '
-                 'checkpoint', latest_checkpoint_file)
-    checkpoint.restore(
-        latest_checkpoint_file).assert_existing_objects_matched()
-    preds, ground_truth = run_classifier_bert.get_predictions_and_labels(
-        strategy, classifier_model, predict_input_fn, return_probs=True)
-    output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
-    with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
-      logging.info('***** Predict results *****')
-      for probabilities in preds:
-        output_line = '\t'.join(
-            str(class_probability)
-            for class_probability in probabilities) + '\n'
-        writer.write(output_line)
-    ground_truth_labels_file = os.path.join(FLAGS.model_dir,
-                                            'output_labels.tsv')
-    with tf.io.gfile.GFile(ground_truth_labels_file, 'w') as writer:
-      logging.info('***** Ground truth results *****')
-      for label in ground_truth:
-        output_line = '\t'.join(str(label)) + '\n'
-        writer.write(output_line)
-  return
-
-
-def main(_):
-  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
-    input_meta_data = json.loads(reader.read().decode('utf-8'))
-
-  if not FLAGS.model_dir:
-    FLAGS.model_dir = '/tmp/bert20/'
-
-  strategy = distribute_utils.get_distribution_strategy(
-      distribution_strategy=FLAGS.distribution_strategy,
-      num_gpus=FLAGS.num_gpus,
-      tpu_address=FLAGS.tpu)
-  max_seq_length = input_meta_data['max_seq_length']
-  train_input_fn = run_classifier_bert.get_dataset_fn(
-      FLAGS.train_data_path,
-      max_seq_length,
-      FLAGS.train_batch_size,
-      is_training=True)
-  eval_input_fn = run_classifier_bert.get_dataset_fn(
-      FLAGS.eval_data_path,
-      max_seq_length,
-      FLAGS.eval_batch_size,
-      is_training=False)
-
-  albert_config = albert_configs.AlbertConfig.from_json_file(
-      FLAGS.bert_config_file)
-  if FLAGS.mode == 'train_and_eval':
-    run_classifier_bert.run_bert(strategy, input_meta_data, albert_config,
-                                 train_input_fn, eval_input_fn)
-  elif FLAGS.mode == 'predict':
-    predict(strategy, albert_config, input_meta_data, eval_input_fn)
-  else:
-    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
-  return
-
-if __name__ == '__main__':
-  flags.mark_flag_as_required('bert_config_file')
-  flags.mark_flag_as_required('input_meta_data_path')
-  flags.mark_flag_as_required('model_dir')
-  app.run(main)
--- a/official/nlp/albert/run_squad.py
+++ b/official/nlp/albert/run_squad.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x."""
-
-import json
-import os
-import time
-
-# Import libraries
-from absl import app
-from absl import flags
-from absl import logging
-import tensorflow as tf
-from official.common import distribute_utils
-from official.nlp.albert import configs as albert_configs
-from official.nlp.bert import run_squad_helper
-from official.nlp.bert import tokenization
-from official.nlp.data import squad_lib_sp
-
-flags.DEFINE_string(
-    'sp_model_file', None,
-    'The path to the sentence piece model. Used by sentence piece tokenizer '
-    'employed by ALBERT.')
-
-# More flags can be found in run_squad_helper.
-run_squad_helper.define_common_squad_flags()
-
-FLAGS = flags.FLAGS
-
-
-def train_squad(strategy,
-                input_meta_data,
-                custom_callbacks=None,
-                run_eagerly=False):
-  """Runs bert squad training."""
-  bert_config = albert_configs.AlbertConfig.from_json_file(
-      FLAGS.bert_config_file)
-  run_squad_helper.train_squad(strategy, input_meta_data, bert_config,
-                               custom_callbacks, run_eagerly)
-
-
-def predict_squad(strategy, input_meta_data):
-  """Makes predictions for the squad dataset."""
-  bert_config = albert_configs.AlbertConfig.from_json_file(
-      FLAGS.bert_config_file)
-  tokenizer = tokenization.FullSentencePieceTokenizer(
-      sp_model_file=FLAGS.sp_model_file)
-
-  run_squad_helper.predict_squad(strategy, input_meta_data, tokenizer,
-                                 bert_config, squad_lib_sp)
-
-
-def eval_squad(strategy, input_meta_data):
-  """Evaluate on the squad dataset."""
-  bert_config = albert_configs.AlbertConfig.from_json_file(
-      FLAGS.bert_config_file)
-  tokenizer = tokenization.FullSentencePieceTokenizer(
-      sp_model_file=FLAGS.sp_model_file)
-
-  eval_metrics = run_squad_helper.eval_squad(
-      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
-  return eval_metrics
-
-
-def export_squad(model_export_path, input_meta_data):
-  """Exports a trained model as a `SavedModel` for inference.
-
-  Args:
-    model_export_path: a string specifying the path to the SavedModel directory.
-    input_meta_data: dictionary containing meta data about input and model.
-
-  Raises:
-    Export path is not specified, got an empty string or None.
-  """
-  bert_config = albert_configs.AlbertConfig.from_json_file(
-      FLAGS.bert_config_file)
-  run_squad_helper.export_squad(model_export_path, input_meta_data, bert_config)
-
-
-def main(_):
-  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
-    input_meta_data = json.loads(reader.read().decode('utf-8'))
-
-  if FLAGS.mode == 'export_only':
-    export_squad(FLAGS.model_export_path, input_meta_data)
-    return
-
-  # Configures cluster spec for multi-worker distribution strategy.
-  if FLAGS.num_gpus > 0:
-    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
-  strategy = distribute_utils.get_distribution_strategy(
-      distribution_strategy=FLAGS.distribution_strategy,
-      num_gpus=FLAGS.num_gpus,
-      all_reduce_alg=FLAGS.all_reduce_alg,
-      tpu_address=FLAGS.tpu)
-
-  if 'train' in FLAGS.mode:
-    train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
-  if 'predict' in FLAGS.mode:
-    predict_squad(strategy, input_meta_data)
-  if 'eval' in FLAGS.mode:
-    eval_metrics = eval_squad(strategy, input_meta_data)
-    f1_score = eval_metrics['final_f1']
-    logging.info('SQuAD eval F1-score: %f', f1_score)
-    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
-    summary_writer = tf.summary.create_file_writer(summary_dir)
-    with summary_writer.as_default():
-      # TODO(lehou): write to the correct step number.
-      tf.summary.scalar('F1-score', f1_score, step=0)
-      summary_writer.flush()
-    # Also write eval_metrics to json file.
-    squad_lib_sp.write_to_json_files(
-        eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
-    time.sleep(60)
-
-
-if __name__ == '__main__':
-  flags.mark_flag_as_required('bert_config_file')
-  flags.mark_flag_as_required('model_dir')
-  app.run(main)
--- a/official/nlp/bert/bert_models.py
+++ b/official/nlp/bert/bert_models.py
@@ -17,9 +17,8 @@
 import gin
 import tensorflow as tf
 import tensorflow_hub as hub
-
+from official.legacy.albert import configs as albert_configs
 from official.modeling import tf_utils
-from official.nlp.albert import configs as albert_configs
 from official.nlp.bert import configs
 from official.nlp.modeling import models
 from official.nlp.modeling import networks

--- a/official/nlp/bert/common_flags.py
+++ b/official/nlp/bert/common_flags.py
@@ -121,9 +121,5 @@ def use_float16():
  return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16


-def use_graph_rewrite():
-  return flags.FLAGS.fp16_implementation == 'graph_rewrite'
-
-
 def get_loss_scale():
  return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic')
--- a/official/nlp/bert/run_classifier.py
+++ b/official/nlp/bert/run_classifier.py
@@ -76,7 +76,7 @@ def get_loss_fn(num_classes):

  def classification_loss_fn(labels, logits):
    """Classification loss."""
-    labels = tf.squeeze(labels)
+    labels = tf.reshape(labels, [-1])
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot_labels = tf.one_hot(
        tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
@@ -150,8 +150,7 @@ def run_bert_classifier(strategy,
                                              FLAGS.optimizer_type)
    classifier_model.optimizer = performance.configure_optimizer(
        optimizer,
-        use_float16=common_flags.use_float16(),
-        use_graph_rewrite=common_flags.use_graph_rewrite())
+        use_float16=common_flags.use_float16())
    return classifier_model, core_model

  # tf.keras.losses objects accept optional sample_weight arguments (eg. coming

--- a/official/nlp/bert/run_pretraining.py
+++ b/official/nlp/bert/run_pretraining.py
@@ -125,8 +125,7 @@ def run_customized_training(strategy,
        end_lr, optimizer_type)
    pretrain_model.optimizer = performance.configure_optimizer(
        optimizer,
-        use_float16=common_flags.use_float16(),
-        use_graph_rewrite=common_flags.use_graph_rewrite())
+        use_float16=common_flags.use_float16())
    return pretrain_model, core_model

  trained_model = model_training_utils.run_customized_training_loop(

--- a/official/nlp/bert/run_squad_helper.py
+++ b/official/nlp/bert/run_squad_helper.py
@@ -252,8 +252,7 @@ def train_squad(strategy,

    squad_model.optimizer = performance.configure_optimizer(
        optimizer,
-        use_float16=common_flags.use_float16(),
-        use_graph_rewrite=common_flags.use_graph_rewrite())
+        use_float16=common_flags.use_float16())
    return squad_model, core_model

  # Only when explicit_allreduce = True, post_allreduce_callbacks and

--- a/official/nlp/configs/encoders.py
+++ b/official/nlp/configs/encoders.py
@@ -16,9 +16,9 @@

 Includes configurations and factory methods.
 """
+import dataclasses
 from typing import Optional

-import dataclasses
 import gin
 import tensorflow as tf

@@ -26,7 +26,7 @@ from official.modeling import hyperparams
 from official.modeling import tf_utils
 from official.nlp.modeling import layers
 from official.nlp.modeling import networks
-from official.nlp.projects.bigbird import encoder as bigbird_encoder
+from official.projects.bigbird import encoder as bigbird_encoder


 @dataclasses.dataclass
@@ -232,8 +232,9 @@ class EncoderConfig(hyperparams.OneOfConfig):
  kernel: KernelEncoderConfig = KernelEncoderConfig()
  mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig()
  reuse: ReuseEncoderConfig = ReuseEncoderConfig()
-  teams: BertEncoderConfig = BertEncoderConfig()
  xlnet: XLNetEncoderConfig = XLNetEncoderConfig()
+  # If `any` is used, the encoder building relies on any.BUILDER.
+  any: hyperparams.Config = hyperparams.Config()


 @gin.configurable
@@ -290,6 +291,16 @@ def build_encoder(config: EncoderConfig,
        dict_outputs=True)
    return encoder_cls(**kwargs)

+  if encoder_type == "any":
+    encoder = encoder_cfg.BUILDER(encoder_cfg)
+    if not isinstance(encoder,
+                      (tf.Module, tf.keras.Model, tf.keras.layers.Layer)):
+      raise ValueError("The BUILDER returns an unexpected instance. The "
+                       "`build_encoder` should returns a tf.Module, "
+                       "tf.keras.Model or tf.keras.layers.Layer. However, "
+                       f"we get {encoder.__class__}")
+    return encoder
+
  if encoder_type == "mobilebert":
    return networks.MobileBERTEncoder(
        word_vocab_size=encoder_cfg.word_vocab_size,
@@ -465,40 +476,6 @@ def build_encoder(config: EncoderConfig,
        initializer=tf.keras.initializers.RandomNormal(
            stddev=encoder_cfg.initializer_range))

-  if encoder_type == "teams":
-    embedding_cfg = dict(
-        vocab_size=encoder_cfg.vocab_size,
-        type_vocab_size=encoder_cfg.type_vocab_size,
-        hidden_size=encoder_cfg.hidden_size,
-        embedding_width=encoder_cfg.embedding_size,
-        max_seq_length=encoder_cfg.max_position_embeddings,
-        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-        dropout_rate=encoder_cfg.dropout_rate,
-    )
-    embedding_network = networks.PackedSequenceEmbedding(**embedding_cfg)
-    hidden_cfg = dict(
-        num_attention_heads=encoder_cfg.num_attention_heads,
-        intermediate_size=encoder_cfg.intermediate_size,
-        intermediate_activation=tf_utils.get_activation(
-            encoder_cfg.hidden_activation),
-        dropout_rate=encoder_cfg.dropout_rate,
-        attention_dropout_rate=encoder_cfg.attention_dropout_rate,
-        kernel_initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-    )
-    kwargs = dict(
-        embedding_cfg=embedding_cfg,
-        embedding_cls=embedding_network,
-        hidden_cfg=hidden_cfg,
-        num_hidden_instances=encoder_cfg.num_layers,
-        pooled_output_dim=encoder_cfg.hidden_size,
-        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-        return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs,
-        dict_outputs=True)
-    return networks.EncoderScaffold(**kwargs)
-
  if encoder_type == "reuse":
    embedding_cfg = dict(
        vocab_size=encoder_cfg.vocab_size,

--- a/official/nlp/configs/encoders_test.py
+++ b/official/nlp/configs/encoders_test.py
@@ -19,6 +19,8 @@ import tensorflow as tf

 from official.modeling import hyperparams
 from official.nlp.configs import encoders
+from official.nlp.modeling import networks
+from official.projects.teams import teams


 class EncodersTest(tf.test.TestCase):
@@ -37,6 +39,14 @@ class EncodersTest(tf.test.TestCase):
    status = tf.train.Checkpoint(encoder=retored_encoder).restore(ckpt_path)
    status.assert_consumed()

+  def test_build_teams(self):
+    config = encoders.EncoderConfig(
+        type="any", any=teams.TeamsEncoderConfig(num_layers=1))
+    encoder = encoders.build_encoder(config)
+    self.assertIsInstance(encoder, networks.EncoderScaffold)
+    self.assertIsInstance(encoder.embedding_network,
+                          networks.PackedSequenceEmbedding)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/configs/experiment_configs.py
+++ b/official/nlp/configs/experiment_configs.py
@@ -17,4 +17,4 @@
 from official.nlp.configs import finetuning_experiments
 from official.nlp.configs import pretraining_experiments
 from official.nlp.configs import wmt_transformer_experiments
-from official.nlp.projects.teams import teams_experiments
+from official.projects.teams import teams_experiments
--- a/official/nlp/configs/experiments/glue_mnli_text.yaml
+++ b/official/nlp/configs/experiments/glue_mnli_text.yaml
+task:
+  hub_module_url: ''
+  model:
+    num_classes: 3
+  init_checkpoint: ''
+  train_data:
+    drop_remainder: true
+    global_batch_size: 32
+    is_training: true
+    seq_length: 128
+    shuffle_buffer_size: 100
+    tfds_name: 'glue/mnli'
+    tfds_split: 'train'
+    text_fields: ['premise', 'hypothesis']
+    vocab_file: ''
+    lower_case: true
+  validation_data:
+    drop_remainder: false
+    global_batch_size: 32
+    is_training: false
+    seq_length: 128
+    tfds_name: 'glue/mnli'
+    tfds_split: 'validation_matched'
+    text_fields: ['premise', 'hypothesis']
+    vocab_file: ''
+    lower_case: true
+trainer:
+  checkpoint_interval: 3000
+  max_to_keep: 5
+  optimizer_config:
+    learning_rate:
+      polynomial:
+        cycle: false
+        decay_steps: 36813
+        end_learning_rate: 0.0
+        initial_learning_rate: 3.0e-05
+        power: 1.0
+      type: polynomial
+    optimizer:
+      type: adamw
+    warmup:
+      polynomial:
+        power: 1
+        warmup_steps: 3681
+      type: polynomial
+  steps_per_loop: 1000
+  summary_interval: 1000
+  train_steps: 36813
+  validation_interval: 6135
+  validation_steps: 307
--- a/official/nlp/configs/finetuning_experiments.py
+++ b/official/nlp/configs/finetuning_experiments.py
@@ -64,6 +64,48 @@ def bert_sentence_prediction() -> cfg.ExperimentConfig:
  return config


+@exp_factory.register_config_factory('bert/sentence_prediction_text')
+def bert_sentence_prediction_text() -> cfg.ExperimentConfig:
+  r"""BERT sentence prediction with raw text data.
+
+  Example: use tf.text and tfds as input with glue_mnli_text.yaml
+  """
+  config = cfg.ExperimentConfig(
+      task=sentence_prediction.SentencePredictionConfig(
+          train_data=sentence_prediction_dataloader
+          .SentencePredictionTextDataConfig(),
+          validation_data=sentence_prediction_dataloader
+          .SentencePredictionTextDataConfig(
+              is_training=False, drop_remainder=False)),
+      trainer=cfg.TrainerConfig(
+          optimizer_config=optimization.OptimizationConfig({
+              'optimizer': {
+                  'type': 'adamw',
+                  'adamw': {
+                      'weight_decay_rate':
+                          0.01,
+                      'exclude_from_weight_decay':
+                          ['LayerNorm', 'layer_norm', 'bias'],
+                  }
+              },
+              'learning_rate': {
+                  'type': 'polynomial',
+                  'polynomial': {
+                      'initial_learning_rate': 3e-5,
+                      'end_learning_rate': 0.0,
+                  }
+              },
+              'warmup': {
+                  'type': 'polynomial'
+              }
+          })),
+      restrictions=[
+          'task.train_data.is_training != None',
+          'task.validation_data.is_training != None'
+      ])
+  return config
+
+
 @exp_factory.register_config_factory('bert/squad')
 def bert_squad() -> cfg.ExperimentConfig:
  """BERT Squad V1/V2."""

--- a/official/nlp/docs/train.md
+++ b/official/nlp/docs/train.md
@@ -37,6 +37,45 @@ In addition, experiment configuration can be further overriden by
 --params_override=task.train_data.input_path=/some/path,task.hub_module_url=/some/tfhub
 ```

+## Run locally on GPUs
+
+An example command for training a model on local GPUs is below. This command
+trains a BERT-base model on GLUE/MNLI-matched, which is a sentence prediction
+task.
+
+```shell
+PARAMS=runtime.distribution_strategy=mirrored  # Train on GPU
+PARAMS=${PARAMS},task.train_data.input_path=/path-to-your-training-data/
+
+python3 train.py \
+  --experiment=bert/sentence_prediction \
+  --mode=train \
+  --model_dir=/a-folder-to-hold-checkpoints-and-logs/ \
+  --config_file=configs/models/bert_en_uncased_base.yaml \
+  --config_file=configs/experiments/glue_mnli_matched.yaml \
+  --params_override=${PARAMS}
+```
+
+Note that you can specify any detailed configuration by appending
+to the `PARAMS` variable. For example, if you want to load from a pretrained
+checkpoint as initialization (instead of random initialization):
+
+```shell
+PARAMS=${PARAMS},task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
+```
+
+The configuration entry `task.hub_module_url` uses a URL to a TF-Hub model which
+is officially pretrained. See
+[List of Pretrained Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md)
+for the complete list of pretrained models on TF-Hub. When initializing from a
+pretrained model, the encoder architecture of the pretrained model will be used
+and the encoder architecture you set in the config
+(`configs/models/bert_en_uncased_base.yaml` in this case) will be ignored.
+
+You can change `--mode=train` to `--mode=train_and_eval` if you want to see
+evaluation results. In that case, you also need to specify the path to the
+evaluation data by setting `task.validation_data.input_path` in `PARAMS`.
+
 ## Run on Cloud TPUs

 Next, we will describe how to run the [train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) on Cloud TPUs.
@@ -113,6 +152,7 @@ python3 train.py \
 --experiment=bert/sentence_prediction \
 --mode=train_and_eval \
 --model_dir=$OUTPUT_DIR \
+ --config_file=configs/models/bert_en_uncased_base.yaml \
 --config_file=configs/experiments/glue_mnli_matched.yaml \
 --tfhub_cache_dir=$OUTPUT_DIR/hub_cache \
 --tpu=${TPU_NAME} \
@@ -172,6 +212,7 @@ python3 train.py \
 --experiment=bert/squad \
 --mode=train_and_eval \
 --model_dir=$OUTPUT_DIR \
+ --config_file=configs/models/bert_en_uncased_base.yaml \
 --config_file=configs/experiments/squad_v1.1.yaml \
 --tpu=${TPU_NAME} \
 --params_override=$PARAMS

--- a/official/nlp/modeling/layers/__init__.py
+++ b/official/nlp/modeling/layers/__init__.py
@@ -20,8 +20,8 @@ They can be used to assemble new `tf.keras` layers or models.
 from official.nlp.modeling.layers.attention import *
 from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
 from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
+from official.nlp.modeling.layers.block_diag_feedforward import BlockDiagFeedforward
 from official.nlp.modeling.layers.cls_head import *
-from official.nlp.modeling.layers.dense_einsum import DenseEinsum
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
 from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
 from official.nlp.modeling.layers.kernel_attention import KernelAttention
@@ -47,6 +47,7 @@ from official.nlp.modeling.layers.spectral_normalization import *
 from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
 from official.nlp.modeling.layers.text_layers import BertPackInputs
 from official.nlp.modeling.layers.text_layers import BertTokenizer
+from official.nlp.modeling.layers.text_layers import FastWordpieceBertTokenizer
 from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
 from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
 from official.nlp.modeling.layers.transformer import *