Merge branch 'master' of https://github.com/tensorflow/models into context_tf2

3ce2f61b · Kaushik Shivakumar · bb16d5ca · 8e9296ff · 3ce2f61b · 3ce2f61b
Commit 3ce2f61b authored Jul 12, 2020 by Kaushik Shivakumar
20 changed files
--- a/README.md
+++ b/README.md
@@ -10,11 +10,13 @@ can take full advantage of TensorFlow for their research and product development
 | [official](official) | • A collection of example implementations for SOTA models using the latest TensorFlow 2's high-level APIs<br />• Officially maintained, supported, and kept up to date with the latest TensorFlow 2 APIs by TensorFlow<br />• Reasonably optimized for fast performance while still being easy to read |
 | [research](research) | • A collection of research model implementations in TensorFlow 1 or 2 by researchers<br />• Maintained and supported by researchers |
 | [community](community) | • A curated list of the GitHub repositories with machine learning models and implementations powered by TensorFlow 2 |
+| [orbit](orbit) | • A flexible and lightweight library that users can easily use or fork when writing customized training loop code in TensorFlow 2.x. It seamlessly integrates with `tf.distribute` and supports running on different device types (CPU, GPU, and TPU). |

 ## [Announcements](https://github.com/tensorflow/models/wiki/Announcements)

 | Date | News |
 |------|------|
+| July 10, 2020 | TensorFlow 2 meets the [Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) ([Blog](https://blog.tensorflow.org/2020/07/tensorflow-2-meets-object-detection-api.html)) |
 | June 30, 2020 | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://github.com/tensorflow/models/tree/master/official/vision/detection#train-a-spinenet-49-based-mask-r-cnn) released ([Tweet](https://twitter.com/GoogleAI/status/1278016712978264064)) |
 | June 17, 2020 | [Context R-CNN: Long Term Temporal Context for Per-Camera Object Detection](https://github.com/tensorflow/models/tree/master/research/object_detection#june-17th-2020) released ([Tweet](https://twitter.com/GoogleAI/status/1276571419422253057)) |
 | May 21, 2020 | [Unifying Deep Local and Global Features for Image Search (DELG)](https://github.com/tensorflow/models/tree/master/research/delf#delg) code released |
@@ -23,12 +25,6 @@ can take full advantage of TensorFlow for their research and product development
 | May 1, 2020 | [DELF: DEep Local Features](https://github.com/tensorflow/models/tree/master/research/delf) updated to support TensorFlow 2.1 |
 | March 31, 2020 | [Introducing the Model Garden for TensorFlow 2](https://blog.tensorflow.org/2020/03/introducing-model-garden-for-tensorflow-2.html) ([Tweet](https://twitter.com/TensorFlow/status/1245029834633297921)) |

-## [Milestones](https://github.com/tensorflow/models/milestones)
-
-| Date | Milestone |
-|------|-----------|
-| July 8, 2020 | [![GitHub milestone](https://img.shields.io/github/milestones/progress/tensorflow/models/1)](https://github.com/tensorflow/models/milestone/1) |
-
 ## Contributions

 [![help wanted:paper implementation](https://img.shields.io/github/issues/tensorflow/models/help%20wanted%3Apaper%20implementation)](https://github.com/tensorflow/models/labels/help%20wanted%3Apaper%20implementation)

--- a/official/README.md
+++ b/official/README.md
@@ -17,12 +17,9 @@ with the same or improved speed and performance with each new TensorFlow build.
 The team is actively developing new models.
 In the near future, we will add:

-* State-of-the-art language understanding models:
-  More members in Transformer family
-* State-of-the-art image classification models:
-  EfficientNet, MnasNet, and variants
-* State-of-the-art objection detection and instance segmentation models:
-  RetinaNet, Mask R-CNN, SpineNet, and variants
+* State-of-the-art language understanding models.
+* State-of-the-art image classification models.
+* State-of-the-art object detection and instance segmentation models.

 ## Table of Contents


--- a/official/benchmark/unet3d_benchmark.py
+++ b/official/benchmark/unet3d_benchmark.py
@@ -93,8 +93,11 @@ class Unet3DAccuracyBenchmark(keras_benchmark.KerasBenchmark):
    """Runs and reports the benchmark given the provided configuration."""
    params = unet_training_lib.extract_params(FLAGS)
    strategy = unet_training_lib.create_distribution_strategy(params)
-    if params.use_bfloat16:
-      policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
+
+    input_dtype = params.dtype
+    if input_dtype == 'float16' or input_dtype == 'bfloat16':
+      policy = tf.keras.mixed_precision.experimental.Policy(
+          'mixed_bfloat16' if input_dtype == 'bfloat16' else 'mixed_float16')
      tf.keras.mixed_precision.experimental.set_policy(policy)

    stats = {}

--- a/official/colab/nlp/customize_encoder.ipynb
+++ b/official/colab/nlp/customize_encoder.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Bp8t2AI8i7uP"
+      },
+      "source": [
+        "##### Copyright 2020 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "colab": {},
+        "colab_type": "code",
+        "id": "rxPj2Lsni9O4"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "6xS-9i5DrRvO"
+      },
+      "source": [
+        "# Customizing a Transformer Encoder"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Mwb9uw1cDXsa"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "iLrcV4IyrcGX"
+      },
+      "source": [
+        "## Learning objectives\n",
+        "\n",
+        "The [TensorFlow Models NLP library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling) is a collection of tools for building and training modern high performance natural language models.\n",
+        "\n",
+        "The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) is the core of this library, and many new network architectures have been proposed to improve the encoder. In this Colab notebook, we will learn how to customize the encoder to employ new network architectures."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "YYxdyoWgsl8t"
+      },
+      "source": [
+        "## Install and import"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "fEJSFutUsn_h"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
+        "*  `pip` will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "thsKZDjhswhR"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -q tf-nightly\n",
+        "!pip install -q tf-models-nightly"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "hpf7JPCVsqtv"
+      },
+      "source": [
+        "### Import TensorFlow and other libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "my4dp-RMssQe"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official.modeling import activations\n",
+        "from official.nlp import modeling\n",
+        "from official.nlp.modeling import layers, losses, models, networks"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "vjDmVsFfs85n"
+      },
+      "source": [
+        "## Canonical BERT encoder\n",
+        "\n",
+        "Before learning how to customize the encoder, let's first create a canonical BERT encoder and use it to instantiate a `BertClassifier` for a classification task."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "Oav8sbgstWc-"
+      },
+      "outputs": [],
+      "source": [
+        "cfg = {\n",
+        "    \"vocab_size\": 100,\n",
+        "    \"hidden_size\": 32,\n",
+        "    \"num_layers\": 3,\n",
+        "    \"num_attention_heads\": 4,\n",
+        "    \"intermediate_size\": 64,\n",
+        "    \"activation\": activations.gelu,\n",
+        "    \"dropout_rate\": 0.1,\n",
+        "    \"attention_dropout_rate\": 0.1,\n",
+        "    \"sequence_length\": 16,\n",
+        "    \"type_vocab_size\": 2,\n",
+        "    \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
+        "}\n",
+        "bert_encoder = modeling.networks.TransformerEncoder(**cfg)\n",
+        "\n",
+        "def build_classifier(bert_encoder):\n",
+        "  return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
+        "\n",
+        "canonical_classifier_model = build_classifier(bert_encoder)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Qe2UWI6_tsHo"
+      },
+      "source": [
+        "`canonical_classifier_model` can be trained using the training data. For details about how to train the model, please see the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb). We skip the code that trains the model here.\n",
+        "\n",
+        "After training, we can apply the model to do prediction.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "csED2d-Yt5h6"
+      },
+      "outputs": [],
+      "source": [
+        "def predict(model):\n",
+        "  batch_size = 3\n",
+        "  np.random.seed(0)\n",
+        "  word_ids = np.random.randint(\n",
+        "      cfg[\"vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
+        "  mask = np.random.randint(2, size=(batch_size, cfg[\"sequence_length\"]))\n",
+        "  type_ids = np.random.randint(\n",
+        "      cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
+        "  print(model([word_ids, mask, type_ids], training=False))\n",
+        "\n",
+        "predict(canonical_classifier_model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "PzKStEK9t_Pb"
+      },
+      "source": [
+        "## Customize BERT encoder\n",
+        "\n",
+        "One BERT encoder consists of an embedding network and multiple transformer blocks, and each transformer block contains an attention layer and a feedforward layer."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "rmwQfhj6fmKz"
+      },
+      "source": [
+        "We provide easy ways to customize each of those components via (1)\n",
+        "[EncoderScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) and (2) [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "xsMgEVHAui11"
+      },
+      "source": [
+        "### Use EncoderScaffold\n",
+        "\n",
+        "`EncoderScaffold` allows users to provide a custom embedding subnetwork\n",
+        "  (which will replace the standard embedding logic) and/or a custom hidden layer class (which will replace the `Transformer` instantiation in the encoder)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "-JBabpa2AOz8"
+      },
+      "source": [
+        "#### Without Customization\n",
+        "\n",
+        "Without any customization, `EncoderScaffold` behaves the same as the canonical `TransformerEncoder`.\n",
+        "\n",
+        "As shown in the following example, `EncoderScaffold` can load `TransformerEncoder`'s weights and output the same values:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "ktNzKuVByZQf"
+      },
+      "outputs": [],
+      "source": [
+        "default_hidden_cfg = dict(\n",
+        "    num_attention_heads=cfg[\"num_attention_heads\"],\n",
+        "    intermediate_size=cfg[\"intermediate_size\"],\n",
+        "    intermediate_activation=activations.gelu,\n",
+        "    dropout_rate=cfg[\"dropout_rate\"],\n",
+        "    attention_dropout_rate=cfg[\"attention_dropout_rate\"],\n",
+        "    kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        ")\n",
+        "default_embedding_cfg = dict(\n",
+        "    vocab_size=cfg[\"vocab_size\"],\n",
+        "    type_vocab_size=cfg[\"type_vocab_size\"],\n",
+        "    hidden_size=cfg[\"hidden_size\"],\n",
+        "    seq_length=cfg[\"sequence_length\"],\n",
+        "    initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        "    dropout_rate=cfg[\"dropout_rate\"],\n",
+        "    max_seq_length=cfg[\"sequence_length\"],\n",
+        ")\n",
+        "default_kwargs = dict(\n",
+        "    hidden_cfg=default_hidden_cfg,\n",
+        "    embedding_cfg=default_embedding_cfg,\n",
+        "    num_hidden_instances=cfg[\"num_layers\"],\n",
+        "    pooled_output_dim=cfg[\"hidden_size\"],\n",
+        "    return_all_layer_outputs=True,\n",
+        "    pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
+        ")\n",
+        "encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
+        "classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
+        "classifier_model_from_encoder_scaffold.set_weights(\n",
+        "    canonical_classifier_model.get_weights())\n",
+        "predict(classifier_model_from_encoder_scaffold)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "sMaUmLyIuwcs"
+      },
+      "source": [
+        "#### Customize Embedding\n",
+        "\n",
+        "Next, we show how to use a customized embedding network.\n",
+        "\n",
+        "We first build an embedding network that will replace the default network. This one will have 2 inputs (`mask` and `word_ids`) instead of 3, and won't use positional embeddings."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "LTinnaG6vcsw"
+      },
+      "outputs": [],
+      "source": [
+        "word_ids = tf.keras.layers.Input(\n",
+        "    shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
+        "mask = tf.keras.layers.Input(\n",
+        "    shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
+        "embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
+        "    vocab_size=cfg['vocab_size'],\n",
+        "    embedding_width=cfg['hidden_size'],\n",
+        "    initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
+        "    name=\"word_embeddings\")\n",
+        "word_embeddings = embedding_layer(word_ids)\n",
+        "attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
+        "new_embedding_network = tf.keras.Model([word_ids, mask],\n",
+        "                                       [word_embeddings, attention_mask])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HN7_yu-6O3qI"
+      },
+      "source": [
+        "Inspecting `new_embedding_network`, we can see it takes two inputs:\n",
+        "`input_word_ids` and `input_mask`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "fO9zKFE4OpHp"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "9cOaGQHLv12W"
+      },
+      "source": [
+        "We then can build a new encoder using the above `new_embedding_network`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "mtFDMNf2vIl9"
+      },
+      "outputs": [],
+      "source": [
+        "kwargs = dict(default_kwargs)\n",
+        "\n",
+        "# Use new embedding network.\n",
+        "kwargs['embedding_cls'] = new_embedding_network\n",
+        "kwargs['embedding_data'] = embedding_layer.embeddings\n",
+        "\n",
+        "encoder_with_customized_embedding = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_customized_embedding)\n",
+        "# ... Train the model ...\n",
+        "print(classifier_model.inputs)\n",
+        "\n",
+        "# Assert that there are only two inputs.\n",
+        "assert len(classifier_model.inputs) == 2"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Z73ZQDtmwg9K"
+      },
+      "source": [
+        "#### Customized Transformer\n",
+        "\n",
+        "Users can also override the [hidden_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py#L103) argument in `EncoderScaffold`'s constructor to employ a customized Transformer layer.\n",
+        "\n",
+        "See [ReZeroTransformer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/rezero_transformer.py) for how to implement a customized Transformer layer.\n",
+        "\n",
+        "Following is an example of using `ReZeroTransformer`:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "uAIarLZgw6pA"
+      },
+      "outputs": [],
+      "source": [
+        "kwargs = dict(default_kwargs)\n",
+        "\n",
+        "# Use ReZeroTransformer.\n",
+        "kwargs['hidden_cls'] = modeling.layers.ReZeroTransformer\n",
+        "\n",
+        "encoder_with_rezero_transformer = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_rezero_transformer)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
+        "assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "6PMHFdvnxvR0"
+      },
+      "source": [
+        "### Use [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)\n",
+        "\n",
+        "The above method of customizing `Transformer` requires rewriting the whole `Transformer` layer, while sometimes you may only want to customize either the attention layer or the feedforward block. In this case, [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py) can be used.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "D6FejlgwyAy_"
+      },
+      "source": [
+        "#### Customize Attention Layer\n",
+        "\n",
+        "Users can also override the [attention_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py#L45) argument in `TransformerScaffold`'s constructor to employ a customized Attention layer.\n",
+        "\n",
+        "See [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py) for how to implement a customized `Attention` layer.\n",
+        "\n",
+        "Following is an example of using [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py):"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "nFrSMrZuyNeQ"
+      },
+      "outputs": [],
+      "source": [
+        "# Use TalkingHeadsAttention\n",
+        "hidden_cfg = dict(default_hidden_cfg)\n",
+        "hidden_cfg['attention_cls'] = modeling.layers.TalkingHeadsAttention\n",
+        "\n",
+        "kwargs = dict(default_kwargs)\n",
+        "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n",
+        "kwargs['hidden_cfg'] = hidden_cfg\n",
+        "\n",
+        "encoder = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
+        "assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "kuEJcTyByVvI"
+      },
+      "source": [
+        "#### Customize Feedforward Layer\n",
+        "\n",
+        "Similarly, one could also customize the feedforward layer.\n",
+        "\n",
+        "See [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py) for how to implement a customized feedforward layer.\n",
+        "\n",
+        "Following is an example of using [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "XAbKy_l4y_-i"
+      },
+      "outputs": [],
+      "source": [
+        "# Use GatedFeedforward\n",
+        "hidden_cfg = dict(default_hidden_cfg)\n",
+        "hidden_cfg['feedforward_cls'] = modeling.layers.GatedFeedforward\n",
+        "\n",
+        "kwargs = dict(default_kwargs)\n",
+        "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n",
+        "kwargs['hidden_cfg'] = hidden_cfg\n",
+        "\n",
+        "encoder_with_gated_feedforward = modeling.networks.EncoderScaffold(**kwargs)\n",
+        "classifier_model = build_classifier(encoder_with_gated_feedforward)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)\n",
+        "\n",
+        "# Assert that the variable `gate` from GatedFeedforward exists.\n",
+        "assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "a_8NWUhkzeAq"
+      },
+      "source": [
+        "### Build a new Encoder using building blocks from KerasBERT.\n",
+        "\n",
+        "Finally, you could also build a new encoder using building blocks in the modeling library.\n",
+        "\n",
+        "See [AlbertTransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_transformer_encoder.py) as an example:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "xsiA3RzUzmUM"
+      },
+      "outputs": [],
+      "source": [
+        "albert_encoder = modeling.networks.AlbertTransformerEncoder(**cfg)\n",
+        "classifier_model = build_classifier(albert_encoder)\n",
+        "# ... Train the model ...\n",
+        "predict(classifier_model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "MeidDfhlHKSO"
+      },
+      "source": [
+        "Inspecting the `albert_encoder`, we see it stacks the same `Transformer` layer multiple times."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "Uv_juT22HERW"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Customizing a Transformer Encoder",
+      "private_outputs": true,
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/colab/nlp/nlp_modeling_library_intro.ipynb
+++ b/official/colab/nlp/nlp_modeling_library_intro.ipynb
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "80xnUmoI7fBX"
+      },
+      "source": [
+        "##### Copyright 2020 The TensorFlow Authors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "colab": {},
+        "colab_type": "code",
+        "id": "8nvTnfs6Q692"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "WmfcMK5P5C1G"
+      },
+      "source": [
+        "# Introduction to the TensorFlow Models NLP library"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "cH-oJ8R6AHMK"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0H_EFIhq4-MJ"
+      },
+      "source": [
+        "## Learning objectives\n",
+        "\n",
+        "In this Colab notebook, you will learn how to build transformer-based models for common NLP tasks including pretraining, span labelling and classification using the building blocks from [NLP modeling library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "2N97-dps_nUk"
+      },
+      "source": [
+        "## Install and import"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "459ygAVl_rg0"
+      },
+      "source": [
+        "### Install the TensorFlow Model Garden pip package\n",
+        "\n",
+        "*  `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
+        "*  `pip` will install all models and dependencies automatically."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "Y-qGkdh6_sZc"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -q tf-nightly\n",
+        "!pip install -q tf-models-nightly"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "e4huSSwyAG_5"
+      },
+      "source": [
+        "### Import TensorFlow and other libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "jqYXqtjBAJd9"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "\n",
+        "from official.nlp import modeling\n",
+        "from official.nlp.modeling import layers, losses, models, networks"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "djBQWjvy-60Y"
+      },
+      "source": [
+        "## BERT pretraining model\n",
+        "\n",
+        "BERT ([Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)) introduced the method of pre-training language representations on a large text corpus and then using that model for downstream NLP tasks.\n",
+        "\n",
+        "In this section, we will learn how to build a model to pretrain BERT on the masked language modeling task and next sentence prediction task. For simplicity, we only show a minimal example and use dummy data."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "MKuHVlsCHmiq"
+      },
+      "source": [
+        "### Build a `BertPretrainer` model wrapping `TransformerEncoder`\n",
+        "\n",
+        "The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/transformer_encoder.py) implements the Transformer-based encoder as described in the [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
+        "\n",
+        "The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "EXkcXz-9BwB3"
+      },
+      "outputs": [],
+      "source": [
+        "# Build a small transformer network.\n",
+        "vocab_size = 100\n",
+        "sequence_length = 16\n",
+        "network = modeling.networks.TransformerEncoder(\n",
+        "    vocab_size=vocab_size, num_layers=2, sequence_length=16)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0NH5irV5KTMS"
+      },
+      "source": [
+        "Inspecting the encoder, we see it contains a few embedding layers and stacked `Transformer` layers, and is connected to three input layers:\n",
+        "\n",
+        "`input_word_ids`, `input_type_ids` and `input_mask`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "lZNoZkBrIoff"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "o7eFOZXiIl-b"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a BERT pretrainer with the created network.\n",
+        "num_token_predictions = 8\n",
+        "bert_pretrainer = modeling.models.BertPretrainer(\n",
+        "    network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "d5h5HT7gNHx_"
+      },
+      "source": [
+        "Inspecting the `bert_pretrainer`, we see it wraps the `encoder` with additional `MaskedLM` and `Classification` heads."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "2tcNfm03IBF7"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "F2oHrXGUIS0M"
+      },
+      "outputs": [],
+      "source": [
+        "# We can feed some dummy data to get masked language model and sentence output.\n",
+        "batch_size = 2\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "masked_lm_positions_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
+        "\n",
+        "outputs = bert_pretrainer(\n",
+        "    [word_id_data, mask_data, type_id_data, masked_lm_positions_data])\n",
+        "lm_output = outputs[\"masked_lm\"]\n",
+        "sentence_output = outputs[\"classification\"]\n",
+        "print(lm_output)\n",
+        "print(sentence_output)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "bnx3UCHniCS5"
+      },
+      "source": [
+        "### Compute loss\n",
+        "Next, we can use `lm_output` and `sentence_output` to compute `loss`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "k30H4Q86f52x"
+      },
+      "outputs": [],
+      "source": [
+        "masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n",
+        "masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
+        "next_sentence_labels_data = np.random.randint(2, size=(batch_size))\n",
+        "\n",
+        "mlm_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
+        "    labels=masked_lm_ids_data,\n",
+        "    predictions=lm_output,\n",
+        "    weights=masked_lm_weights_data)\n",
+        "sentence_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
+        "    labels=next_sentence_labels_data,\n",
+        "    predictions=sentence_output)\n",
+        "loss = mlm_loss + sentence_loss\n",
+        "print(loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "wrmSs8GjHxVw"
+      },
+      "source": [
+        "With the loss, you can optimize the model.\n",
+        "After training, we can save the weights of TransformerEncoder for the downstream fine-tuning tasks. Please see [run_pretraining.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_pretraining.py) for the full example.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "k8cQVFvBCV4s"
+      },
+      "source": [
+        "## Span labeling model\n",
+        "\n",
+        "Span labeling is the task of assigning labels to a span of text, for example, labeling a span of text as the answer to a given question.\n",
+        "\n",
+        "In this section, we will learn how to build a span labeling model. Again, we use dummy data for simplicity."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "xrLLEWpfknUW"
+      },
+      "source": [
+        "### Build a BertSpanLabeler wrapping TransformerEncoder\n",
+        "\n",
+        "[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
+        "\n",
+        "Note that `BertSpanLabeler` wraps a `TransformerEncoder`, the weights of which can be restored from the above pretraining model.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "B941M4iUCejO"
+      },
+      "outputs": [],
+      "source": [
+        "network = modeling.networks.TransformerEncoder(\n",
+        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
+        "\n",
+        "# Create a BERT trainer with the created network.\n",
+        "bert_span_labeler = modeling.models.BertSpanLabeler(network)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "QpB9pgj4PpMg"
+      },
+      "source": [
+        "Inspecting the `bert_span_labeler`, we see it wraps the encoder with additional `SpanLabeling` that outputs `start_position` and `end_position`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "RbqRNJCLJu4H"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "fUf1vRxZJwio"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a set of 2-dimensional data tensors to feed into the model.\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "\n",
+        "# Feed the data to the model.\n",
+        "start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n",
+        "print(start_logits)\n",
+        "print(end_logits)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "WqhgQaN1lt-G"
+      },
+      "source": [
+        "### Compute loss\n",
+        "With `start_logits` and `end_logits`, we can compute loss:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "waqs6azNl3Nn"
+      },
+      "outputs": [],
+      "source": [
+        "start_positions = np.random.randint(sequence_length, size=(batch_size))\n",
+        "end_positions = np.random.randint(sequence_length, size=(batch_size))\n",
+        "\n",
+        "start_loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    start_positions, start_logits, from_logits=True)\n",
+        "end_loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+        "    end_positions, end_logits, from_logits=True)\n",
+        "\n",
+        "total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n",
+        "print(total_loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Zdf03YtZmd_d"
+      },
+      "source": [
+        "With the `total_loss`, you can optimize the model. Please see [run_squad.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py) for the full example."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0A1XnGSTChg9"
+      },
+      "source": [
+        "## Classification model\n",
+        "\n",
+        "In the last section, we show how to build a text classification model.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "MSK8OpZgnQa9"
+      },
+      "source": [
+        "### Build a BertClassifier model wrapping TransformerEncoder\n",
+        "\n",
+        "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a simple token classification model containing a single classification head using the `TokenClassification` network."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "cXXCsffkCphk"
+      },
+      "outputs": [],
+      "source": [
+        "network = modeling.networks.TransformerEncoder(\n",
+        "        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
+        "\n",
+        "# Create a BERT trainer with the created network.\n",
+        "num_classes = 2\n",
+        "bert_classifier = modeling.models.BertClassifier(\n",
+        "    network, num_classes=num_classes)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "8tZKueKYP4bB"
+      },
+      "source": [
+        "Inspecting the `bert_classifier`, we see it wraps the `encoder` with an additional `Classification` head."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "snlutm9ZJgEZ"
+      },
+      "outputs": [],
+      "source": [
+        "tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "yyHPHsqBJkCz"
+      },
+      "outputs": [],
+      "source": [
+        "# Create a set of 2-dimensional data tensors to feed into the model.\n",
+        "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
+        "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n",
+        "\n",
+        "# Feed the data to the model.\n",
+        "logits = bert_classifier([word_id_data, mask_data, type_id_data])\n",
+        "print(logits)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "w--a2mg4nzKm"
+      },
+      "source": [
+        "### Compute loss\n",
+        "\n",
+        "With `logits`, we can compute `loss`:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "9X0S1DoFn_5Q"
+      },
+      "outputs": [],
+      "source": [
+        "labels = np.random.randint(num_classes, size=(batch_size))\n",
+        "\n",
+        "loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
+        "    labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
+        "print(loss)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "mzBqOylZo3og"
+      },
+      "source": [
+        "With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Introduction to the TensorFlow Models NLP library",
+      "private_outputs": true,
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/official/core/base_task.py
+++ b/official/core/base_task.py
@@ -59,7 +59,7 @@ class Task(tf.Module):
  def initialize(self, model: tf.keras.Model):
    """A callback function used as CheckpointManager's init_fn.

-    This function will be called when no checkpoint found for the model.
+    This function will be called when no checkpoint is found for the model.
    If there is a checkpoint, the checkpoint will be loaded and this function
    will not be called. You can use this callback function to load a pretrained
    checkpoint, saved under a directory other than the model_dir.
@@ -71,7 +71,7 @@ class Task(tf.Module):

  @abc.abstractmethod
  def build_model(self) -> tf.keras.Model:
-    """Creates the model architecture.
+    """Creates model architecture.

    Returns:
      A model instance.
@@ -135,7 +135,7 @@ class Task(tf.Module):
    Args:
      labels: optional label tensors.
      model_outputs: a nested structure of output tensors.
-      aux_losses: auxiliarly loss tensors, i.e. `losses` in keras.Model.
+      aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model.

    Returns:
      The total loss tensor.
@@ -232,7 +232,7 @@ class Task(tf.Module):
    return logs

  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
-    """Validatation step.
+    """Validation step.

    With distribution strategies, this method runs on devices.


--- a/official/core/input_reader.py
+++ b/official/core/input_reader.py
@@ -171,6 +171,9 @@ class InputReader:
        as_supervised=self._tfds_as_supervised,
        decoders=decoders,
        read_config=read_config)
+
+    if self._is_training:
+      dataset = dataset.repeat()
    return dataset

  @property

--- a/official/modeling/hyperparams/base_config.py
+++ b/official/modeling/hyperparams/base_config.py
@@ -126,10 +126,10 @@ class Config(params_dict.ParamsDict):
    subconfig_type = Config
    if k in cls.__annotations__:
      # Directly Config subtype.
-      type_annotation = cls.__annotations__[k]
+      type_annotation = cls.__annotations__[k]  # pytype: disable=invalid-annotation
      if (isinstance(type_annotation, type) and
          issubclass(type_annotation, Config)):
-        subconfig_type = cls.__annotations__[k]
+        subconfig_type = cls.__annotations__[k]  # pytype: disable=invalid-annotation
      else:
        # Check if the field is a sequence of subtypes.
        field_type = getattr(type_annotation, '__origin__', type(None))

--- a/official/modeling/hyperparams/config_definitions.py
+++ b/official/modeling/hyperparams/config_definitions.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common configuration settings."""
+
 from typing import Optional, Union

 import dataclasses
@@ -123,8 +124,8 @@ class RuntimeConfig(base_config.Config):
  task_index: int = -1
  all_reduce_alg: Optional[str] = None
  num_packs: int = 1
-  loss_scale: Optional[Union[str, float]] = None
  mixed_precision_dtype: Optional[str] = None
+  loss_scale: Optional[Union[str, float]] = None
  run_eagerly: bool = False
  batchnorm_spatial_persistent: bool = False

@@ -172,23 +173,27 @@ class TrainerConfig(base_config.Config):
    eval_tf_function: whether or not to use tf_function for eval.
    steps_per_loop: number of steps per loop.
    summary_interval: number of steps between each summary.
-    checkpoint_intervals: number of steps between checkpoints.
+    checkpoint_interval: number of steps between checkpoints.
    max_to_keep: max checkpoints to keep.
    continuous_eval_timeout: maximum number of seconds to wait between
-      checkpoints, if set to None, continuous eval will wait indefinetely.
+      checkpoints, if set to None, continuous eval will wait indefinitely.
+    train_steps: number of train steps.
+    validation_steps: number of eval steps. If `None`, the entire eval dataset
+      is used.
+    validation_interval: number of training steps to run between evaluations.
  """
  optimizer_config: OptimizationConfig = OptimizationConfig()
-  train_steps: int = 0
-  validation_steps: Optional[int] = None
-  validation_interval: int = 100
+  train_tf_while_loop: bool = True
+  train_tf_function: bool = True
+  eval_tf_function: bool = True
  steps_per_loop: int = 1000
  summary_interval: int = 1000
  checkpoint_interval: int = 1000
  max_to_keep: int = 5
  continuous_eval_timeout: Optional[int] = None
-  train_tf_while_loop: bool = True
-  train_tf_function: bool = True
-  eval_tf_function: bool = True
+  train_steps: int = 0
+  validation_steps: Optional[int] = None
+  validation_interval: int = 1000


 @dataclasses.dataclass

--- a/official/modeling/optimization/configs/learning_rate_config.py
+++ b/official/modeling/optimization/configs/learning_rate_config.py
@@ -20,6 +20,20 @@ import dataclasses
 from official.modeling.hyperparams import base_config


+@dataclasses.dataclass
+class ConstantLrConfig(base_config.Config):
+  """Configuration for constant learning rate.
+
+  This class is a container for the constant learning rate configs.
+
+  Attributes:
+    name: The name of the learning rate schedule. Defaults to Constant.
+    learning_rate: A float. The learning rate. Defaults to 0.1.
+  """
+  name: str = 'Constant'
+  learning_rate: float = 0.1
+
+
 @dataclasses.dataclass
 class StepwiseLrConfig(base_config.Config):
  """Configuration for stepwise learning rate decay.

--- a/official/modeling/optimization/configs/optimization_config.py
+++ b/official/modeling/optimization/configs/optimization_config.py
@@ -55,12 +55,14 @@ class LrConfig(oneof.OneOfConfig):

  Attributes:
    type: 'str', type of lr schedule to be used, on the of fields below.
+    constant: constant learning rate config.
    stepwise: stepwise learning rate config.
    exponential: exponential learning rate config.
    polynomial: polynomial learning rate config.
    cosine: cosine learning rate config.
  """
  type: Optional[str] = None
+  constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig()
  stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig()
  exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig()
  polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig()

--- a/official/modeling/optimization/configs/optimizer_config.py
+++ b/official/modeling/optimization/configs/optimizer_config.py
@@ -28,13 +28,11 @@ class SGDConfig(base_config.Config):

  Attributes:
    name: name of the optimizer.
-    learning_rate: learning_rate for SGD optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: nesterov for SGD optimizer.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
-  learning_rate: float = 0.01
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0
@@ -49,14 +47,12 @@ class RMSPropConfig(base_config.Config):

  Attributes:
    name: name of the optimizer.
-    learning_rate: learning_rate for RMSprop optimizer.
    rho: discounting factor for RMSprop optimizer.
    momentum: momentum for RMSprop optimizer.
    epsilon: epsilon value for RMSprop optimizer, help with numerical stability.
    centered: Whether to normalize gradients or not.
  """
  name: str = "RMSprop"
-  learning_rate: float = 0.001
  rho: float = 0.9
  momentum: float = 0.0
  epsilon: float = 1e-7
@@ -72,7 +68,6 @@ class AdamConfig(base_config.Config):

  Attributes:
    name: name of the optimizer.
-    learning_rate: learning_rate for Adam optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2st order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
@@ -80,7 +75,6 @@ class AdamConfig(base_config.Config):
    the paper "On the Convergence of Adam and beyond".
  """
  name: str = "Adam"
-  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
@@ -93,7 +87,6 @@ class AdamWeightDecayConfig(base_config.Config):

  Attributes:
    name: name of the optimizer.
-    learning_rate: learning_rate for the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2st order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
@@ -106,7 +99,6 @@ class AdamWeightDecayConfig(base_config.Config):
                             include in weight decay.
  """
  name: str = "AdamWeightDecay"
-  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
@@ -125,7 +117,6 @@ class LAMBConfig(base_config.Config):

  Attributes:
    name: name of the optimizer.
-    learning_rate: learning_rate for Adam optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2st order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
@@ -139,7 +130,6 @@ class LAMBConfig(base_config.Config):
                                   be excluded.
  """
  name: str = "LAMB"
-  learning_rate: float = 0.001
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6

--- a/official/modeling/optimization/optimizer_factory.py
+++ b/official/modeling/optimization/optimizer_factory.py
@@ -60,7 +60,7 @@ class OptimizerFactory(object):
  params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
@@ -88,12 +88,15 @@ class OptimizerFactory(object):
    self._optimizer_config = config.optimizer.get()
    self._optimizer_type = config.optimizer.type

-    if self._optimizer_config is None:
+    if self._optimizer_type is None:
      raise ValueError('Optimizer type must be specified')

    self._lr_config = config.learning_rate.get()
    self._lr_type = config.learning_rate.type

+    if self._lr_type is None:
+      raise ValueError('Learning rate type must be specified')
+
    self._warmup_config = config.warmup.get()
    self._warmup_type = config.warmup.type

@@ -101,18 +104,15 @@ class OptimizerFactory(object):
    """Build learning rate.

    Builds learning rate from config. Learning rate schedule is built according
-    to the learning rate config. If there is no learning rate config, optimizer
-    learning rate is returned.
+    to the learning rate config. If learning rate type is constant,
+    lr_config.learning_rate is returned.

    Returns:
-      tf.keras.optimizers.schedules.LearningRateSchedule instance. If no
-      learning rate schedule defined, optimizer_config.learning_rate is
-      returned.
+      tf.keras.optimizers.schedules.LearningRateSchedule instance. If
+      learning rate type is constant, lr_config.learning_rate is returned.
    """
-
-    # TODO(arashwan): Explore if we want to only allow explicit const lr sched.
-    if not self._lr_config:
-      lr = self._optimizer_config.learning_rate
+    if self._lr_type == 'constant':
+      lr = self._lr_config.learning_rate
    else:
      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())


--- a/official/modeling/optimization/optimizer_factory_test.py
+++ b/official/modeling/optimization/optimizer_factory_test.py
@@ -35,10 +35,17 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': optimizer_type
+        },
+        'learning_rate': {
+            'type': 'constant',
+            'constant': {
+                'learning_rate': 0.1
+            }
        }
    }
    optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type]
    expected_optimizer_config = optimizer_cls().get_config()
+    expected_optimizer_config['learning_rate'] = 0.1

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
@@ -48,11 +55,32 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    self.assertIsInstance(optimizer, optimizer_cls)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

+  def test_missing_types(self):
+    params = {
+        'optimizer': {
+            'type': 'sgd',
+            'sgd': {'momentum': 0.9}
+        }
+    }
+    with self.assertRaises(ValueError):
+      optimizer_factory.OptimizerFactory(
+          optimization_config.OptimizationConfig(params))
+    params = {
+        'learning_rate': {
+            'type': 'stepwise',
+            'stepwise': {'boundaries': [10000, 20000],
+                         'values': [0.1, 0.01, 0.001]}
+        }
+    }
+    with self.assertRaises(ValueError):
+      optimizer_factory.OptimizerFactory(
+          optimization_config.OptimizationConfig(params))
+
  def test_stepwise_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
@@ -79,7 +107,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
@@ -112,7 +140,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'exponential',
@@ -142,7 +170,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'polynomial',
@@ -166,7 +194,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'cosine',
@@ -192,7 +220,13 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
+        },
+        'learning_rate': {
+            'type': 'constant',
+            'constant': {
+                'learning_rate': 0.1
+            }
        },
        'warmup': {
            'type': 'linear',
@@ -216,7 +250,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    params = {
        'optimizer': {
            'type': 'sgd',
-            'sgd': {'learning_rate': 0.1, 'momentum': 0.9}
+            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',

--- a/official/modeling/tf_utils.py
+++ b/official/modeling/tf_utils.py
@@ -88,7 +88,6 @@ def is_special_none_tensor(tensor):
  return tensor.shape.ndims == 0 and tensor.dtype == tf.int32


-# TODO(hongkuny): consider moving custom string-map lookup to keras api.
 def get_activation(identifier):
  """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`.


--- a/official/nlp/albert/run_classifier.py
+++ b/official/nlp/albert/run_classifier.py
@@ -14,23 +14,61 @@
 # ==============================================================================
 """ALBERT classification finetuning runner in tf2.x."""

+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import json
-
+import os
 from absl import app
 from absl import flags
+from absl import logging
 import tensorflow as tf

 from official.nlp.albert import configs as albert_configs
+from official.nlp.bert import bert_models
 from official.nlp.bert import run_classifier as run_classifier_bert
 from official.utils.misc import distribution_utils

+
 FLAGS = flags.FLAGS


+def predict(strategy, albert_config, input_meta_data, predict_input_fn):
+  """Writes predictions and ground truth labels to .tsv files.
+
+  In `FLAGS.model_dir`, `test_results.tsv` gets one tab-separated row of class
+  probabilities per example and `output_labels.tsv` gets one label per row.
+  """
+  with strategy.scope():
+    classifier_model = bert_models.classifier_model(
+        albert_config, input_meta_data['num_labels'])[0]
+    checkpoint = tf.train.Checkpoint(model=classifier_model)
+    latest_checkpoint_file = (
+        FLAGS.predict_checkpoint_path or
+        tf.train.latest_checkpoint(FLAGS.model_dir))
+    assert latest_checkpoint_file
+    logging.info('Checkpoint file %s found and restoring from '
+                 'checkpoint', latest_checkpoint_file)
+    checkpoint.restore(
+        latest_checkpoint_file).assert_existing_objects_matched()
+    preds, ground_truth = run_classifier_bert.get_predictions_and_labels(
+        strategy, classifier_model, predict_input_fn, return_probs=True)
+    output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
+    with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
+      logging.info('***** Predict results *****')
+      for probabilities in preds:
+        writer.write('\t'.join(str(p) for p in probabilities) + '\n')
+    ground_truth_labels_file = os.path.join(FLAGS.model_dir,
+                                            'output_labels.tsv')
+    with tf.io.gfile.GFile(ground_truth_labels_file, 'w') as writer:
+      logging.info('***** Ground truth results *****')
+      for label in ground_truth:
+        # Write the label whole; joining str(label) splits it per character.
+        writer.write(str(label) + '\n')
+
+
 def main(_):
  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))
@@ -56,9 +94,14 @@ def main(_):

  albert_config = albert_configs.AlbertConfig.from_json_file(
      FLAGS.bert_config_file)
-  run_classifier_bert.run_bert(strategy, input_meta_data, albert_config,
-                               train_input_fn, eval_input_fn)
-
+  if FLAGS.mode == 'train_and_eval':
+    run_classifier_bert.run_bert(strategy, input_meta_data, albert_config,
+                                 train_input_fn, eval_input_fn)
+  elif FLAGS.mode == 'predict':
+    predict(strategy, albert_config, input_meta_data, eval_input_fn)
+  else:
+    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
+  return

 if __name__ == '__main__':
  flags.mark_flag_as_required('bert_config_file')

--- a/official/nlp/bert/export_tfhub.py
+++ b/official/nlp/bert/export_tfhub.py
@@ -79,7 +79,7 @@ def export_bert_tfhub(bert_config: configs.BertConfig,
                 do_lower_case, vocab_file)
  core_model, encoder = create_bert_model(bert_config)
  checkpoint = tf.train.Checkpoint(model=encoder)
-  checkpoint.restore(model_checkpoint_path).assert_consumed()
+  checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
  core_model.vocab_file = tf.saved_model.Asset(vocab_file)
  core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
  core_model.save(hub_destination, include_optimizer=False, save_format="tf")

--- a/official/nlp/bert/model_training_utils.py
+++ b/official/nlp/bert/model_training_utils.py
@@ -99,7 +99,9 @@ def write_txt_summary(training_summary, summary_dir):


 @deprecation.deprecated(
-    None, 'This function is deprecated. Please use Keras compile/fit instead.')
+    None, 'This function is deprecated and we do not expect adding new '
+    'functionalities. Please do not have your code depending '
+    'on this library.')
 def run_customized_training_loop(
    # pylint: disable=invalid-name
    _sentinel=None,
@@ -557,7 +559,6 @@ def run_customized_training_loop(
    for metric in model.metrics:
      training_summary[metric.name] = _float_metric_value(metric)
    if eval_metrics:
-      # TODO(hongkuny): Cleans up summary reporting in text.
      training_summary['last_train_metrics'] = _float_metric_value(
          train_metrics[0])
      training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0])

--- a/official/nlp/configs/bert.py
+++ b/official/nlp/configs/bert.py
@@ -24,7 +24,6 @@ import tensorflow as tf

 from official.modeling import tf_utils
 from official.modeling.hyperparams import base_config
-from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import encoders
 from official.nlp.modeling import layers
 from official.nlp.modeling.models import bert_pretrainer
@@ -43,7 +42,6 @@ class ClsHeadConfig(base_config.Config):
 @dataclasses.dataclass
 class BertPretrainerConfig(base_config.Config):
  """BERT encoder configuration."""
-  num_masked_tokens: int = 76
  encoder: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
  cls_heads: List[ClsHeadConfig] = dataclasses.field(default_factory=list)
@@ -56,103 +54,18 @@ def instantiate_classification_heads_from_cfgs(
    ] if cls_head_configs else []


-def instantiate_bertpretrainer_from_cfg(
+def instantiate_pretrainer_from_cfg(
    config: BertPretrainerConfig,
    encoder_network: Optional[tf.keras.Model] = None
-    ) -> bert_pretrainer.BertPretrainerV2:
+) -> bert_pretrainer.BertPretrainerV2:
  """Instantiates a BertPretrainer from the config."""
  encoder_cfg = config.encoder
  if encoder_network is None:
    encoder_network = encoders.instantiate_encoder_from_cfg(encoder_cfg)
  return bert_pretrainer.BertPretrainerV2(
-      config.num_masked_tokens,
      mlm_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg.initializer_range),
      encoder_network=encoder_network,
      classification_heads=instantiate_classification_heads_from_cfgs(
          config.cls_heads))
-
-
-@dataclasses.dataclass
-class BertPretrainDataConfig(cfg.DataConfig):
-  """Data config for BERT pretraining task (tasks/masked_lm)."""
-  input_path: str = ""
-  global_batch_size: int = 512
-  is_training: bool = True
-  seq_length: int = 512
-  max_predictions_per_seq: int = 76
-  use_next_sentence_label: bool = True
-  use_position_id: bool = False
-
-
-@dataclasses.dataclass
-class BertPretrainEvalDataConfig(BertPretrainDataConfig):
-  """Data config for the eval set in BERT pretraining task (tasks/masked_lm)."""
-  input_path: str = ""
-  global_batch_size: int = 512
-  is_training: bool = False
-
-
-@dataclasses.dataclass
-class SentencePredictionDataConfig(cfg.DataConfig):
-  """Data config for sentence prediction task (tasks/sentence_prediction)."""
-  input_path: str = ""
-  global_batch_size: int = 32
-  is_training: bool = True
-  seq_length: int = 128
-
-
-@dataclasses.dataclass
-class SentencePredictionDevDataConfig(cfg.DataConfig):
-  """Dev Data config for sentence prediction (tasks/sentence_prediction)."""
-  input_path: str = ""
-  global_batch_size: int = 32
-  is_training: bool = False
-  seq_length: int = 128
-  drop_remainder: bool = False
-
-
-@dataclasses.dataclass
-class QADataConfig(cfg.DataConfig):
-  """Data config for question answering task (tasks/question_answering)."""
-  input_path: str = ""
-  global_batch_size: int = 48
-  is_training: bool = True
-  seq_length: int = 384
-
-
-@dataclasses.dataclass
-class QADevDataConfig(cfg.DataConfig):
-  """Dev Data config for queston answering (tasks/question_answering)."""
-  input_path: str = ""
-  input_preprocessed_data_path: str = ""
-  version_2_with_negative: bool = False
-  doc_stride: int = 128
-  global_batch_size: int = 48
-  is_training: bool = False
-  seq_length: int = 384
-  query_length: int = 64
-  drop_remainder: bool = False
-  vocab_file: str = ""
-  tokenization: str = "WordPiece"  # WordPiece or SentencePiece
-  do_lower_case: bool = True
-
-
-@dataclasses.dataclass
-class TaggingDataConfig(cfg.DataConfig):
-  """Data config for tagging (tasks/tagging)."""
-  input_path: str = ""
-  global_batch_size: int = 48
-  is_training: bool = True
-  seq_length: int = 384
-
-
-@dataclasses.dataclass
-class TaggingDevDataConfig(cfg.DataConfig):
-  """Dev Data config for tagging (tasks/tagging)."""
-  input_path: str = ""
-  global_batch_size: int = 48
-  is_training: bool = False
-  seq_length: int = 384
-  drop_remainder: bool = False
--- a/official/nlp/configs/bert_test.py
+++ b/official/nlp/configs/bert_test.py
@@ -26,7 +26,7 @@ class BertModelsTest(tf.test.TestCase):
  def test_network_invocation(self):
    config = bert.BertPretrainerConfig(
        encoder=encoders.TransformerEncoderConfig(vocab_size=10, num_layers=1))
-    _ = bert.instantiate_bertpretrainer_from_cfg(config)
+    _ = bert.instantiate_pretrainer_from_cfg(config)

    # Invokes with classification heads.
    config = bert.BertPretrainerConfig(
@@ -35,7 +35,7 @@ class BertModelsTest(tf.test.TestCase):
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=2, name="next_sentence")
        ])
-    _ = bert.instantiate_bertpretrainer_from_cfg(config)
+    _ = bert.instantiate_pretrainer_from_cfg(config)

    with self.assertRaises(ValueError):
      config = bert.BertPretrainerConfig(
@@ -47,7 +47,7 @@ class BertModelsTest(tf.test.TestCase):
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ])
-      _ = bert.instantiate_bertpretrainer_from_cfg(config)
+      _ = bert.instantiate_pretrainer_from_cfg(config)

  def test_checkpoint_items(self):
    config = bert.BertPretrainerConfig(
@@ -56,9 +56,10 @@ class BertModelsTest(tf.test.TestCase):
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=2, name="next_sentence")
        ])
-    encoder = bert.instantiate_bertpretrainer_from_cfg(config)
-    self.assertSameElements(encoder.checkpoint_items.keys(),
-                            ["encoder", "next_sentence.pooler_dense"])
+    encoder = bert.instantiate_pretrainer_from_cfg(config)
+    self.assertSameElements(
+        encoder.checkpoint_items.keys(),
+        ["encoder", "masked_lm", "next_sentence.pooler_dense"])


 if __name__ == "__main__":