Commit 2e9bb539 authored by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE

parents 7bae5317 8fba84f8
......@@ -19,7 +19,7 @@ In the near future, we will add:
* State-of-the-art language understanding models.
* State-of-the-art image classification models.
* State-of-the-art objection detection and instance segmentation models.
* State-of-the-art object detection and instance segmentation models.
## Table of Contents
......
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Customizing a Transformer Encoder",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Bp8t2AI8i7uP"
},
"source": [
......@@ -12,14 +26,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "rxPj2Lsni9O4"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
......@@ -32,12 +42,13 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6xS-9i5DrRvO"
},
"source": [
......@@ -47,30 +58,28 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Mwb9uw1cDXsa"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
" </td>\n",
" <td>\n",
" <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
" </td>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iLrcV4IyrcGX"
},
"source": [
......@@ -84,7 +93,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YYxdyoWgsl8t"
},
"source": [
......@@ -94,7 +102,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fEJSFutUsn_h"
},
"source": [
......@@ -107,21 +114,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "thsKZDjhswhR"
},
"outputs": [],
"source": [
"!pip install -q tf-models-official==2.3.0"
]
"!pip install -q tf-models-official==2.4.0"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hpf7JPCVsqtv"
},
"source": [
......@@ -130,13 +134,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "my4dp-RMssQe"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
......@@ -144,12 +144,13 @@
"from official.modeling import activations\n",
"from official.nlp import modeling\n",
"from official.nlp.modeling import layers, losses, models, networks"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vjDmVsFfs85n"
},
"source": [
......@@ -160,13 +161,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Oav8sbgstWc-"
},
"outputs": [],
"source": [
"cfg = {\n",
" \"vocab_size\": 100,\n",
......@@ -177,22 +174,23 @@
" \"activation\": activations.gelu,\n",
" \"dropout_rate\": 0.1,\n",
" \"attention_dropout_rate\": 0.1,\n",
" \"sequence_length\": 16,\n",
" \"max_sequence_length\": 16,\n",
" \"type_vocab_size\": 2,\n",
" \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
"}\n",
"bert_encoder = modeling.networks.TransformerEncoder(**cfg)\n",
"bert_encoder = modeling.networks.BertEncoder(**cfg)\n",
"\n",
"def build_classifier(bert_encoder):\n",
" return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
"\n",
"canonical_classifier_model = build_classifier(bert_encoder)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Qe2UWI6_tsHo"
},
"source": [
......@@ -203,31 +201,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "csED2d-Yt5h6"
},
"outputs": [],
"source": [
"def predict(model):\n",
" batch_size = 3\n",
" np.random.seed(0)\n",
" word_ids = np.random.randint(\n",
" cfg[\"vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
" mask = np.random.randint(2, size=(batch_size, cfg[\"sequence_length\"]))\n",
" cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" type_ids = np.random.randint(\n",
" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
" print(model([word_ids, mask, type_ids], training=False))\n",
"\n",
"predict(canonical_classifier_model)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PzKStEK9t_Pb"
},
"source": [
......@@ -239,7 +234,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rmwQfhj6fmKz"
},
"source": [
......@@ -250,7 +244,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xsMgEVHAui11"
},
"source": [
......@@ -263,26 +256,21 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "-JBabpa2AOz8"
},
"source": [
"#### Without Customization\n",
"\n",
"Without any customization, `EncoderScaffold` behaves the same the canonical `TransformerEncoder`.\n",
"Without any customization, `EncoderScaffold` behaves the same the canonical `BertEncoder`.\n",
"\n",
"As shown in the following example, `EncoderScaffold` can load `TransformerEncoder`'s weights and output the same values:"
"As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ktNzKuVByZQf"
},
"outputs": [],
"source": [
"default_hidden_cfg = dict(\n",
" num_attention_heads=cfg[\"num_attention_heads\"],\n",
......@@ -296,10 +284,9 @@
" vocab_size=cfg[\"vocab_size\"],\n",
" type_vocab_size=cfg[\"type_vocab_size\"],\n",
" hidden_size=cfg[\"hidden_size\"],\n",
" seq_length=cfg[\"sequence_length\"],\n",
" initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
" dropout_rate=cfg[\"dropout_rate\"],\n",
" max_seq_length=cfg[\"sequence_length\"],\n",
" max_seq_length=cfg[\"max_sequence_length\"]\n",
")\n",
"default_kwargs = dict(\n",
" hidden_cfg=default_hidden_cfg,\n",
......@@ -309,17 +296,19 @@
" return_all_layer_outputs=True,\n",
" pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
")\n",
"\n",
"encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
"classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
"classifier_model_from_encoder_scaffold.set_weights(\n",
" canonical_classifier_model.get_weights())\n",
"predict(classifier_model_from_encoder_scaffold)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sMaUmLyIuwcs"
},
"source": [
......@@ -332,18 +321,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LTinnaG6vcsw"
},
"outputs": [],
"source": [
"word_ids = tf.keras.layers.Input(\n",
" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
"mask = tf.keras.layers.Input(\n",
" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
"embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
" vocab_size=cfg['vocab_size'],\n",
" embedding_width=cfg['hidden_size'],\n",
......@@ -353,12 +338,13 @@
"attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
"new_embedding_network = tf.keras.Model([word_ids, mask],\n",
" [word_embeddings, attention_mask])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HN7_yu-6O3qI"
},
"source": [
......@@ -368,21 +354,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "fO9zKFE4OpHp"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9cOaGQHLv12W"
},
"source": [
......@@ -391,13 +374,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "mtFDMNf2vIl9"
},
"outputs": [],
"source": [
"kwargs = dict(default_kwargs)\n",
"\n",
......@@ -412,12 +391,13 @@
"\n",
"# Assert that there are only two inputs.\n",
"assert len(classifier_model.inputs) == 2"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Z73ZQDtmwg9K"
},
"source": [
......@@ -432,13 +412,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "uAIarLZgw6pA"
},
"outputs": [],
"source": [
"kwargs = dict(default_kwargs)\n",
"\n",
......@@ -452,12 +428,13 @@
"\n",
"# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
"assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6PMHFdvnxvR0"
},
"source": [
......@@ -470,7 +447,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "D6FejlgwyAy_"
},
"source": [
......@@ -485,13 +461,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nFrSMrZuyNeQ"
},
"outputs": [],
"source": [
"# Use TalkingHeadsAttention\n",
"hidden_cfg = dict(default_hidden_cfg)\n",
......@@ -508,12 +480,13 @@
"\n",
"# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
"assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "kuEJcTyByVvI"
},
"source": [
......@@ -528,13 +501,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XAbKy_l4y_-i"
},
"outputs": [],
"source": [
"# Use TalkingHeadsAttention\n",
"hidden_cfg = dict(default_hidden_cfg)\n",
......@@ -551,12 +520,13 @@
"\n",
"# Assert that the variable `gate` from GatedFeedforward exists.\n",
"assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "a_8NWUhkzeAq"
},
"source": [
......@@ -564,29 +534,26 @@
"\n",
"Finally, you could also build a new encoder using building blocks in the modeling library.\n",
"\n",
"See [AlbertTransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_transformer_encoder.py) as an example:\n"
"See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xsiA3RzUzmUM"
},
"outputs": [],
"source": [
"albert_encoder = modeling.networks.AlbertTransformerEncoder(**cfg)\n",
"albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n",
"classifier_model = build_classifier(albert_encoder)\n",
"# ... Train the model ...\n",
"predict(classifier_model)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MeidDfhlHKSO"
},
"source": [
......@@ -595,31 +562,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Uv_juT22HERW"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Customizing a Transformer Encoder",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
],
"execution_count": null,
"outputs": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}
]
}
\ No newline at end of file
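The thrust of this notebook's diff is the 2.4.0 rename: `TransformerEncoder` becomes `BertEncoder`, `sequence_length` becomes `max_sequence_length`, and `AlbertTransformerEncoder` becomes `AlbertEncoder`. A minimal sketch of the renamed construction path (the hyperparameter values are illustrative, far smaller than any real BERT):

```python
import tensorflow as tf
from official.nlp import modeling

# Post-rename API (tf-models-official==2.4.0): note max_sequence_length,
# not the old sequence_length argument.
encoder = modeling.networks.BertEncoder(
    vocab_size=100, num_layers=2, max_sequence_length=16, type_vocab_size=2)
classifier = modeling.models.BertClassifier(encoder, num_classes=2)

# Dummy [batch, seq_len] inputs, mirroring the notebook's predict() helper.
word_ids = tf.ones((3, 16), dtype=tf.int32)
mask = tf.ones((3, 16), dtype=tf.int32)
type_ids = tf.zeros((3, 16), dtype=tf.int32)
print(classifier([word_ids, mask, type_ids], training=False))
```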
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Introduction to the TensorFlow Models NLP library",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "80xnUmoI7fBX"
},
"source": [
......@@ -12,14 +26,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "8nvTnfs6Q692"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
......@@ -32,12 +42,13 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WmfcMK5P5C1G"
},
"source": [
......@@ -47,30 +58,28 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "cH-oJ8R6AHMK"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
" </td>\n",
" <td>\n",
" <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
" </td>\n",
" <td>\n",
" <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
" </td>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0H_EFIhq4-MJ"
},
"source": [
......@@ -82,7 +91,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2N97-dps_nUk"
},
"source": [
......@@ -92,7 +100,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "459ygAVl_rg0"
},
"source": [
......@@ -105,21 +112,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Y-qGkdh6_sZc"
},
"outputs": [],
"source": [
"!pip install -q tf-models-official==2.3.0"
]
"!pip install -q tf-models-official==2.4.0"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "e4huSSwyAG_5"
},
"source": [
......@@ -128,25 +132,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jqYXqtjBAJd9"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"from official.nlp import modeling\n",
"from official.nlp.modeling import layers, losses, models, networks"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "djBQWjvy-60Y"
},
"source": [
......@@ -160,38 +161,34 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MKuHVlsCHmiq"
},
"source": [
"### Build a `BertPretrainer` model wrapping `TransformerEncoder`\n",
"### Build a `BertPretrainer` model wrapping `BertEncoder`\n",
"\n",
"The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/transformer_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
"The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
"\n",
"The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EXkcXz-9BwB3"
},
"outputs": [],
"source": [
"# Build a small transformer network.\n",
"vocab_size = 100\n",
"sequence_length = 16\n",
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=16)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0NH5irV5KTMS"
},
"source": [
......@@ -202,37 +199,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lZNoZkBrIoff"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "o7eFOZXiIl-b"
},
"outputs": [],
"source": [
"# Create a BERT pretrainer with the created network.\n",
"num_token_predictions = 8\n",
"bert_pretrainer = modeling.models.BertPretrainer(\n",
" network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "d5h5HT7gNHx_"
},
"source": [
......@@ -241,26 +233,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2tcNfm03IBF7"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "F2oHrXGUIS0M"
},
"outputs": [],
"source": [
"# We can feed some dummy data to get masked language model and sentence output.\n",
"batch_size = 2\n",
......@@ -275,12 +261,13 @@
"sentence_output = outputs[\"classification\"]\n",
"print(lm_output)\n",
"print(sentence_output)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "bnx3UCHniCS5"
},
"source": [
......@@ -290,13 +277,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k30H4Q86f52x"
},
"outputs": [],
"source": [
"masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n",
"masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
......@@ -311,12 +294,13 @@
" predictions=sentence_output)\n",
"loss = mlm_loss + sentence_loss\n",
"print(loss)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wrmSs8GjHxVw"
},
"source": [
......@@ -328,7 +312,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "k8cQVFvBCV4s"
},
"source": [
......@@ -342,38 +325,34 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xrLLEWpfknUW"
},
"source": [
"### Build a BertSpanLabeler wrapping TransformerEncoder\n",
"### Build a BertSpanLabeler wrapping BertEncoder\n",
"\n",
"[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
"\n",
"Note that `BertSpanLabeler` wraps a `TransformerEncoder`, the weights of which can be restored from the above pretraining model.\n"
"Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "B941M4iUCejO"
},
"outputs": [],
"source": [
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n",
"# Create a BERT trainer with the created network.\n",
"bert_span_labeler = modeling.models.BertSpanLabeler(network)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QpB9pgj4PpMg"
},
"source": [
......@@ -382,26 +361,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RbqRNJCLJu4H"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "fUf1vRxZJwio"
},
"outputs": [],
"source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
......@@ -412,12 +385,13 @@
"start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n",
"print(start_logits)\n",
"print(end_logits)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WqhgQaN1lt-G"
},
"source": [
......@@ -427,13 +401,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "waqs6azNl3Nn"
},
"outputs": [],
"source": [
"start_positions = np.random.randint(sequence_length, size=(batch_size))\n",
"end_positions = np.random.randint(sequence_length, size=(batch_size))\n",
......@@ -445,12 +415,13 @@
"\n",
"total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n",
"print(total_loss)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zdf03YtZmd_d"
},
"source": [
......@@ -460,7 +431,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0A1XnGSTChg9"
},
"source": [
......@@ -472,38 +442,34 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MSK8OpZgnQa9"
},
"source": [
"### Build a BertClassifier model wrapping TransformerEncoder\n",
"### Build a BertClassifier model wrapping BertEncoder\n",
"\n",
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "cXXCsffkCphk"
},
"outputs": [],
"source": [
"network = modeling.networks.TransformerEncoder(\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n",
"# Create a BERT trainer with the created network.\n",
"num_classes = 2\n",
"bert_classifier = modeling.models.BertClassifier(\n",
" network, num_classes=num_classes)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "8tZKueKYP4bB"
},
"source": [
......@@ -512,26 +478,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "snlutm9ZJgEZ"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yyHPHsqBJkCz"
},
"outputs": [],
"source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
......@@ -541,12 +501,13 @@
"# Feed the data to the model.\n",
"logits = bert_classifier([word_id_data, mask_data, type_id_data])\n",
"print(logits)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "w--a2mg4nzKm"
},
"source": [
......@@ -557,45 +518,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9X0S1DoFn_5Q"
},
"outputs": [],
"source": [
"labels = np.random.randint(num_classes, size=(batch_size))\n",
"\n",
"loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
" labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
"print(loss)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mzBqOylZo3og"
},
"source": [
"With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Introduction to the TensorFlow Models NLP library",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
]
}
\ No newline at end of file
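The intro notebook ends after computing `loss` for the classifier; for completeness, here is a hedged sketch of one optimization step built on the objects from the cells above (the optimizer and learning rate are assumptions, not part of the notebook):

```python
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)  # assumed choice

with tf.GradientTape() as tape:
  logits = bert_classifier([word_id_data, mask_data, type_id_data],
                           training=True)
  loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(
      labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))

grads = tape.gradient(loss, bert_classifier.trainable_variables)
optimizer.apply_gradients(zip(grads, bert_classifier.trainable_variables))
```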
......@@ -127,6 +127,15 @@ def get_distribution_strategy(distribution_strategy="mirrored",
if num_gpus < 0:
raise ValueError("`num_gpus` can not be negative.")
if not isinstance(distribution_strategy, str):
msg = ("distribution_strategy must be a string but got: %s." %
(distribution_strategy,))
if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison
msg += (" If you meant to pass the string 'off', make sure you add "
"quotes around 'off' so that yaml interprets it as a string "
"instead of a bool.")
raise ValueError(msg)
distribution_strategy = distribution_strategy.lower()
if distribution_strategy == "off":
if num_gpus > 1:
......
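For context on why the added message singles out `False`: YAML 1.1 (as implemented by PyYAML) resolves a bare `off` to a boolean, so an unquoted `off` in a config file reaches this function as `False` rather than the string `'off'`:

```python
import yaml

yaml.safe_load("distribution_strategy: off")    # {'distribution_strategy': False}
yaml.safe_load("distribution_strategy: 'off'")  # {'distribution_strategy': 'off'}
```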
......@@ -41,6 +41,19 @@ class GetDistributionStrategyTest(tf.test.TestCase):
for device in ds.extended.worker_devices:
self.assertIn('GPU', device)
def test_no_strategy(self):
ds = distribute_utils.get_distribution_strategy('off')
self.assertIsNone(ds)
def test_invalid_strategy(self):
with self.assertRaisesRegexp(
ValueError,
'distribution_strategy must be a string but got: False. If'):
distribute_utils.get_distribution_strategy(False)
with self.assertRaisesRegexp(
ValueError, 'distribution_strategy must be a string but got: 1'):
distribute_utils.get_distribution_strategy(1)
if __name__ == '__main__':
tf.test.main()
......@@ -14,7 +14,7 @@
"""A common dataset reader."""
import random
from typing import Any, Callable, Optional
from typing import Any, Callable, List, Optional
from absl import logging
import tensorflow as tf
......@@ -27,6 +27,13 @@ def _get_random_integer():
return random.randint(0, (1 << 31) - 1)
def _maybe_map_fn(dataset: tf.data.Dataset,
fn: Optional[Callable[..., Any]] = None) -> tf.data.Dataset:
"""Calls dataset.map if a valid function is passed in."""
return dataset if fn is None else dataset.map(
fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
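# Illustrative behavior of the new helper: with fn=None the dataset passes
# through unchanged; otherwise fn is mapped with AUTOTUNE parallelism, e.g.
#   ds = tf.data.Dataset.range(3)
#   list(_maybe_map_fn(ds, lambda x: x * 2).as_numpy_iterator())  # [0, 2, 4]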
class InputReader:
"""Input reader that returns a tf.data.Dataset instance."""
......@@ -74,38 +81,7 @@ class InputReader:
self._tfds_builder = None
self._matched_files = []
if params.input_path:
# Read dataset from files.
usage = ('`input_path` should be one of: (1) a str indicating a file '
'path/pattern, (2) a str indicating multiple file '
'paths/patterns separated by commas (e.g. "a, b, c" or, without '
'spaces, "a,b,c"), or (3) a list of str, each of which is a file '
'path/pattern or multiple file paths/patterns separated by '
'commas, but got: %s')
if isinstance(params.input_path, str):
input_path_list = [params.input_path]
elif isinstance(params.input_path, (list, tuple)):
if any(not isinstance(x, str) for x in params.input_path):
raise ValueError(usage % params.input_path)
input_path_list = params.input_path
else:
raise ValueError(usage % params.input_path)
for input_path in input_path_list:
input_patterns = input_path.strip().split(',')
for input_pattern in input_patterns:
input_pattern = input_pattern.strip()
if not input_pattern:
continue
if '*' in input_pattern or '?' in input_pattern:
tmp_matched_files = tf.io.gfile.glob(input_pattern)
if not tmp_matched_files:
raise ValueError('%s does not match any files.' % input_pattern)
self._matched_files.extend(tmp_matched_files)
else:
self._matched_files.append(input_pattern)
if not self._matched_files:
raise ValueError('%s does not match any files.' % params.input_path)
self._matched_files = self._match_files(params.input_path)
else:
# Read dataset from TFDS.
if not params.tfds_split:
......@@ -135,7 +111,10 @@ class InputReader:
self._parser_fn = parser_fn
self._transform_and_batch_fn = transform_and_batch_fn
self._postprocess_fn = postprocess_fn
self._seed = _get_random_integer()
# When tf.data service is enabled, each data service worker should get
# different random seeds. Thus, we set `seed` to None.
self._seed = (None
if params.enable_tf_data_service else _get_random_integer())
self._enable_tf_data_service = (
params.enable_tf_data_service and params.tf_data_service_address)
......@@ -148,15 +127,57 @@ class InputReader:
self._enable_round_robin_tf_data_service = params.get(
'enable_round_robin_tf_data_service', False)
def _match_files(self, input_path: str) -> List[str]:
"""Matches files from an input_path."""
matched_files = []
# Read dataset from files.
usage = ('`input_path` should be one of: (1) a str indicating a file '
'path/pattern, (2) a str indicating multiple file '
'paths/patterns separated by commas (e.g. "a, b, c" or, without '
'spaces, "a,b,c"), or (3) a list of str, each of which is a file '
'path/pattern or multiple file paths/patterns separated by '
'commas, but got: %s')
if isinstance(input_path, str):
input_path_list = [input_path]
elif isinstance(input_path, (list, tuple)):
if any(not isinstance(x, str) for x in input_path):
raise ValueError(usage % input_path)
input_path_list = input_path
else:
raise ValueError(usage % input_path)
for input_path in input_path_list:
input_patterns = input_path.strip().split(',')
for input_pattern in input_patterns:
input_pattern = input_pattern.strip()
if not input_pattern:
continue
if '*' in input_pattern or '?' in input_pattern:
tmp_matched_files = tf.io.gfile.glob(input_pattern)
if not tmp_matched_files:
raise ValueError('%s does not match any files.' % input_pattern)
matched_files.extend(tmp_matched_files)
else:
matched_files.append(input_pattern)
if not matched_files:
raise ValueError('%s does not match any files.' % input_path)
return matched_files
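# Illustrative inputs (file names hypothetical), all accepted per `usage`:
#   self._match_files('data/train-*.tfrecord')     # (1) glob pattern
#   self._match_files('a.tfrecord,b.tfrecord')     # (2) comma-separated str
#   self._match_files(['a.tfrecord', 'dir/b-*'])   # (3) list of either form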
def _shard_files_then_read(
self, input_context: Optional[tf.distribute.InputContext] = None):
self,
matched_files: List[str],
dataset_fn,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Shards the data files and then sent a split to every worker to read."""
dataset = tf.data.Dataset.from_tensor_slices(self._matched_files)
dataset = tf.data.Dataset.from_tensor_slices(matched_files)
# Shuffle and repeat at file level.
if self._is_training:
dataset = dataset.shuffle(
len(self._matched_files),
len(matched_files),
seed=self._seed,
reshuffle_each_iteration=True)
......@@ -171,7 +192,7 @@ class InputReader:
dataset = dataset.repeat()
dataset = dataset.interleave(
map_func=self._dataset_fn,
map_func=dataset_fn,
cycle_length=self._cycle_length,
block_length=self._block_length,
num_parallel_calls=(self._cycle_length if self._cycle_length else
......@@ -180,9 +201,13 @@ class InputReader:
return dataset
def _read_files_then_shard(
self, input_context: Optional[tf.distribute.InputContext] = None):
self,
matched_files: List[str],
dataset_fn,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Sends all data files to every worker and then shard by data."""
dataset = self._dataset_fn(self._matched_files)
dataset = dataset_fn(matched_files)
# When `input_file` is a path to a single file or the number of files is
# less than the number of input pipelines, disable auto sharding
......@@ -238,26 +263,35 @@ class InputReader:
raise ValueError('tfds_info is not available, because the dataset '
'is not loaded from tfds.')
def read(
def _read_decode_and_parse_dataset(
self,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Generates a tf.data.Dataset object."""
if self._tfds_builder:
matched_files: List[str],
dataset_fn,
batch_size: int,
input_context: Optional[tf.distribute.InputContext] = None,
tfds_builder: bool = False) -> tf.data.Dataset:
"""Returns a tf.data.Dataset object after reading, decoding, and parsing."""
if tfds_builder:
dataset = self._read_tfds(input_context)
elif len(self._matched_files) > 1:
if input_context and (len(self._matched_files) <
if input_context and (len(matched_files) <
input_context.num_input_pipelines):
logging.warn(
'The number of files %d is less than the number of input pipelines '
'%d. We will send all input files to every worker. '
'Please consider sharding your data into more files.',
len(self._matched_files), input_context.num_input_pipelines)
dataset = self._read_files_then_shard(input_context)
len(matched_files), input_context.num_input_pipelines)
dataset = self._read_files_then_shard(matched_files,
dataset_fn,
input_context)
else:
dataset = self._shard_files_then_read(input_context)
elif len(self._matched_files) == 1:
dataset = self._read_files_then_shard(input_context)
dataset = self._shard_files_then_read(matched_files,
dataset_fn,
input_context)
elif len(matched_files) == 1:
dataset = self._read_files_then_shard(matched_files,
dataset_fn,
input_context)
else:
raise ValueError('It is unexpected that `tfds_builder` is None and '
'there is also no `matched_files`.')
......@@ -268,25 +302,28 @@ class InputReader:
if self._is_training:
dataset = dataset.shuffle(self._shuffle_buffer_size)
def maybe_map_fn(dataset, fn):
return dataset if fn is None else dataset.map(
fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = maybe_map_fn(dataset, self._decoder_fn)
dataset = _maybe_map_fn(dataset, self._decoder_fn)
if self._sample_fn is not None:
dataset = dataset.apply(self._sample_fn)
dataset = maybe_map_fn(dataset, self._parser_fn)
dataset = _maybe_map_fn(dataset, self._parser_fn)
if self._transform_and_batch_fn is not None:
dataset = self._transform_and_batch_fn(dataset, input_context)
else:
per_replica_batch_size = input_context.get_per_replica_batch_size(
self._global_batch_size) if input_context else self._global_batch_size
batch_size) if input_context else batch_size
dataset = dataset.batch(
per_replica_batch_size, drop_remainder=self._drop_remainder)
per_replica_batch_size, drop_remainder=self._drop_remainder
)
dataset = maybe_map_fn(dataset, self._postprocess_fn)
return dataset
def _maybe_apply_data_service(
self,
dataset: tf.data.Dataset,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Potentially distributes a dataset."""
if self._enable_tf_data_service and input_context:
if self._enable_round_robin_tf_data_service:
replicas_per_input_pipeline = input_context.num_replicas_in_sync // (
......@@ -316,6 +353,20 @@ class InputReader:
processing_mode='parallel_epochs',
service=self._tf_data_service_address,
job_name=self._tf_data_service_job_name))
return dataset
def read(
self,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Generates a tf.data.Dataset object."""
dataset = self._read_decode_and_parse_dataset(self._matched_files,
self._dataset_fn,
self._global_batch_size,
input_context,
self._tfds_builder)
dataset = _maybe_map_fn(dataset, self._postprocess_fn)
dataset = self._maybe_apply_data_service(dataset, input_context)
if self._deterministic is not None:
options = tf.data.Options()
......
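With the refactor, `read()` is a thin pipeline over the new helpers: `_match_files` at construction time, then `_read_decode_and_parse_dataset` followed by `_maybe_apply_data_service`. A hedged usage sketch (the `params` fields and the decode/parse callables are assumptions for illustration, not a verified API):

```python
# Hypothetical: params carries input_path, global_batch_size, etc.
reader = InputReader(
    params,
    dataset_fn=tf.data.TFRecordDataset,
    decoder_fn=my_decode_fn,   # per-record deserialization (assumed)
    parser_fn=my_parse_fn)     # per-example feature building (assumed)
dataset = reader.read()  # match files -> decode -> parse -> batch -> service
```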
......@@ -27,26 +27,7 @@ from official.core import config_definitions
from official.core import train_utils
BestCheckpointExporter = train_utils.BestCheckpointExporter
def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig,
data_dir: str) -> Any:
"""Maybe create a BestCheckpointExporter object, according to the config."""
export_subdir = params.trainer.best_checkpoint_export_subdir
metric_name = params.trainer.best_checkpoint_eval_metric
metric_comp = params.trainer.best_checkpoint_metric_comp
if data_dir and export_subdir and metric_name:
best_ckpt_dir = os.path.join(data_dir, export_subdir)
best_ckpt_exporter = BestCheckpointExporter(
best_ckpt_dir, metric_name, metric_comp)
logging.info(
'Created the best checkpoint exporter. '
'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir,
export_subdir, metric_name)
else:
best_ckpt_exporter = None
return best_ckpt_exporter
maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter
def run_experiment(distribution_strategy: tf.distribute.Strategy,
......@@ -83,7 +64,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
task,
train='train' in mode,
evaluate=('eval' in mode) or run_post_eval,
checkpoint_exporter=maybe_create_best_ckpt_exporter(params, model_dir))
checkpoint_exporter=maybe_create_best_ckpt_exporter(
params, model_dir))
if trainer.checkpoint:
checkpoint_manager = tf.train.CheckpointManager(
......
......@@ -17,7 +17,7 @@ import copy
import json
import os
import pprint
from typing import List, Optional
from typing import Any, Callable, Dict, List, Optional
from absl import logging
import dataclasses
......@@ -32,6 +32,75 @@ from official.core import exp_factory
from official.modeling import hyperparams
def get_leaf_nested_dict(
d: Dict[str, Any], keys: List[str]) -> Any:
"""Gets a leaf value from an arbitrarily nested dictionary with a list of keys.
Args:
d: The dictionary to extract the value from.
keys: The list of keys to traverse recursively.
Returns:
The value of the leaf.
Raises:
KeyError: If the path does not exist, or the extracted value is itself a
dictionary rather than a leaf.
"""
leaf = d
for k in keys:
if not isinstance(leaf, dict) or k not in leaf:
raise KeyError(
'Path does not exist while traversing the dictionary with keys'
': %s.' % keys)
leaf = leaf[k]
if isinstance(leaf, dict):
raise KeyError('The value extracted with keys: %s is not a leaf of the '
'dictionary: %s.' % (keys, d))
return leaf
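# Example: get_leaf_nested_dict({'eval': {'acc': 0.9}}, ['eval', 'acc']) -> 0.9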
def cast_leaf_nested_dict(
d: Dict[str, Any],
cast_fn: Callable[[Any], Any]) -> Dict[str, Any]:
"""Cast the leaves of a dictionary with arbitrary depth in place.
Args:
d: The dictionary to extract value from.
cast_fn: The casting function.
Returns:
A dictionray with the same structure as d.
"""
for key, value in d.items():
if isinstance(value, dict):
d[key] = cast_leaf_nested_dict(value, cast_fn)
else:
d[key] = cast_fn(value)
return d
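# Example: cast_leaf_nested_dict({'a': {'x': '1'}, 'b': 2.5}, int)
#          returns {'a': {'x': 1}, 'b': 2} (and mutates d in place).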
def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig,
data_dir: str) -> Any:
"""Maybe create a BestCheckpointExporter object, according to the config."""
export_subdir = params.trainer.best_checkpoint_export_subdir
metric_name = params.trainer.best_checkpoint_eval_metric
metric_comp = params.trainer.best_checkpoint_metric_comp
if data_dir and export_subdir and metric_name:
best_ckpt_dir = os.path.join(data_dir, export_subdir)
best_ckpt_exporter = BestCheckpointExporter(
best_ckpt_dir, metric_name, metric_comp)
logging.info(
'Created the best checkpoint exporter. '
'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir,
export_subdir, metric_name)
else:
best_ckpt_exporter = None
return best_ckpt_exporter
# TODO(b/180147589): Add tests for this module.
class BestCheckpointExporter:
"""Keeps track of the best result, and saves its checkpoint.
......@@ -45,17 +114,32 @@ class BestCheckpointExporter:
Args:
export_dir: The directory that will contain exported checkpoints.
metric_name: Indicates which metric to look at, when determining which
result is better.
result is better. If the eval_logs passed to maybe_export_checkpoint
is a nested dictionary, use `|` as a separator for different layers.
metric_comp: Indicates how to compare results. Either `lower` or `higher`.
"""
self._export_dir = export_dir
self._metric_name = metric_name
self._metric_name = metric_name.split('|')
self._metric_comp = metric_comp
if self._metric_comp not in ('lower', 'higher'):
raise ValueError('best checkpoint metric comp must be one of '
'higher, lower. Got: {}'.format(self._metric_comp))
tf.io.gfile.makedirs(os.path.dirname(self.best_ckpt_logs_path))
self._best_ckpt_logs = self._maybe_load_best_eval_metric()
self._checkpoint_manager = None
def _get_checkpoint_manager(self, checkpoint):
"""Gets an existing checkpoint manager or creates a new one."""
if self._checkpoint_manager is None or (
self._checkpoint_manager.checkpoint != checkpoint):
logging.info('Creating a new checkpoint manager.')
self._checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
directory=self._export_dir,
max_to_keep=1,
checkpoint_name='best_ckpt')
return self._checkpoint_manager
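# With nested eval_logs such as {'squad': {'f1': 88.0}} (values illustrative),
# a config of metric_name='squad|f1' is split on '|' in __init__ and resolved
# via get_leaf_nested_dict when comparing checkpoints.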
def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step):
logging.info('[BestCheckpointExporter] received eval_logs: %s, at step: %d',
......@@ -74,12 +158,10 @@ class BestCheckpointExporter:
def _new_metric_is_better(self, old_logs, new_logs):
"""Check if the metric in new_logs is better than the metric in old_logs."""
if self._metric_name not in old_logs or self._metric_name not in new_logs:
raise KeyError('best checkpoint eval metric name {} is not valid. '
'old_logs: {}, new_logs: {}'.format(
self._metric_name, old_logs, new_logs))
old_value = float(orbit.utils.get_value(old_logs[self._metric_name]))
new_value = float(orbit.utils.get_value(new_logs[self._metric_name]))
old_value = float(orbit.utils.get_value(
get_leaf_nested_dict(old_logs, self._metric_name)))
new_value = float(orbit.utils.get_value(
get_leaf_nested_dict(new_logs, self._metric_name)))
logging.info('[BestCheckpointExporter] comparing results. old: %f, new: %f',
old_value, new_value)
......@@ -99,16 +181,13 @@ class BestCheckpointExporter:
"""Export evaluation results of the best checkpoint into a json file."""
eval_logs_ext = copy.copy(eval_logs)
eval_logs_ext['best_ckpt_global_step'] = global_step
for name, value in eval_logs_ext.items():
eval_logs_ext[name] = float(orbit.utils.get_value(value))
eval_logs_ext = cast_leaf_nested_dict(
eval_logs_ext, lambda x: float(orbit.utils.get_value(x)))
# Saving json file is very fast.
with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'w') as writer:
writer.write(json.dumps(eval_logs_ext, indent=4) + '\n')
# Saving the best checkpoint might be interrupted if the job got killed.
for file_to_remove in tf.io.gfile.glob(self.best_ckpt_path + '*'):
tf.io.gfile.remove(file_to_remove)
checkpoint.write(self.best_ckpt_path)
self._get_checkpoint_manager(checkpoint).save()
@property
def best_ckpt_logs(self):
......@@ -120,7 +199,8 @@ class BestCheckpointExporter:
@property
def best_ckpt_path(self):
return os.path.join(self._export_dir, 'best_ckpt')
"""Returns the best ckpt path or None if there is no ckpt yet."""
return tf.train.latest_checkpoint(self._export_dir)
@gin.configurable
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.core.train_utils."""
import tensorflow as tf
from official.core import train_utils
class TrainUtilsTest(tf.test.TestCase):
def test_get_leaf_nested_dict(self):
d = {'a': {'i': {'x': 5}}}
self.assertEqual(train_utils.get_leaf_nested_dict(d, ['a', 'i', 'x']), 5)
def test_get_leaf_nested_dict_not_leaf(self):
with self.assertRaisesRegex(KeyError, 'The value extracted with keys.*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i'])
def test_get_leaf_nested_dict_path_not_exist_missing_key(self):
with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'y'])
def test_get_leaf_nested_dict_path_not_exist_out_of_range(self):
with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z'])
def test_get_leaf_nested_dict_path_not_exist_meets_leaf(self):
with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': 5}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z'])
def test_cast_leaf_nested_dict(self):
d = {'a': {'i': {'x': '123'}}, 'b': 456.5}
d = train_utils.cast_leaf_nested_dict(d, int)
self.assertEqual(d['a']['i']['x'], 123)
self.assertEqual(d['b'], 456)
if __name__ == '__main__':
tf.test.main()
......@@ -37,16 +37,10 @@ class MultiTaskConfig(hyperparams.Config):
@dataclasses.dataclass
class MultiEvalExperimentConfig(hyperparams.Config):
class MultiEvalExperimentConfig(cfg.ExperimentConfig):
"""An experiment config for single-task training and multi-task evaluation.
Attributes:
task: the single-stream training task.
eval_tasks: individual evaluation tasks.
trainer: the trainer configuration.
runtime: the runtime configuration.
"""
task: cfg.TaskConfig = cfg.TaskConfig()
eval_tasks: MultiTaskConfig = MultiTaskConfig()
trainer: cfg.TrainerConfig = cfg.TrainerConfig()
runtime: cfg.RuntimeConfig = cfg.RuntimeConfig()
......@@ -21,6 +21,7 @@ import gin
import orbit
import tensorflow as tf
from official.core import train_utils
from official.modeling.multitask import base_model
from official.modeling.multitask import multitask
......@@ -29,16 +30,20 @@ from official.modeling.multitask import multitask
class MultiTaskEvaluator(orbit.AbstractEvaluator):
"""Implements the common trainer shared for TensorFlow models."""
def __init__(self,
task: multitask.MultiTask,
model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
global_step: Optional[tf.Variable] = None):
def __init__(
self,
task: multitask.MultiTask,
model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
global_step: Optional[tf.Variable] = None,
checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None):
"""Initialize common trainer for TensorFlow models.
Args:
task: A multitask.MultiTask instance.
model: tf.keras.Model instance.
global_step: the global step variable.
checkpoint_exporter: an object that has the `maybe_export_checkpoint`
interface.
"""
# Gets the current distribution strategy. If not inside any strategy scope,
# it gets a single-replica no-op strategy.
......@@ -46,19 +51,10 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
self._task = task
self._model = model
self._global_step = global_step or orbit.utils.create_global_step()
# TODO(hongkuny): Define a more robust way to handle the training/eval
# checkpoint loading.
if hasattr(self.model, "checkpoint_items"):
# Each evaluation task can have different models and load a subset of
# components from the training checkpoint. This is assuming the
# checkpoint items are able to load the weights of the evaluation model.
checkpoint_items = self.model.checkpoint_items
else:
# This is assuming the evaluation model is exactly the training model.
checkpoint_items = dict(model=self.model)
self._checkpoint_exporter = checkpoint_exporter
self._checkpoint = tf.train.Checkpoint(
global_step=self.global_step,
**checkpoint_items)
model=self.model)
self._validation_losses = None
self._validation_metrics = None
......@@ -168,4 +164,8 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
metrics = task.reduce_aggregated_logs(outputs)
logs.update(metrics)
results[name] = logs
if self._checkpoint_exporter:
self._checkpoint_exporter.maybe_export_checkpoint(
self.checkpoint, results, self.global_step.numpy())
return results
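# Because `results` is keyed by task name, a nested metric reference such as
# best_checkpoint_eval_metric='my_task|accuracy' (names illustrative) lets the
# exporter track a single task's metric inside the multi-task logs.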
......@@ -20,6 +20,7 @@ import orbit
import tensorflow as tf
from official.core import base_task
from official.core import base_trainer as core_lib
from official.core import train_utils
from official.modeling.multitask import configs
from official.modeling.multitask import evaluator as evaluator_lib
from official.modeling.multitask import multitask
......@@ -73,7 +74,9 @@ def run_experiment_with_multitask_eval(
evaluator = evaluator_lib.MultiTaskEvaluator(
task=eval_tasks,
model=model,
global_step=trainer.global_step if is_training else None)
global_step=trainer.global_step if is_training else None,
checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
params, model_dir))
else:
evaluator = None
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A script to export the ALBERT core model as a TF-Hub SavedModel."""
# Import libraries
from absl import app
from absl import flags
import tensorflow as tf
from typing import Text
from official.nlp.albert import configs
from official.nlp.bert import bert_models
FLAGS = flags.FLAGS
flags.DEFINE_string("albert_config_file", None,
"Albert configuration file to define core albert layers.")
flags.DEFINE_string("model_checkpoint_path", None,
"File path to TF model checkpoint.")
flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.")
flags.DEFINE_string(
"sp_model_file", None,
"The sentence piece model file that the ALBERT model was trained on.")
def create_albert_model(
albert_config: configs.AlbertConfig) -> tf.keras.Model:
"""Creates an ALBERT keras core model from ALBERT configuration.
Args:
albert_config: An `AlbertConfig` to create the core model.
Returns:
A keras model.
"""
# Adds input layers just as placeholders.
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_mask")
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_type_ids")
transformer_encoder = bert_models.get_transformer_encoder(
albert_config, sequence_length=None)
sequence_output, pooled_output = transformer_encoder(
[input_word_ids, input_mask, input_type_ids])
# To keep consistent with legacy hub modules, the outputs are
# "pooled_output" and "sequence_output".
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[pooled_output, sequence_output]), transformer_encoder
def export_albert_tfhub(albert_config: configs.AlbertConfig,
model_checkpoint_path: Text, hub_destination: Text,
sp_model_file: Text):
"""Restores a tf.keras.Model and saves for TF-Hub."""
core_model, encoder = create_albert_model(albert_config)
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.restore(model_checkpoint_path).assert_consumed()
core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
core_model.save(hub_destination, include_optimizer=False, save_format="tf")
def main(_):
albert_config = configs.AlbertConfig.from_json_file(
FLAGS.albert_config_file)
export_albert_tfhub(albert_config, FLAGS.model_checkpoint_path,
FLAGS.export_path, FLAGS.sp_model_file)
if __name__ == "__main__":
app.run(main)
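# Illustrative invocation (all paths hypothetical):
#   python export_albert_tfhub.py \
#     --albert_config_file=/tmp/albert_config.json \
#     --model_checkpoint_path=/tmp/ckpt/model.ckpt-100 \
#     --sp_model_file=/tmp/30k-clean.model \
#     --export_path=/tmp/albert_hub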
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests official.nlp.albert.export_albert_tfhub."""
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from official.nlp.albert import configs
from official.nlp.albert import export_albert_tfhub
class ExportAlbertTfhubTest(tf.test.TestCase):
def test_export_albert_tfhub(self):
    # Exports a SavedModel for TF-Hub.
albert_config = configs.AlbertConfig(
vocab_size=100,
embedding_size=8,
hidden_size=16,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
bert_model, encoder = export_albert_tfhub.create_albert_model(albert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
sp_model_file = os.path.join(self.get_temp_dir(), "sp_tokenizer.model")
with tf.io.gfile.GFile(sp_model_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_albert_tfhub.export_albert_tfhub(
albert_config,
model_checkpoint_path,
hub_destination,
sp_model_file=sp_model_file)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(hub_destination, trainable=True)
if hasattr(hub_layer, "resolved_object"):
with tf.io.gfile.GFile(
hub_layer.resolved_object.sp_model_file.asset_path.numpy()) as f:
self.assertEqual("dummy content", f.read())
# Checks the hub KerasLayer.
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
dummy_ids = np.zeros((2, 10), dtype=np.int32)
hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids])
source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids])
    # The outputs of the hub module are "pooled_output" and "sequence_output",
    # while the outputs of the encoder are in the reversed order, i.e.,
    # "sequence_output" and "pooled_output".
encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids]))
self.assertEqual(hub_outputs[0].shape, (2, 16))
self.assertEqual(hub_outputs[1].shape, (2, 10, 16))
for source_output, hub_output, encoder_output in zip(
source_outputs, hub_outputs, encoder_outputs):
self.assertAllClose(source_output.numpy(), hub_output.numpy())
self.assertAllClose(source_output.numpy(), encoder_output.numpy())
if __name__ == "__main__":
tf.test.main()
......@@ -65,6 +65,7 @@ ALBERT_NAME_REPLACEMENTS = (
("ffn_1/intermediate/output/dense", "output"),
("transformer/LayerNorm_1/", "transformer/output_layer_norm/"),
("pooler/dense", "pooler_transform"),
("cls/predictions", "bert/cls/predictions"),
("cls/predictions/output_bias", "cls/predictions/output_bias/bias"),
("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"),
("cls/seq_relationship/output_weights",
......@@ -113,6 +114,8 @@ def _create_pretrainer_model(cfg):
mlm_activation=tf_utils.get_activation(cfg.hidden_act),
mlm_initializer=tf.keras.initializers.TruncatedNormal(
stddev=cfg.initializer_range))
  # Makes sure the masked_lm layer's variables in the pretrainer are created.
_ = pretrainer(pretrainer.inputs)
return pretrainer
......
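# --- Hedged sketch: why `_ = pretrainer(pretrainer.inputs)` matters ---
# Keras layers only create their variables on first call; restoring a
# checkpoint into variables that do not exist yet would leave them unmatched.
# Calling the model once on its own symbolic inputs forces creation. A
# minimal, self-contained analogue:
import tensorflow as tf

dense = tf.keras.layers.Dense(4)
assert not dense.weights             # No variables before the first call.
_ = dense(tf.zeros([1, 8]))          # First call triggers build().
assert len(dense.weights) == 2       # Kernel and bias now exist.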
......@@ -12,14 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A script to export the BERT core model as a TF-Hub SavedModel."""
"""A script to export BERT as a TF-Hub SavedModel.
This script is **DEPRECATED** for exporting BERT encoder models;
see the error message printed by main() for details.
"""
from typing import Text
# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from typing import Text
from official.nlp.bert import bert_models
from official.nlp.bert import configs
......@@ -112,6 +117,14 @@ def export_bert_squad_tfhub(bert_config: configs.BertConfig,
def main(_):
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
if FLAGS.model_type == "encoder":
deprecation_note = (
"nlp/bert/export_tfhub is **DEPRECATED** for exporting BERT encoder "
"models. Please switch to nlp/tools/export_tfhub for exporting BERT "
"(and other) encoders with dict inputs/outputs conforming to "
"https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders"
)
logging.error(deprecation_note)
print("\n\nNOTICE:", deprecation_note, "\n")
export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path,
FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case)
elif FLAGS.model_type == "squad":
......
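# --- Hedged sketch: the dict-based encoder interface the deprecation note
# points to. Per the TF-Hub common SavedModel API for text, encoders exported
# by nlp/tools/export_tfhub take a dict of int32 Tensors and return a dict of
# outputs; the handle below is a placeholder path.
import tensorflow as tf
import tensorflow_hub as hub

encoder = hub.KerasLayer("/tmp/exported_encoder", trainable=True)
dummy_ids = tf.zeros([2, 10], dtype=tf.int32)
outputs = encoder(dict(
    input_word_ids=dummy_ids, input_mask=dummy_ids, input_type_ids=dummy_ids))
pooled = outputs["pooled_output"]      # Shape [batch, hidden].
sequence = outputs["sequence_output"]  # Shape [batch, seq_len, hidden].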
......@@ -116,7 +116,13 @@ def create_v2_checkpoint(model,
"""Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint."""
# Uses streaming-restore in eager model to read V1 name-based checkpoints.
model.load_weights(src_checkpoint).assert_existing_objects_matched()
checkpoint = tf.train.Checkpoint(**{checkpoint_model_name: model})
if hasattr(model, "checkpoint_items"):
checkpoint_items = model.checkpoint_items
else:
checkpoint_items = {}
checkpoint_items[checkpoint_model_name] = model
checkpoint = tf.train.Checkpoint(**checkpoint_items)
checkpoint.save(output_path)
......
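# --- Hedged sketch: what honoring `checkpoint_items` enables ---
# Models such as the BERT pretrainer are assumed to expose extra trackables
# (e.g. the masked-LM head) through a `checkpoint_items` property; merging
# them into the Checkpoint keeps those weights in the converted V2 checkpoint
# instead of dropping them. A toy analogue:
import tensorflow as tf

class ModelWithItems(tf.keras.Model):
  """Toy model exposing an extra trackable via `checkpoint_items`."""

  def __init__(self):
    super().__init__()
    self.body = tf.keras.layers.Dense(2)
    self.extra_head = tf.keras.layers.Dense(2)

  @property
  def checkpoint_items(self):
    return {"extra_head": self.extra_head}

model = ModelWithItems()
items = dict(getattr(model, "checkpoint_items", {}))
items["model"] = model
checkpoint = tf.train.Checkpoint(**items)  # Tracks both model and extra head.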
......@@ -16,3 +16,4 @@
"""Experiments definition."""
# pylint: disable=unused-import
from official.nlp.configs import finetuning_experiments
from official.nlp.configs import pretraining_experiments
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Pretraining experiment configurations."""
# pylint: disable=g-doc-return-or-yield,line-too-long
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.data import pretrain_dataloader
from official.nlp.tasks import masked_lm
@exp_factory.register_config_factory('bert/pretraining')
def bert_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining experiment."""
config = cfg.ExperimentConfig(
task=masked_lm.MaskedLMConfig(
train_data=pretrain_dataloader.BertPretrainDataConfig(),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
is_training=False)),
trainer=cfg.TrainerConfig(
train_steps=1000000,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay': [
'LayerNorm', 'layer_norm', 'bias'
],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 1e-4,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
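# --- Hedged usage sketch (illustration only) ---
# Registered factories are looked up by name; `exp_factory.get_exp_config` is
# assumed to return a fresh ExperimentConfig built by the factory above, which
# can then be overridden before training.
experiment = exp_factory.get_exp_config('bert/pretraining')
experiment.override({'trainer': {'train_steps': 10000}})  # Tweak defaults.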