"tests/vscode:/vscode.git/clone" did not exist on "4f6399bedd28a9af0e18b1ebf84c38eb07ffa769"
Commit 2e9bb539 authored by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE

parents 7bae5317 8fba84f8
@@ -19,7 +19,7 @@ In the near future, we will add:
 * State-of-the-art language understanding models.
 * State-of-the-art image classification models.
-* State-of-the-art objection detection and instance segmentation models.
+* State-of-the-art object detection and instance segmentation models.
 ## Table of Contents
...
@@ -3,7 +3,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "vXLA5InzXydn"
 },
 "source": [
@@ -15,8 +14,6 @@
 "execution_count": null,
 "metadata": {
 "cellView": "form",
-"colab": {},
-"colab_type": "code",
 "id": "RuRlpLL-X0R_"
 },
 "outputs": [],
@@ -37,7 +34,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "1mLJmVotXs64"
 },
 "source": [
@@ -47,7 +43,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "hYEwGTeCXnnX"
 },
 "source": [
@@ -73,7 +68,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "YN2ACivEPxgD"
 },
 "source": [
@@ -85,7 +79,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "s2d9S2CSSO1z"
 },
 "source": [
@@ -95,7 +88,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "fsACVQpVSifi"
 },
 "source": [
@@ -110,19 +102,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "NvNr2svBM-p3"
 },
 "outputs": [],
 "source": [
-"!pip install -q tf-models-official==2.3.0"
+"!pip install -q tf-models-official==2.4.0"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "U-7qPCjWUAyy"
 },
 "source": [
@@ -133,8 +122,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "lXsXev5MNr20"
 },
 "outputs": [],
@@ -163,13 +150,12 @@
 "import official.nlp.data.classifier_data_lib\n",
 "import official.nlp.modeling.losses\n",
 "import official.nlp.modeling.models\n",
-"import official.nlp.modeling.networks"
+"import official.nlp.modeling.networks\n"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "mbanlzTvJBsz"
 },
 "source": [
@@ -179,7 +165,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "PpW0x8TpR8DT"
 },
 "source": [
@@ -190,20 +175,17 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "vzRHOLciR8eq"
 },
 "outputs": [],
 "source": [
-"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12\"\n",
+"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12\"\n",
 "tf.io.gfile.listdir(gs_folder_bert)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "9uFskufsR2LT"
 },
 "source": [
@@ -214,19 +196,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "e0dAkUttJAzj"
 },
 "outputs": [],
 "source": [
-"hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\""
+"hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\""
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Qv6abtRvH4xO"
 },
 "source": [
@@ -239,7 +218,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "28DvUhC1YUiB"
 },
 "source": [
@@ -257,8 +235,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "Ijikx5OsH9AT"
 },
 "outputs": [],
@@ -272,8 +248,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "xf9zz4vLYXjr"
 },
 "outputs": [],
@@ -284,7 +258,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "ZgBg2r2nYT-K"
 },
 "source": [
@@ -295,8 +268,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "IQrHxv7W7jH5"
 },
 "outputs": [],
@@ -307,7 +278,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "vhsVWYNxazz5"
 },
 "source": [
@@ -318,8 +288,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "n0gfc_VTayfQ"
 },
 "outputs": [],
@@ -330,7 +298,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "38zJcap6xkbC"
 },
 "source": [
@@ -341,8 +308,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "xON_i6SkwApW"
 },
 "outputs": [],
@@ -356,7 +321,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "9fbTyfJpNr7x"
 },
 "source": [
@@ -366,7 +330,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "wqeN54S61ZKQ"
 },
 "source": [
@@ -381,8 +344,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "idxyhmrCQcw5"
 },
 "outputs": [],
@@ -398,7 +359,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "zYHDSquU2lDU"
 },
 "source": [
@@ -409,8 +369,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "L_OfOYPg853R"
 },
 "outputs": [],
@@ -424,7 +382,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "kkAXLtuyWWDI"
 },
 "source": [
@@ -438,7 +395,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "62UTWLQd9-LB"
 },
 "source": [
@@ -451,8 +407,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "bdL-dRNRBRJT"
 },
 "outputs": [],
@@ -463,7 +417,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "UrPktnqpwqie"
 },
 "source": [
@@ -474,8 +427,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "BR7BmtU498Bh"
 },
 "outputs": [],
@@ -495,8 +446,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "has42aUdfky-"
 },
 "outputs": [],
@@ -508,7 +457,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "MU9lTWy_xXbb"
 },
 "source": [
@@ -519,8 +467,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "USD8uihw-g4J"
 },
 "outputs": [],
@@ -533,7 +479,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "xmNv4l4k-dBZ"
 },
 "source": [
@@ -543,7 +488,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "DIWjNIKq-ldh"
 },
 "source": [
@@ -556,7 +500,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "ulNZ4U96-8JZ"
 },
 "source": [
@@ -567,8 +510,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "EezOO9qj91kP"
 },
 "outputs": [],
@@ -581,7 +522,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "rxLenwAvCkBf"
 },
 "source": [
@@ -592,8 +532,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "2CetH_5C9P2m"
 },
 "outputs": [],
@@ -609,7 +547,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "P5UBnCn8Ii6s"
 },
 "source": [
@@ -622,8 +559,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "sDGiWYPLEd5a"
 },
 "outputs": [],
@@ -666,8 +601,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "yuLKxf6zHxw-"
 },
 "outputs": [],
@@ -685,7 +618,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "7FC5aLVxKVKK"
 },
 "source": [
@@ -696,8 +628,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "jyjTdGpFhO_1"
 },
 "outputs": [],
@@ -711,7 +641,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "FSwymsbkbLDA"
 },
 "source": [
@@ -721,7 +650,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Efrj3Cn1kLAp"
 },
 "source": [
@@ -731,7 +659,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "xxpOY5r2Ayq6"
 },
 "source": [
@@ -742,8 +669,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "ujapVfZ_AKW7"
 },
 "outputs": [],
@@ -761,7 +686,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "96ldxDSwkVkj"
 },
 "source": [
@@ -774,8 +698,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "cH682__U0FBv"
 },
 "outputs": [],
@@ -787,7 +709,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "XqKp3-5GIZlw"
 },
 "source": [
@@ -798,8 +719,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "bAQblMIjwkvx"
 },
 "outputs": [],
@@ -810,7 +729,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "sFmVG4SKZAw8"
 },
 "source": [
@@ -821,8 +739,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "VTjgPbp4ZDKo"
 },
 "outputs": [],
@@ -837,7 +753,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Q0NTdwZsQK8n"
 },
 "source": [
@@ -850,8 +765,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "8L__-erBwLIQ"
 },
 "outputs": [],
@@ -862,7 +775,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "mKAvkQc3heSy"
 },
 "source": [
@@ -875,21 +787,18 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "97Ll2Gichd_Y"
 },
 "outputs": [],
 "source": [
-"checkpoint = tf.train.Checkpoint(model=bert_encoder)\n",
-"checkpoint.restore(\n",
+"checkpoint = tf.train.Checkpoint(encoder=bert_encoder)\n",
+"checkpoint.read(\n",
 " os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "2oHOql35k3Dd"
 },
 "source": [
@@ -899,7 +808,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "115caFLMk-_l"
 },
 "source": [
@@ -913,8 +821,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "w8qXKRZuCwW4"
 },
 "outputs": [],
@@ -937,7 +843,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "pXRGxiRNEHS2"
 },
 "source": [
@@ -948,8 +853,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "eQNA16bhDpky"
 },
 "outputs": [],
@@ -960,7 +863,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "xqu_K71fJQB8"
 },
 "source": [
@@ -970,7 +872,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "78FEUOOEkoP0"
 },
 "source": [
@@ -980,7 +881,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "OTNcA0O0nSq9"
 },
 "source": [
@@ -991,8 +891,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "nzi8hjeTQTRs"
 },
 "outputs": [],
@@ -1015,7 +913,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "IFtKFWbNKb0u"
 },
 "source": [
@@ -1028,8 +925,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "9ZoUgDUNJPz3"
 },
 "outputs": [],
@@ -1049,7 +944,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "7ynJibkBRTJF"
 },
 "source": [
@@ -1060,8 +954,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "umo0ttrgRYIM"
 },
 "outputs": [],
@@ -1076,8 +968,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "utGl0M3aZCE4"
 },
 "outputs": [],
@@ -1088,7 +978,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "fVo_AnT0l26j"
 },
 "source": [
@@ -1101,8 +990,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "Nl5x6nElZqkP"
 },
 "outputs": [],
@@ -1115,8 +1002,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {}, "collapsed": true,
"colab_type": "code",
"id": "y_ACvKPsVUXC" "id": "y_ACvKPsVUXC"
}, },
"outputs": [], "outputs": [],
...@@ -1137,7 +1023,6 @@ ...@@ -1137,7 +1023,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "eQceYqRFT_Eg" "id": "eQceYqRFT_Eg"
}, },
"source": [ "source": [
...@@ -1147,7 +1032,6 @@ ...@@ -1147,7 +1032,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "SaC1RlFawUpc" "id": "SaC1RlFawUpc"
}, },
"source": [ "source": [
...@@ -1158,7 +1042,6 @@ ...@@ -1158,7 +1042,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "CwUdjFBkzUgh" "id": "CwUdjFBkzUgh"
}, },
"source": [ "source": [
...@@ -1170,7 +1053,6 @@ ...@@ -1170,7 +1053,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "2UTQrkyOT5wD" "id": "2UTQrkyOT5wD"
}, },
"source": [ "source": [
...@@ -1181,8 +1063,6 @@ ...@@ -1181,8 +1063,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "XQeDFOzYR9Z9" "id": "XQeDFOzYR9Z9"
}, },
"outputs": [], "outputs": [],
...@@ -1195,7 +1075,6 @@ ...@@ -1195,7 +1075,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "XrFQbfErUWxa" "id": "XrFQbfErUWxa"
}, },
"source": [ "source": [
...@@ -1206,8 +1085,6 @@ ...@@ -1206,8 +1085,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "ymw7GOHpSHKU" "id": "ymw7GOHpSHKU"
}, },
"outputs": [], "outputs": [],
...@@ -1234,7 +1111,6 @@ ...@@ -1234,7 +1111,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "uX_Sp-wTUoRm" "id": "uX_Sp-wTUoRm"
}, },
"source": [ "source": [
...@@ -1245,8 +1121,6 @@ ...@@ -1245,8 +1121,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "rkHxIK57SQ_r" "id": "rkHxIK57SQ_r"
}, },
"outputs": [], "outputs": [],
...@@ -1267,7 +1141,6 @@ ...@@ -1267,7 +1141,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "stbaVouogvzS" "id": "stbaVouogvzS"
}, },
"source": [ "source": [
...@@ -1278,8 +1151,6 @@ ...@@ -1278,8 +1151,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "gwhrlQl4gxVF" "id": "gwhrlQl4gxVF"
}, },
"outputs": [], "outputs": [],
...@@ -1290,7 +1161,6 @@ ...@@ -1290,7 +1161,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "dbJ76vSJj77j" "id": "dbJ76vSJj77j"
}, },
"source": [ "source": [
...@@ -1300,7 +1170,6 @@ ...@@ -1300,7 +1170,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "9J95LFRohiYw" "id": "9J95LFRohiYw"
}, },
"source": [ "source": [
...@@ -1311,8 +1180,6 @@ ...@@ -1311,8 +1180,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "gCvaLLAxPuMc" "id": "gCvaLLAxPuMc"
}, },
"outputs": [], "outputs": [],
@@ -1356,8 +1223,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "rutkBadrhzdR"
 },
 "outputs": [],
@@ -1384,8 +1249,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "59TVgt4Z7fuU"
 },
 "outputs": [],
@@ -1396,7 +1259,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "QbklKt-w_CiI"
 },
 "source": [
@@ -1411,8 +1273,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "GDWrHm0BGpbX"
 },
 "outputs": [],
@@ -1426,8 +1286,6 @@
 "execution_count": null,
 "metadata": {
 "cellView": "form",
-"colab": {},
-"colab_type": "code",
 "id": "Y29meH0qGq_5"
 },
 "outputs": [],
@@ -1439,13 +1297,11 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "lo6479At4sP1"
 },
 "outputs": [],
 "source": [
-"hub_encoder = hub.KerasLayer(f\"https://tfhub.dev/tensorflow/{hub_model_name}/2\",\n",
+"hub_encoder = hub.KerasLayer(f\"https://tfhub.dev/tensorflow/{hub_model_name}/3\",\n",
 " trainable=True)\n",
 "\n",
 "print(f\"The Hub encoder has {len(hub_encoder.trainable_variables)} trainable variables\")"
@@ -1454,7 +1310,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "iTzF574wivQv"
 },
 "source": [
@@ -1465,27 +1320,25 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "XEcYrCR45Uwo"
 },
 "outputs": [],
 "source": [
 "result = hub_encoder(\n",
-" inputs=[glue_train['input_word_ids'][:10],\n",
-" glue_train['input_mask'][:10],\n",
-" glue_train['input_type_ids'][:10],],\n",
+" inputs=dict(\n",
+" input_word_ids=glue_train['input_word_ids'][:10],\n",
+" input_mask=glue_train['input_mask'][:10],\n",
+" input_type_ids=glue_train['input_type_ids'][:10],),\n",
 " training=False,\n",
 ")\n",
 "\n",
-"print(\"Pooled output shape:\", result[0].shape)\n",
-"print(\"Sequence output shape:\", result[1].shape)"
+"print(\"Pooled output shape:\", result['pooled_output'].shape)\n",
+"print(\"Sequence output shape:\", result['sequence_output'].shape)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "cjojn8SmLSRI"
 },
 "source": [
@@ -1498,33 +1351,31 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "9nTDaApyLR70"
 },
 "outputs": [],
 "source": [
-"hub_classifier, hub_encoder = bert.bert_models.classifier_model(\n",
-" # Caution: Most of `bert_config` is ignored if you pass a hub url.\n",
-" bert_config=bert_config, hub_module_url=hub_url_bert, num_labels=2)"
+"hub_classifier = nlp.modeling.models.BertClassifier(\n",
+" bert_encoder,\n",
+" num_classes=2,\n",
+" dropout_rate=0.1,\n",
+" initializer=tf.keras.initializers.TruncatedNormal(\n",
+" stddev=0.02))"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "xMJX3wV0_v7I"
 },
 "source": [
-"The one downside to loading this model from TFHub is that the structure of internal keras layers is not restored. So it's more difficult to inspect or modify the model. The `TransformerEncoder` model is now a single layer:"
+"The one downside to loading this model from TFHub is that the structure of internal keras layers is not restored. So it's more difficult to inspect or modify the model. The `BertEncoder` model is now a single layer:"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "pD71dnvhM2QS"
 },
 "outputs": [],
@@ -1536,8 +1387,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "nLZD-isBzNKi"
 },
 "outputs": [],
@@ -1552,7 +1401,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "ZxSqH0dNAgXV"
 },
 "source": [
@@ -1560,13 +1408,12 @@
 "\n",
 "### Low level model building\n",
 "\n",
-"If you need a more control over the construction of the model it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.TransformerEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture it may not be correct or possible to reload the pre-trained checkpoint so you'll need to retrain from scratch."
+"If you need a more control over the construction of the model it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.BertEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture it may not be correct or possible to reload the pre-trained checkpoint so you'll need to retrain from scratch."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "0cgABEwDj06P"
 },
 "source": [
@@ -1577,43 +1424,38 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "5r_yqhBFSVEM"
 },
 "outputs": [],
 "source": [
-"transformer_config = config_dict.copy()\n",
+"bert_encoder_config = config_dict.copy()\n",
 "\n",
 "# You need to rename a few fields to make this work:\n",
-"transformer_config['attention_dropout_rate'] = transformer_config.pop('attention_probs_dropout_prob')\n",
-"transformer_config['activation'] = tf_utils.get_activation(transformer_config.pop('hidden_act'))\n",
-"transformer_config['dropout_rate'] = transformer_config.pop('hidden_dropout_prob')\n",
-"transformer_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
-" stddev=transformer_config.pop('initializer_range'))\n",
-"transformer_config['max_sequence_length'] = transformer_config.pop('max_position_embeddings')\n",
-"transformer_config['num_layers'] = transformer_config.pop('num_hidden_layers')\n",
+"bert_encoder_config['attention_dropout_rate'] = bert_encoder_config.pop('attention_probs_dropout_prob')\n",
+"bert_encoder_config['activation'] = tf_utils.get_activation(bert_encoder_config.pop('hidden_act'))\n",
+"bert_encoder_config['dropout_rate'] = bert_encoder_config.pop('hidden_dropout_prob')\n",
+"bert_encoder_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
+" stddev=bert_encoder_config.pop('initializer_range'))\n",
+"bert_encoder_config['max_sequence_length'] = bert_encoder_config.pop('max_position_embeddings')\n",
+"bert_encoder_config['num_layers'] = bert_encoder_config.pop('num_hidden_layers')\n",
 "\n",
-"transformer_config"
+"bert_encoder_config"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "rIO8MI7LLijh"
 },
 "outputs": [],
 "source": [
-"manual_encoder = nlp.modeling.networks.TransformerEncoder(**transformer_config)"
+"manual_encoder = nlp.modeling.networks.BertEncoder(**bert_encoder_config)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "4a4tFSg9krRi"
 },
 "source": [
@@ -1624,21 +1466,18 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "X6N9NEqfXJCx"
 },
 "outputs": [],
 "source": [
-"checkpoint = tf.train.Checkpoint(model=manual_encoder)\n",
-"checkpoint.restore(\n",
+"checkpoint = tf.train.Checkpoint(encoder=manual_encoder)\n",
+"checkpoint.read(\n",
 " os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "1BPiPO4ykuwM"
 },
 "source": [
@@ -1649,8 +1488,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "hlVdgJKmj389"
 },
 "outputs": [],
@@ -1664,7 +1501,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "nJMXvVgJkyBv"
 },
 "source": [
@@ -1675,8 +1511,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "tQX57GJ6wkAb"
 },
 "outputs": [],
@@ -1684,17 +1518,14 @@
 "manual_classifier = nlp.modeling.models.BertClassifier(\n",
 " bert_encoder,\n",
 " num_classes=2,\n",
-" dropout_rate=transformer_config['dropout_rate'],\n",
-" initializer=tf.keras.initializers.TruncatedNormal(\n",
-" stddev=bert_config.initializer_range))"
+" dropout_rate=bert_encoder_config['dropout_rate'],\n",
+" initializer=bert_encoder_config['initializer'])"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "kB-nBWhQk0dS"
 },
 "outputs": [],
@@ -1705,7 +1536,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "E6AJlOSyIO1L"
 },
 "source": [
@@ -1720,8 +1550,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "28Dv3BPRlFTD"
 },
 "outputs": [],
@@ -1733,7 +1561,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "LRjcHr0UlT8c"
 },
 "source": [
@@ -1746,8 +1573,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "MHY8K6kDngQn"
 },
 "outputs": [],
@@ -1765,8 +1590,7 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
+"collapsed": true,
 "id": "wKIcSprulu3P"
 },
 "outputs": [],
@@ -1782,7 +1606,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "IMTC_gfAl_PZ"
 },
 "source": [
@@ -1793,8 +1616,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "YRt3VTmBmCBY"
 },
 "outputs": [],
@@ -1816,7 +1637,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "l8D9Lv3Bn740"
 },
 "source": [
@@ -1827,8 +1647,6 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "2Hf2rpRXk89N"
 },
 "outputs": [],
...
 {
+"nbformat": 4,
+"nbformat_minor": 0,
+"metadata": {
+"colab": {
+"name": "Customizing a Transformer Encoder",
+"private_outputs": true,
+"provenance": [],
+"collapsed_sections": [],
+"toc_visible": true
+},
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+}
+},
 "cells": [
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Bp8t2AI8i7uP"
 },
 "source": [
@@ -12,14 +26,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
 "cellView": "form",
-"colab": {},
-"colab_type": "code",
 "id": "rxPj2Lsni9O4"
 },
-"outputs": [],
 "source": [
 "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
 "# you may not use this file except in compliance with the License.\n",
@@ -32,12 +42,13 @@
 "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
 "# See the License for the specific language governing permissions and\n",
 "# limitations under the License."
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "6xS-9i5DrRvO"
 },
 "source": [
@@ -47,30 +58,28 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Mwb9uw1cDXsa"
 },
 "source": [
-"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
-" \u003ctd\u003e\n",
-" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
-" \u003c/td\u003e\n",
-" \u003ctd\u003e\n",
-" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
-" \u003c/td\u003e\n",
-" \u003ctd\u003e\n",
-" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
-" \u003c/td\u003e\n",
-" \u003ctd\u003e\n",
-" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
-" \u003c/td\u003e\n",
-"\u003c/table\u003e"
+"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+" <td>\n",
+" <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/customize_encoder\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+" </td>\n",
+" <td>\n",
+" <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+" </td>\n",
+" <td>\n",
+" <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+" </td>\n",
+" <td>\n",
+" <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/customize_encoder.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+" </td>\n",
+"</table>"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "iLrcV4IyrcGX"
 },
 "source": [
@@ -84,7 +93,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "YYxdyoWgsl8t"
 },
 "source": [
@@ -94,7 +102,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "fEJSFutUsn_h"
 },
 "source": [
@@ -107,21 +114,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "thsKZDjhswhR"
 },
-"outputs": [],
 "source": [
-"!pip install -q tf-models-official==2.3.0"
+"!pip install -q tf-models-official==2.4.0"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "hpf7JPCVsqtv"
 },
 "source": [
@@ -130,13 +134,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "my4dp-RMssQe"
 },
-"outputs": [],
 "source": [
 "import numpy as np\n",
 "import tensorflow as tf\n",
@@ -144,12 +144,13 @@
 "from official.modeling import activations\n",
 "from official.nlp import modeling\n",
 "from official.nlp.modeling import layers, losses, models, networks"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "vjDmVsFfs85n"
 },
 "source": [
@@ -160,13 +161,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "Oav8sbgstWc-"
 },
-"outputs": [],
 "source": [
 "cfg = {\n",
 " \"vocab_size\": 100,\n",
@@ -177,22 +174,23 @@
 " \"activation\": activations.gelu,\n",
 " \"dropout_rate\": 0.1,\n",
 " \"attention_dropout_rate\": 0.1,\n",
-" \"sequence_length\": 16,\n",
+" \"max_sequence_length\": 16,\n",
 " \"type_vocab_size\": 2,\n",
 " \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n",
 "}\n",
-"bert_encoder = modeling.networks.TransformerEncoder(**cfg)\n",
+"bert_encoder = modeling.networks.BertEncoder(**cfg)\n",
 "\n",
 "def build_classifier(bert_encoder):\n",
 " return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n",
 "\n",
 "canonical_classifier_model = build_classifier(bert_encoder)"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Qe2UWI6_tsHo"
 },
 "source": [
@@ -203,31 +201,28 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "csED2d-Yt5h6"
 },
-"outputs": [],
 "source": [
 "def predict(model):\n",
 " batch_size = 3\n",
 " np.random.seed(0)\n",
 " word_ids = np.random.randint(\n",
-" cfg[\"vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
-" mask = np.random.randint(2, size=(batch_size, cfg[\"sequence_length\"]))\n",
+" cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
+" mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n",
 " type_ids = np.random.randint(\n",
-" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"sequence_length\"]))\n",
+" cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n",
 " print(model([word_ids, mask, type_ids], training=False))\n",
 "\n",
 "predict(canonical_classifier_model)"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "PzKStEK9t_Pb"
 },
 "source": [
@@ -239,7 +234,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "rmwQfhj6fmKz"
 },
 "source": [
@@ -250,7 +244,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "xsMgEVHAui11"
 },
 "source": [
@@ -263,26 +256,21 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "-JBabpa2AOz8"
 },
 "source": [
 "#### Without Customization\n",
 "\n",
-"Without any customization, `EncoderScaffold` behaves the same the canonical `TransformerEncoder`.\n",
+"Without any customization, `EncoderScaffold` behaves the same the canonical `BertEncoder`.\n",
 "\n",
-"As shown in the following example, `EncoderScaffold` can load `TransformerEncoder`'s weights and output the same values:"
+"As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "ktNzKuVByZQf"
 },
-"outputs": [],
 "source": [
 "default_hidden_cfg = dict(\n",
 " num_attention_heads=cfg[\"num_attention_heads\"],\n",
@@ -296,10 +284,9 @@
 " vocab_size=cfg[\"vocab_size\"],\n",
 " type_vocab_size=cfg[\"type_vocab_size\"],\n",
 " hidden_size=cfg[\"hidden_size\"],\n",
-" seq_length=cfg[\"sequence_length\"],\n",
 " initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
 " dropout_rate=cfg[\"dropout_rate\"],\n",
-" max_seq_length=cfg[\"sequence_length\"],\n",
+" max_seq_length=cfg[\"max_sequence_length\"]\n",
 ")\n",
 "default_kwargs = dict(\n",
 " hidden_cfg=default_hidden_cfg,\n",
@@ -309,17 +296,19 @@
 " return_all_layer_outputs=True,\n",
 " pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n",
 ")\n",
+"\n",
 "encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n",
 "classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n",
 "classifier_model_from_encoder_scaffold.set_weights(\n",
 " canonical_classifier_model.get_weights())\n",
 "predict(classifier_model_from_encoder_scaffold)"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "sMaUmLyIuwcs"
 },
 "source": [
@@ -332,18 +321,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "LTinnaG6vcsw"
 },
-"outputs": [],
 "source": [
 "word_ids = tf.keras.layers.Input(\n",
-" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
+" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n",
 "mask = tf.keras.layers.Input(\n",
-" shape=(cfg['sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
+" shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n",
 "embedding_layer = modeling.layers.OnDeviceEmbedding(\n",
 " vocab_size=cfg['vocab_size'],\n",
 " embedding_width=cfg['hidden_size'],\n",
@@ -353,12 +338,13 @@
 "attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n",
 "new_embedding_network = tf.keras.Model([word_ids, mask],\n",
 " [word_embeddings, attention_mask])"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "HN7_yu-6O3qI"
 },
 "source": [
@@ -368,21 +354,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "fO9zKFE4OpHp"
 },
-"outputs": [],
 "source": [
 "tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "9cOaGQHLv12W"
 },
 "source": [
@@ -391,13 +374,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "mtFDMNf2vIl9"
 },
-"outputs": [],
 "source": [
 "kwargs = dict(default_kwargs)\n",
 "\n",
@@ -412,12 +391,13 @@
 "\n",
 "# Assert that there are only two inputs.\n",
 "assert len(classifier_model.inputs) == 2"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "Z73ZQDtmwg9K"
 },
 "source": [
@@ -432,13 +412,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "uAIarLZgw6pA"
 },
-"outputs": [],
 "source": [
 "kwargs = dict(default_kwargs)\n",
 "\n",
@@ -452,12 +428,13 @@
 "\n",
 "# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n",
 "assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "6PMHFdvnxvR0"
 },
 "source": [
@@ -470,7 +447,6 @@
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "D6FejlgwyAy_"
 },
 "source": [
@@ -485,13 +461,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "nFrSMrZuyNeQ"
 },
-"outputs": [],
 "source": [
 "# Use TalkingHeadsAttention\n",
 "hidden_cfg = dict(default_hidden_cfg)\n",
@@ -508,12 +480,13 @@
 "\n",
 "# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n",
 "assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "kuEJcTyByVvI"
 },
 "source": [
@@ -528,13 +501,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "XAbKy_l4y_-i"
 },
-"outputs": [],
 "source": [
 "# Use TalkingHeadsAttention\n",
 "hidden_cfg = dict(default_hidden_cfg)\n",
@@ -551,12 +520,13 @@
 "\n",
 "# Assert that the variable `gate` from GatedFeedforward exists.\n",
 "assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "a_8NWUhkzeAq"
 },
 "source": [
@@ -564,29 +534,26 @@
 "\n",
 "Finally, you could also build a new encoder using building blocks in the modeling library.\n",
 "\n",
-"See [AlbertTransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_transformer_encoder.py) as an example:\n"
+"See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
 "metadata": {
-"colab": {},
-"colab_type": "code",
 "id": "xsiA3RzUzmUM"
 },
-"outputs": [],
 "source": [
-"albert_encoder = modeling.networks.AlbertTransformerEncoder(**cfg)\n",
+"albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n",
 "classifier_model = build_classifier(albert_encoder)\n",
 "# ... Train the model ...\n",
 "predict(classifier_model)"
-]
+],
+"execution_count": null,
+"outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
-"colab_type": "text",
 "id": "MeidDfhlHKSO"
 },
 "source": [
...@@ -595,31 +562,14 @@ ...@@ -595,31 +562,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "Uv_juT22HERW" "id": "Uv_juT22HERW"
}, },
"outputs": [],
"source": [ "source": [
"tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)" "tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)"
] ],
} "execution_count": null,
], "outputs": []
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Customizing a Transformer Encoder",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
} }
}, ]
"nbformat": 4, }
"nbformat_minor": 0 \ No newline at end of file
}
{ {
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Introduction to the TensorFlow Models NLP library",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "80xnUmoI7fBX" "id": "80xnUmoI7fBX"
}, },
"source": [ "source": [
...@@ -12,14 +26,10 @@ ...@@ -12,14 +26,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"cellView": "form", "cellView": "form",
"colab": {},
"colab_type": "code",
"id": "8nvTnfs6Q692" "id": "8nvTnfs6Q692"
}, },
"outputs": [],
"source": [ "source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n", "# you may not use this file except in compliance with the License.\n",
...@@ -32,12 +42,13 @@ ...@@ -32,12 +42,13 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n", "# See the License for the specific language governing permissions and\n",
"# limitations under the License." "# limitations under the License."
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "WmfcMK5P5C1G" "id": "WmfcMK5P5C1G"
}, },
"source": [ "source": [
...@@ -47,30 +58,28 @@ ...@@ -47,30 +58,28 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "cH-oJ8R6AHMK" "id": "cH-oJ8R6AHMK"
}, },
"source": [ "source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
" \u003ctd\u003e\n", " <td>\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", " <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
" \u003c/td\u003e\n", " </td>\n",
" \u003ctd\u003e\n", " <td>\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", " <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
" \u003c/td\u003e\n", " </td>\n",
" \u003ctd\u003e\n", " <td>\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
" \u003c/td\u003e\n", " </td>\n",
" \u003ctd\u003e\n", " <td>\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", " <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
" \u003c/td\u003e\n", " </td>\n",
"\u003c/table\u003e" "</table>"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "0H_EFIhq4-MJ" "id": "0H_EFIhq4-MJ"
}, },
"source": [ "source": [
...@@ -82,7 +91,6 @@ ...@@ -82,7 +91,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "2N97-dps_nUk" "id": "2N97-dps_nUk"
}, },
"source": [ "source": [
...@@ -92,7 +100,6 @@ ...@@ -92,7 +100,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "459ygAVl_rg0" "id": "459ygAVl_rg0"
}, },
"source": [ "source": [
...@@ -105,21 +112,18 @@ ...@@ -105,21 +112,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "Y-qGkdh6_sZc" "id": "Y-qGkdh6_sZc"
}, },
"outputs": [],
"source": [ "source": [
"!pip install -q tf-models-official==2.3.0" "!pip install -q tf-models-official==2.4.0"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "e4huSSwyAG_5" "id": "e4huSSwyAG_5"
}, },
"source": [ "source": [
...@@ -128,25 +132,22 @@ ...@@ -128,25 +132,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "jqYXqtjBAJd9" "id": "jqYXqtjBAJd9"
}, },
"outputs": [],
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import tensorflow as tf\n", "import tensorflow as tf\n",
"\n", "\n",
"from official.nlp import modeling\n", "from official.nlp import modeling\n",
"from official.nlp.modeling import layers, losses, models, networks" "from official.nlp.modeling import layers, losses, models, networks"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "djBQWjvy-60Y" "id": "djBQWjvy-60Y"
}, },
"source": [ "source": [
...@@ -160,38 +161,34 @@ ...@@ -160,38 +161,34 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "MKuHVlsCHmiq" "id": "MKuHVlsCHmiq"
}, },
"source": [ "source": [
"### Build a `BertPretrainer` model wrapping `TransformerEncoder`\n", "### Build a `BertPretrainer` model wrapping `BertEncoder`\n",
"\n", "\n",
"The [TransformerEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/transformer_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n", "The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n",
"\n", "\n",
"The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives." "The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "EXkcXz-9BwB3" "id": "EXkcXz-9BwB3"
}, },
"outputs": [],
"source": [ "source": [
"# Build a small transformer network.\n", "# Build a small transformer network.\n",
"vocab_size = 100\n", "vocab_size = 100\n",
"sequence_length = 16\n", "sequence_length = 16\n",
"network = modeling.networks.TransformerEncoder(\n", "network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=16)" " vocab_size=vocab_size, num_layers=2, sequence_length=16)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "0NH5irV5KTMS" "id": "0NH5irV5KTMS"
}, },
"source": [ "source": [
...@@ -202,37 +199,32 @@ ...@@ -202,37 +199,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "lZNoZkBrIoff" "id": "lZNoZkBrIoff"
}, },
"outputs": [],
"source": [ "source": [
"tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)" "tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "o7eFOZXiIl-b" "id": "o7eFOZXiIl-b"
}, },
"outputs": [],
"source": [ "source": [
"# Create a BERT pretrainer with the created network.\n", "# Create a BERT pretrainer with the created network.\n",
"num_token_predictions = 8\n", "num_token_predictions = 8\n",
"bert_pretrainer = modeling.models.BertPretrainer(\n", "bert_pretrainer = modeling.models.BertPretrainer(\n",
" network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')" " network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "d5h5HT7gNHx_" "id": "d5h5HT7gNHx_"
}, },
"source": [ "source": [
...@@ -241,26 +233,20 @@ ...@@ -241,26 +233,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "2tcNfm03IBF7" "id": "2tcNfm03IBF7"
}, },
"outputs": [],
"source": [ "source": [
"tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)" "tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "F2oHrXGUIS0M" "id": "F2oHrXGUIS0M"
}, },
"outputs": [],
"source": [ "source": [
"# We can feed some dummy data to get masked language model and sentence output.\n", "# We can feed some dummy data to get masked language model and sentence output.\n",
"batch_size = 2\n", "batch_size = 2\n",
...@@ -275,12 +261,13 @@ ...@@ -275,12 +261,13 @@
"sentence_output = outputs[\"classification\"]\n", "sentence_output = outputs[\"classification\"]\n",
"print(lm_output)\n", "print(lm_output)\n",
"print(sentence_output)" "print(sentence_output)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "bnx3UCHniCS5" "id": "bnx3UCHniCS5"
}, },
"source": [ "source": [
...@@ -290,13 +277,9 @@ ...@@ -290,13 +277,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "k30H4Q86f52x" "id": "k30H4Q86f52x"
}, },
"outputs": [],
"source": [ "source": [
"masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n", "masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n",
"masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n", "masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
...@@ -311,12 +294,13 @@ ...@@ -311,12 +294,13 @@
" predictions=sentence_output)\n", " predictions=sentence_output)\n",
"loss = mlm_loss + sentence_loss\n", "loss = mlm_loss + sentence_loss\n",
"print(loss)" "print(loss)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "wrmSs8GjHxVw" "id": "wrmSs8GjHxVw"
}, },
"source": [ "source": [
...@@ -328,7 +312,6 @@ ...@@ -328,7 +312,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "k8cQVFvBCV4s" "id": "k8cQVFvBCV4s"
}, },
"source": [ "source": [
...@@ -342,38 +325,34 @@ ...@@ -342,38 +325,34 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "xrLLEWpfknUW" "id": "xrLLEWpfknUW"
}, },
"source": [ "source": [
"### Build a BertSpanLabeler wrapping TransformerEncoder\n", "### Build a BertSpanLabeler wrapping BertEncoder\n",
"\n", "\n",
"[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n", "[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n",
"\n", "\n",
"Note that `BertSpanLabeler` wraps a `TransformerEncoder`, the weights of which can be restored from the above pretraining model.\n" "Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "B941M4iUCejO" "id": "B941M4iUCejO"
}, },
"outputs": [],
"source": [ "source": [
"network = modeling.networks.TransformerEncoder(\n", "network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n", " vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n", "\n",
"# Create a BERT trainer with the created network.\n", "# Create a BERT trainer with the created network.\n",
"bert_span_labeler = modeling.models.BertSpanLabeler(network)" "bert_span_labeler = modeling.models.BertSpanLabeler(network)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "QpB9pgj4PpMg" "id": "QpB9pgj4PpMg"
}, },
"source": [ "source": [
...@@ -382,26 +361,20 @@ ...@@ -382,26 +361,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "RbqRNJCLJu4H" "id": "RbqRNJCLJu4H"
}, },
"outputs": [],
"source": [ "source": [
"tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)" "tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "fUf1vRxZJwio" "id": "fUf1vRxZJwio"
}, },
"outputs": [],
"source": [ "source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n", "# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n", "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
...@@ -412,12 +385,13 @@ ...@@ -412,12 +385,13 @@
"start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n", "start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n",
"print(start_logits)\n", "print(start_logits)\n",
"print(end_logits)" "print(end_logits)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "WqhgQaN1lt-G" "id": "WqhgQaN1lt-G"
}, },
"source": [ "source": [
...@@ -427,13 +401,9 @@ ...@@ -427,13 +401,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "waqs6azNl3Nn" "id": "waqs6azNl3Nn"
}, },
"outputs": [],
"source": [ "source": [
"start_positions = np.random.randint(sequence_length, size=(batch_size))\n", "start_positions = np.random.randint(sequence_length, size=(batch_size))\n",
"end_positions = np.random.randint(sequence_length, size=(batch_size))\n", "end_positions = np.random.randint(sequence_length, size=(batch_size))\n",
...@@ -445,12 +415,13 @@ ...@@ -445,12 +415,13 @@
"\n", "\n",
"total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n", "total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n",
"print(total_loss)" "print(total_loss)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "Zdf03YtZmd_d" "id": "Zdf03YtZmd_d"
}, },
"source": [ "source": [
...@@ -460,7 +431,6 @@ ...@@ -460,7 +431,6 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "0A1XnGSTChg9" "id": "0A1XnGSTChg9"
}, },
"source": [ "source": [
...@@ -472,38 +442,34 @@ ...@@ -472,38 +442,34 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "MSK8OpZgnQa9" "id": "MSK8OpZgnQa9"
}, },
"source": [ "source": [
"### Build a BertClassifier model wrapping TransformerEncoder\n", "### Build a BertClassifier model wrapping BertEncoder\n",
"\n", "\n",
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head." "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "cXXCsffkCphk" "id": "cXXCsffkCphk"
}, },
"outputs": [],
"source": [ "source": [
"network = modeling.networks.TransformerEncoder(\n", "network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n", " vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n", "\n",
"# Create a BERT trainer with the created network.\n", "# Create a BERT trainer with the created network.\n",
"num_classes = 2\n", "num_classes = 2\n",
"bert_classifier = modeling.models.BertClassifier(\n", "bert_classifier = modeling.models.BertClassifier(\n",
" network, num_classes=num_classes)" " network, num_classes=num_classes)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "8tZKueKYP4bB" "id": "8tZKueKYP4bB"
}, },
"source": [ "source": [
...@@ -512,26 +478,20 @@ ...@@ -512,26 +478,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "snlutm9ZJgEZ" "id": "snlutm9ZJgEZ"
}, },
"outputs": [],
"source": [ "source": [
"tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)" "tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "yyHPHsqBJkCz" "id": "yyHPHsqBJkCz"
}, },
"outputs": [],
"source": [ "source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n", "# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n", "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
...@@ -541,12 +501,13 @@ ...@@ -541,12 +501,13 @@
"# Feed the data to the model.\n", "# Feed the data to the model.\n",
"logits = bert_classifier([word_id_data, mask_data, type_id_data])\n", "logits = bert_classifier([word_id_data, mask_data, type_id_data])\n",
"print(logits)" "print(logits)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "w--a2mg4nzKm" "id": "w--a2mg4nzKm"
}, },
"source": [ "source": [
...@@ -557,45 +518,27 @@ ...@@ -557,45 +518,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"colab": {},
"colab_type": "code",
"id": "9X0S1DoFn_5Q" "id": "9X0S1DoFn_5Q"
}, },
"outputs": [],
"source": [ "source": [
"labels = np.random.randint(num_classes, size=(batch_size))\n", "labels = np.random.randint(num_classes, size=(batch_size))\n",
"\n", "\n",
"loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n", "loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
" labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n", " labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
"print(loss)" "print(loss)"
] ],
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"colab_type": "text",
"id": "mzBqOylZo3og" "id": "mzBqOylZo3og"
}, },
"source": [ "source": [
"With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example." "With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
] ]
} }
], ]
"metadata": { }
"colab": { \ No newline at end of file
"collapsed_sections": [],
"name": "Introduction to the TensorFlow Models NLP library",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
...@@ -127,6 +127,15 @@ def get_distribution_strategy(distribution_strategy="mirrored", ...@@ -127,6 +127,15 @@ def get_distribution_strategy(distribution_strategy="mirrored",
if num_gpus < 0: if num_gpus < 0:
raise ValueError("`num_gpus` can not be negative.") raise ValueError("`num_gpus` can not be negative.")
if not isinstance(distribution_strategy, str):
msg = ("distribution_strategy must be a string but got: %s." %
(distribution_strategy,))
if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison
msg += (" If you meant to pass the string 'off', make sure you add "
"quotes around 'off' so that yaml interprets it as a string "
"instead of a bool.")
raise ValueError(msg)
distribution_strategy = distribution_strategy.lower() distribution_strategy = distribution_strategy.lower()
if distribution_strategy == "off": if distribution_strategy == "off":
if num_gpus > 1: if num_gpus > 1:
......
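# A hypothetical illustration of the gotcha the new error message guards
# against: YAML 1.1 (as implemented by PyYAML) parses an unquoted `off` as
# the boolean False, so a config line `distribution_strategy: off` reaches
# get_distribution_strategy() as a bool rather than the string 'off'.
import yaml

parsed = yaml.safe_load('distribution_strategy: off')
assert parsed['distribution_strategy'] is False   # not the string 'off'
parsed = yaml.safe_load("distribution_strategy: 'off'")
assert parsed['distribution_strategy'] == 'off'   # quoting keeps it a string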
...@@ -41,6 +41,19 @@ class GetDistributionStrategyTest(tf.test.TestCase): ...@@ -41,6 +41,19 @@ class GetDistributionStrategyTest(tf.test.TestCase):
for device in ds.extended.worker_devices: for device in ds.extended.worker_devices:
self.assertIn('GPU', device) self.assertIn('GPU', device)
def test_no_strategy(self):
ds = distribute_utils.get_distribution_strategy('off')
self.assertIsNone(ds)
def test_invalid_strategy(self):
with self.assertRaisesRegexp(
ValueError,
'distribution_strategy must be a string but got: False. If'):
distribute_utils.get_distribution_strategy(False)
with self.assertRaisesRegexp(
ValueError, 'distribution_strategy must be a string but got: 1'):
distribute_utils.get_distribution_strategy(1)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
"""A common dataset reader.""" """A common dataset reader."""
import random import random
from typing import Any, Callable, Optional from typing import Any, Callable, List, Optional
from absl import logging from absl import logging
import tensorflow as tf import tensorflow as tf
...@@ -27,6 +27,13 @@ def _get_random_integer(): ...@@ -27,6 +27,13 @@ def _get_random_integer():
return random.randint(0, (1 << 31) - 1) return random.randint(0, (1 << 31) - 1)
def _maybe_map_fn(dataset: tf.data.Dataset,
fn: Optional[Callable[..., Any]] = None) -> tf.data.Dataset:
"""Calls dataset.map if a valid function is passed in."""
return dataset if fn is None else dataset.map(
fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
class InputReader: class InputReader:
"""Input reader that returns a tf.data.Dataset instance.""" """Input reader that returns a tf.data.Dataset instance."""
...@@ -74,38 +81,7 @@ class InputReader: ...@@ -74,38 +81,7 @@ class InputReader:
self._tfds_builder = None self._tfds_builder = None
self._matched_files = [] self._matched_files = []
if params.input_path: if params.input_path:
# Read dataset from files. self._matched_files = self._match_files(params.input_path)
usage = ('`input_path` should be either (1) a str indicating a file '
'path/pattern, or (2) a str indicating multiple file '
'paths/patterns separated by comma (e.g "a, b, c" or no spaces '
'"a,b,c", or (3) a list of str, each of which is a file '
'path/pattern or multiple file paths/patterns separated by '
'comma, but got: %s')
if isinstance(params.input_path, str):
input_path_list = [params.input_path]
elif isinstance(params.input_path, (list, tuple)):
if any(not isinstance(x, str) for x in params.input_path):
raise ValueError(usage % params.input_path)
input_path_list = params.input_path
else:
raise ValueError(usage % params.input_path)
for input_path in input_path_list:
input_patterns = input_path.strip().split(',')
for input_pattern in input_patterns:
input_pattern = input_pattern.strip()
if not input_pattern:
continue
if '*' in input_pattern or '?' in input_pattern:
tmp_matched_files = tf.io.gfile.glob(input_pattern)
if not tmp_matched_files:
raise ValueError('%s does not match any files.' % input_pattern)
self._matched_files.extend(tmp_matched_files)
else:
self._matched_files.append(input_pattern)
if not self._matched_files:
raise ValueError('%s does not match any files.' % params.input_path)
else: else:
# Read dataset from TFDS. # Read dataset from TFDS.
if not params.tfds_split: if not params.tfds_split:
...@@ -135,7 +111,10 @@ class InputReader: ...@@ -135,7 +111,10 @@ class InputReader:
self._parser_fn = parser_fn self._parser_fn = parser_fn
self._transform_and_batch_fn = transform_and_batch_fn self._transform_and_batch_fn = transform_and_batch_fn
self._postprocess_fn = postprocess_fn self._postprocess_fn = postprocess_fn
self._seed = _get_random_integer() # When tf.data service is enabled, each data service worker should get
# different random seeds. Thus, we set `seed` to None.
self._seed = (None
if params.enable_tf_data_service else _get_random_integer())
self._enable_tf_data_service = ( self._enable_tf_data_service = (
params.enable_tf_data_service and params.tf_data_service_address) params.enable_tf_data_service and params.tf_data_service_address)
...@@ -148,15 +127,57 @@ class InputReader: ...@@ -148,15 +127,57 @@ class InputReader:
self._enable_round_robin_tf_data_service = params.get( self._enable_round_robin_tf_data_service = params.get(
'enable_round_robin_tf_data_service', False) 'enable_round_robin_tf_data_service', False)
def _match_files(self, input_path: str) -> List[str]:
"""Matches files from an input_path."""
matched_files = []
# Read dataset from files.
usage = ('`input_path` should be either (1) a str indicating a file '
'path/pattern, or (2) a str indicating multiple file '
             'paths/patterns separated by comma (e.g. "a, b, c" or no spaces '
             '"a,b,c"), or (3) a list of str, each of which is a file '
'path/pattern or multiple file paths/patterns separated by '
'comma, but got: %s')
if isinstance(input_path, str):
input_path_list = [input_path]
elif isinstance(input_path, (list, tuple)):
if any(not isinstance(x, str) for x in input_path):
raise ValueError(usage % input_path)
input_path_list = input_path
else:
raise ValueError(usage % input_path)
for input_path in input_path_list:
input_patterns = input_path.strip().split(',')
for input_pattern in input_patterns:
input_pattern = input_pattern.strip()
if not input_pattern:
continue
if '*' in input_pattern or '?' in input_pattern:
tmp_matched_files = tf.io.gfile.glob(input_pattern)
if not tmp_matched_files:
raise ValueError('%s does not match any files.' % input_pattern)
matched_files.extend(tmp_matched_files)
else:
matched_files.append(input_pattern)
if not matched_files:
raise ValueError('%s does not match any files.' % input_path)
return matched_files
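  # A sketch of the three accepted `input_path` forms documented above
  # (placeholder paths; the glob patterns would need to match real files):
  #   'gs://bucket/data/train-*'               # (1) a single path/pattern
  #   'train-00000,train-00001, train-00002'   # (2) comma-separated patterns
  #   ['train-*', 'extra-*,eval-*']            # (3) a list of str patterns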
def _shard_files_then_read( def _shard_files_then_read(
self, input_context: Optional[tf.distribute.InputContext] = None): self,
matched_files: List[str],
dataset_fn,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Shards the data files and then sent a split to every worker to read.""" """Shards the data files and then sent a split to every worker to read."""
dataset = tf.data.Dataset.from_tensor_slices(self._matched_files) dataset = tf.data.Dataset.from_tensor_slices(matched_files)
# Shuffle and repeat at file level. # Shuffle and repeat at file level.
if self._is_training: if self._is_training:
dataset = dataset.shuffle( dataset = dataset.shuffle(
len(self._matched_files), len(matched_files),
seed=self._seed, seed=self._seed,
reshuffle_each_iteration=True) reshuffle_each_iteration=True)
...@@ -171,7 +192,7 @@ class InputReader: ...@@ -171,7 +192,7 @@ class InputReader:
dataset = dataset.repeat() dataset = dataset.repeat()
dataset = dataset.interleave( dataset = dataset.interleave(
map_func=self._dataset_fn, map_func=dataset_fn,
cycle_length=self._cycle_length, cycle_length=self._cycle_length,
block_length=self._block_length, block_length=self._block_length,
num_parallel_calls=(self._cycle_length if self._cycle_length else num_parallel_calls=(self._cycle_length if self._cycle_length else
...@@ -180,9 +201,13 @@ class InputReader: ...@@ -180,9 +201,13 @@ class InputReader:
return dataset return dataset
def _read_files_then_shard( def _read_files_then_shard(
self, input_context: Optional[tf.distribute.InputContext] = None): self,
matched_files: List[str],
dataset_fn,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Sends all data files to every worker and then shard by data.""" """Sends all data files to every worker and then shard by data."""
dataset = self._dataset_fn(self._matched_files) dataset = dataset_fn(matched_files)
# When `input_file` is a path to a single file or the number of files is # When `input_file` is a path to a single file or the number of files is
# less than the number of input pipelines, disable auto sharding # less than the number of input pipelines, disable auto sharding
...@@ -238,26 +263,35 @@ class InputReader: ...@@ -238,26 +263,35 @@ class InputReader:
raise ValueError('tfds_info is not available, because the dataset ' raise ValueError('tfds_info is not available, because the dataset '
'is not loaded from tfds.') 'is not loaded from tfds.')
def read( def _read_decode_and_parse_dataset(
self, self,
input_context: Optional[tf.distribute.InputContext] = None matched_files: List[str],
) -> tf.data.Dataset: dataset_fn,
"""Generates a tf.data.Dataset object.""" batch_size: int,
if self._tfds_builder: input_context: Optional[tf.distribute.InputContext] = None,
tfds_builder: bool = False) -> tf.data.Dataset:
"""Returns a tf.data.Dataset object after reading, decoding, and parsing."""
if tfds_builder:
dataset = self._read_tfds(input_context) dataset = self._read_tfds(input_context)
elif len(self._matched_files) > 1: elif len(matched_files) > 1:
if input_context and (len(self._matched_files) < if input_context and (len(matched_files) <
input_context.num_input_pipelines): input_context.num_input_pipelines):
logging.warn( logging.warn(
'The number of files %d is less than the number of input pipelines ' 'The number of files %d is less than the number of input pipelines '
'%d. We will send all input files to every worker. ' '%d. We will send all input files to every worker. '
'Please consider sharding your data into more files.', 'Please consider sharding your data into more files.',
len(self._matched_files), input_context.num_input_pipelines) len(matched_files), input_context.num_input_pipelines)
dataset = self._read_files_then_shard(input_context) dataset = self._read_files_then_shard(matched_files,
dataset_fn,
input_context)
else: else:
dataset = self._shard_files_then_read(input_context) dataset = self._shard_files_then_read(matched_files,
elif len(self._matched_files) == 1: dataset_fn,
dataset = self._read_files_then_shard(input_context) input_context)
elif len(matched_files) == 1:
dataset = self._read_files_then_shard(matched_files,
dataset_fn,
input_context)
else: else:
raise ValueError('It is unexpected that `tfds_builder` is None and ' raise ValueError('It is unexpected that `tfds_builder` is False and '
'there is also no `matched_files`.') 'there is also no `matched_files`.')
...@@ -268,25 +302,28 @@ class InputReader: ...@@ -268,25 +302,28 @@ class InputReader:
if self._is_training: if self._is_training:
dataset = dataset.shuffle(self._shuffle_buffer_size) dataset = dataset.shuffle(self._shuffle_buffer_size)
def maybe_map_fn(dataset, fn): dataset = _maybe_map_fn(dataset, self._decoder_fn)
return dataset if fn is None else dataset.map(
fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = maybe_map_fn(dataset, self._decoder_fn)
if self._sample_fn is not None: if self._sample_fn is not None:
dataset = dataset.apply(self._sample_fn) dataset = dataset.apply(self._sample_fn)
dataset = maybe_map_fn(dataset, self._parser_fn) dataset = _maybe_map_fn(dataset, self._parser_fn)
if self._transform_and_batch_fn is not None: if self._transform_and_batch_fn is not None:
dataset = self._transform_and_batch_fn(dataset, input_context) dataset = self._transform_and_batch_fn(dataset, input_context)
else: else:
per_replica_batch_size = input_context.get_per_replica_batch_size( per_replica_batch_size = input_context.get_per_replica_batch_size(
self._global_batch_size) if input_context else self._global_batch_size batch_size) if input_context else batch_size
dataset = dataset.batch( dataset = dataset.batch(
per_replica_batch_size, drop_remainder=self._drop_remainder) per_replica_batch_size, drop_remainder=self._drop_remainder
)
dataset = maybe_map_fn(dataset, self._postprocess_fn) return dataset
def _maybe_apply_data_service(
self,
dataset: tf.data.Dataset,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Potentially distributes a dataset."""
if self._enable_tf_data_service and input_context: if self._enable_tf_data_service and input_context:
if self._enable_round_robin_tf_data_service: if self._enable_round_robin_tf_data_service:
replicas_per_input_pipeline = input_context.num_replicas_in_sync // ( replicas_per_input_pipeline = input_context.num_replicas_in_sync // (
...@@ -316,6 +353,20 @@ class InputReader: ...@@ -316,6 +353,20 @@ class InputReader:
processing_mode='parallel_epochs', processing_mode='parallel_epochs',
service=self._tf_data_service_address, service=self._tf_data_service_address,
job_name=self._tf_data_service_job_name)) job_name=self._tf_data_service_job_name))
return dataset
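  # A hypothetical params sketch for the tf.data service branch above; the
  # field names mirror the attributes read in __init__ and may not match the
  # exact DataConfig definition:
  #   enable_tf_data_service: true
  #   tf_data_service_address: 'grpc://dispatcher:5050'   # placeholder address
  #   tf_data_service_job_name: 'train_job'
  #   enable_round_robin_tf_data_service: false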
def read(
self,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Generates a tf.data.Dataset object."""
dataset = self._read_decode_and_parse_dataset(self._matched_files,
self._dataset_fn,
self._global_batch_size,
input_context,
self._tfds_builder)
dataset = _maybe_map_fn(dataset, self._postprocess_fn)
dataset = self._maybe_apply_data_service(dataset, input_context)
if self._deterministic is not None: if self._deterministic is not None:
options = tf.data.Options() options = tf.data.Options()
......
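# A minimal usage sketch for the refactored reader, assuming a DataConfig
# `data_config` with `input_path` and `global_batch_size` set; `decode_fn`
# and `parse_fn` are hypothetical user callables.
reader = InputReader(
    params=data_config,
    dataset_fn=tf.data.TFRecordDataset,
    decoder_fn=decode_fn,
    parser_fn=parse_fn)
dataset = reader.read()  # pass a tf.distribute.InputContext when sharding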
...@@ -27,26 +27,7 @@ from official.core import config_definitions ...@@ -27,26 +27,7 @@ from official.core import config_definitions
from official.core import train_utils from official.core import train_utils
BestCheckpointExporter = train_utils.BestCheckpointExporter BestCheckpointExporter = train_utils.BestCheckpointExporter
maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter
def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig,
data_dir: str) -> Any:
"""Maybe create a BestCheckpointExporter object, according to the config."""
export_subdir = params.trainer.best_checkpoint_export_subdir
metric_name = params.trainer.best_checkpoint_eval_metric
metric_comp = params.trainer.best_checkpoint_metric_comp
if data_dir and export_subdir and metric_name:
best_ckpt_dir = os.path.join(data_dir, export_subdir)
best_ckpt_exporter = BestCheckpointExporter(
best_ckpt_dir, metric_name, metric_comp)
logging.info(
'Created the best checkpoint exporter. '
'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir,
export_subdir, metric_name)
else:
best_ckpt_exporter = None
return best_ckpt_exporter
def run_experiment(distribution_strategy: tf.distribute.Strategy, def run_experiment(distribution_strategy: tf.distribute.Strategy,
...@@ -83,7 +64,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy, ...@@ -83,7 +64,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
task, task,
train='train' in mode, train='train' in mode,
evaluate=('eval' in mode) or run_post_eval, evaluate=('eval' in mode) or run_post_eval,
checkpoint_exporter=maybe_create_best_ckpt_exporter(params, model_dir)) checkpoint_exporter=maybe_create_best_ckpt_exporter(
params, model_dir))
if trainer.checkpoint: if trainer.checkpoint:
checkpoint_manager = tf.train.CheckpointManager( checkpoint_manager = tf.train.CheckpointManager(
......
...@@ -17,7 +17,7 @@ import copy ...@@ -17,7 +17,7 @@ import copy
import json import json
import os import os
import pprint import pprint
from typing import List, Optional from typing import Any, Callable, Dict, List, Optional
from absl import logging from absl import logging
import dataclasses import dataclasses
...@@ -32,6 +32,75 @@ from official.core import exp_factory ...@@ -32,6 +32,75 @@ from official.core import exp_factory
from official.modeling import hyperparams from official.modeling import hyperparams
def get_leaf_nested_dict(
    d: Dict[str, Any], keys: List[str]) -> Any:
  """Gets a leaf from a dictionary of arbitrary depth using a list of keys.
Args:
d: The dictionary to extract value from.
keys: The list of keys to extract values recursively.
Returns:
The value of the leaf.
Raises:
KeyError: If the value of keys extracted is a dictionary.
"""
leaf = d
for k in keys:
if not isinstance(leaf, dict) or k not in leaf:
raise KeyError(
          'Path does not exist while traversing the dictionary with keys'
          ': %s.' % keys)
leaf = leaf[k]
if isinstance(leaf, dict):
raise KeyError('The value extracted with keys: %s is not a leaf of the '
'dictionary: %s.' % (keys, d))
return leaf
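# Illustrative use: extract a nested metric value with a list of keys.
example_logs = {'task_a': {'accuracy': 0.91}, 'loss': 1.2}
assert get_leaf_nested_dict(example_logs, ['task_a', 'accuracy']) == 0.91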
def cast_leaf_nested_dict(
d: Dict[str, Any],
cast_fn: Callable[[Any], Any]) -> Dict[str, Any]:
"""Cast the leaves of a dictionary with arbitrary depth in place.
Args:
d: The dictionary to extract value from.
cast_fn: The casting function.
Returns:
    A dictionary with the same structure as d.
"""
for key, value in d.items():
if isinstance(value, dict):
d[key] = cast_leaf_nested_dict(value, cast_fn)
else:
d[key] = cast_fn(value)
return d
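# Illustrative use: cast every leaf in place, as done for JSON export below.
example_logs = {'task_a': {'accuracy': '0.91'}, 'step': '100'}
cast_leaf_nested_dict(example_logs, float)
# example_logs is now {'task_a': {'accuracy': 0.91}, 'step': 100.0}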
def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig,
data_dir: str) -> Any:
"""Maybe create a BestCheckpointExporter object, according to the config."""
export_subdir = params.trainer.best_checkpoint_export_subdir
metric_name = params.trainer.best_checkpoint_eval_metric
metric_comp = params.trainer.best_checkpoint_metric_comp
if data_dir and export_subdir and metric_name:
best_ckpt_dir = os.path.join(data_dir, export_subdir)
best_ckpt_exporter = BestCheckpointExporter(
best_ckpt_dir, metric_name, metric_comp)
logging.info(
'Created the best checkpoint exporter. '
'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir,
export_subdir, metric_name)
else:
best_ckpt_exporter = None
return best_ckpt_exporter
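# The exporter is switched on by these trainer config fields (a hypothetical
# YAML sketch; `|` in the metric name selects a leaf of a nested eval_logs
# dict, per get_leaf_nested_dict above):
#   trainer:
#     best_checkpoint_export_subdir: 'best_ckpt'
#     best_checkpoint_eval_metric: 'task_a|accuracy'
#     best_checkpoint_metric_comp: 'higher'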
# TODO(b/180147589): Add tests for this module.
class BestCheckpointExporter: class BestCheckpointExporter:
"""Keeps track of the best result, and saves its checkpoint. """Keeps track of the best result, and saves its checkpoint.
...@@ -45,17 +114,32 @@ class BestCheckpointExporter: ...@@ -45,17 +114,32 @@ class BestCheckpointExporter:
Args: Args:
export_dir: The directory that will contain exported checkpoints. export_dir: The directory that will contain exported checkpoints.
metric_name: Indicates which metric to look at, when determining which metric_name: Indicates which metric to look at, when determining which
result is better. result is better. If the eval_logs passed to maybe_export_checkpoint
is a nested dictionary, use `|` as a separator between the nested keys.
metric_comp: Indicates how to compare results. Either `lower` or `higher`. metric_comp: Indicates how to compare results. Either `lower` or `higher`.
""" """
self._export_dir = export_dir self._export_dir = export_dir
self._metric_name = metric_name self._metric_name = metric_name.split('|')
self._metric_comp = metric_comp self._metric_comp = metric_comp
if self._metric_comp not in ('lower', 'higher'): if self._metric_comp not in ('lower', 'higher'):
raise ValueError('best checkpoint metric comp must be one of ' raise ValueError('best checkpoint metric comp must be one of '
'higher, lower. Got: {}'.format(self._metric_comp)) 'higher, lower. Got: {}'.format(self._metric_comp))
tf.io.gfile.makedirs(os.path.dirname(self.best_ckpt_logs_path)) tf.io.gfile.makedirs(os.path.dirname(self.best_ckpt_logs_path))
self._best_ckpt_logs = self._maybe_load_best_eval_metric() self._best_ckpt_logs = self._maybe_load_best_eval_metric()
self._checkpoint_manager = None
def _get_checkpoint_manager(self, checkpoint):
"""Gets an existing checkpoint manager or creates a new one."""
if self._checkpoint_manager is None or (
self._checkpoint_manager.checkpoint != checkpoint):
      logging.info('Creating a new checkpoint manager.')
self._checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
directory=self._export_dir,
max_to_keep=1,
checkpoint_name='best_ckpt')
return self._checkpoint_manager
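  # A direct-construction sketch (hypothetical values); with nested eval_logs
  # the metric name joins the keys with `|`:
  #   exporter = BestCheckpointExporter(
  #       export_dir='/tmp/best_ckpt_export',
  #       metric_name='task_a|accuracy',
  #       metric_comp='higher')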
def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step): def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step):
logging.info('[BestCheckpointExporter] received eval_logs: %s, at step: %d', logging.info('[BestCheckpointExporter] received eval_logs: %s, at step: %d',
...@@ -74,12 +158,10 @@ class BestCheckpointExporter: ...@@ -74,12 +158,10 @@ class BestCheckpointExporter:
def _new_metric_is_better(self, old_logs, new_logs): def _new_metric_is_better(self, old_logs, new_logs):
"""Check if the metric in new_logs is better than the metric in old_logs.""" """Check if the metric in new_logs is better than the metric in old_logs."""
if self._metric_name not in old_logs or self._metric_name not in new_logs: old_value = float(orbit.utils.get_value(
raise KeyError('best checkpoint eval metric name {} is not valid. ' get_leaf_nested_dict(old_logs, self._metric_name)))
'old_logs: {}, new_logs: {}'.format( new_value = float(orbit.utils.get_value(
self._metric_name, old_logs, new_logs)) get_leaf_nested_dict(new_logs, self._metric_name)))
old_value = float(orbit.utils.get_value(old_logs[self._metric_name]))
new_value = float(orbit.utils.get_value(new_logs[self._metric_name]))
logging.info('[BestCheckpointExporter] comparing results. old: %f, new: %f', logging.info('[BestCheckpointExporter] comparing results. old: %f, new: %f',
old_value, new_value) old_value, new_value)
...@@ -99,16 +181,13 @@ class BestCheckpointExporter: ...@@ -99,16 +181,13 @@ class BestCheckpointExporter:
"""Export evaluation results of the best checkpoint into a json file.""" """Export evaluation results of the best checkpoint into a json file."""
eval_logs_ext = copy.copy(eval_logs) eval_logs_ext = copy.copy(eval_logs)
eval_logs_ext['best_ckpt_global_step'] = global_step eval_logs_ext['best_ckpt_global_step'] = global_step
for name, value in eval_logs_ext.items(): eval_logs_ext = cast_leaf_nested_dict(
eval_logs_ext[name] = float(orbit.utils.get_value(value)) eval_logs_ext, lambda x: float(orbit.utils.get_value(x)))
# Saving json file is very fast. # Saving json file is very fast.
with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'w') as writer: with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'w') as writer:
writer.write(json.dumps(eval_logs_ext, indent=4) + '\n') writer.write(json.dumps(eval_logs_ext, indent=4) + '\n')
# Saving the best checkpoint might be interrupted if the job got killed. self._get_checkpoint_manager(checkpoint).save()
for file_to_remove in tf.io.gfile.glob(self.best_ckpt_path + '*'):
tf.io.gfile.remove(file_to_remove)
checkpoint.write(self.best_ckpt_path)
@property @property
def best_ckpt_logs(self): def best_ckpt_logs(self):
...@@ -120,7 +199,8 @@ class BestCheckpointExporter: ...@@ -120,7 +199,8 @@ class BestCheckpointExporter:
@property @property
def best_ckpt_path(self): def best_ckpt_path(self):
return os.path.join(self._export_dir, 'best_ckpt') """Returns the best ckpt path or None if there is no ckpt yet."""
return tf.train.latest_checkpoint(self._export_dir)
@gin.configurable @gin.configurable
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.core.train_utils."""
import tensorflow as tf
from official.core import train_utils
class TrainUtilsTest(tf.test.TestCase):
def test_get_leaf_nested_dict(self):
d = {'a': {'i': {'x': 5}}}
self.assertEqual(train_utils.get_leaf_nested_dict(d, ['a', 'i', 'x']), 5)
def test_get_leaf_nested_dict_not_leaf(self):
with self.assertRaisesRegex(KeyError, 'The value extracted with keys.*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i'])
def test_get_leaf_nested_dict_path_not_exist_missing_key(self):
    with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'y'])
def test_get_leaf_nested_dict_path_not_exist_out_of_range(self):
    with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': {'x': 5}}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z'])
def test_get_leaf_nested_dict_path_not_exist_meets_leaf(self):
    with self.assertRaisesRegex(KeyError, 'Path does not exist while traversing .*'):
d = {'a': {'i': 5}}
train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z'])
def test_cast_leaf_nested_dict(self):
d = {'a': {'i': {'x': '123'}}, 'b': 456.5}
d = train_utils.cast_leaf_nested_dict(d, int)
self.assertEqual(d['a']['i']['x'], 123)
self.assertEqual(d['b'], 456)
if __name__ == '__main__':
tf.test.main()
...@@ -37,16 +37,10 @@ class MultiTaskConfig(hyperparams.Config): ...@@ -37,16 +37,10 @@ class MultiTaskConfig(hyperparams.Config):
@dataclasses.dataclass @dataclasses.dataclass
class MultiEvalExperimentConfig(hyperparams.Config): class MultiEvalExperimentConfig(cfg.ExperimentConfig):
"""An experiment config for single-task training and multi-task evaluation. """An experiment config for single-task training and multi-task evaluation.
Attributes: Attributes:
task: the single-stream training task.
eval_tasks: individual evaluation tasks. eval_tasks: individual evaluation tasks.
trainer: the trainer configuration.
runtime: the runtime configuration.
""" """
task: cfg.TaskConfig = cfg.TaskConfig()
eval_tasks: MultiTaskConfig = MultiTaskConfig() eval_tasks: MultiTaskConfig = MultiTaskConfig()
trainer: cfg.TrainerConfig = cfg.TrainerConfig()
runtime: cfg.RuntimeConfig = cfg.RuntimeConfig()
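# With the change above, MultiEvalExperimentConfig inherits `task`,
# `trainer`, and `runtime` from cfg.ExperimentConfig, so only the extra
# `eval_tasks` field is declared here. A hypothetical instantiation:
experiment = MultiEvalExperimentConfig(
    task=cfg.TaskConfig(),
    eval_tasks=MultiTaskConfig())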
...@@ -21,6 +21,7 @@ import gin ...@@ -21,6 +21,7 @@ import gin
import orbit import orbit
import tensorflow as tf import tensorflow as tf
from official.core import train_utils
from official.modeling.multitask import base_model from official.modeling.multitask import base_model
from official.modeling.multitask import multitask from official.modeling.multitask import multitask
...@@ -29,16 +30,20 @@ from official.modeling.multitask import multitask ...@@ -29,16 +30,20 @@ from official.modeling.multitask import multitask
class MultiTaskEvaluator(orbit.AbstractEvaluator): class MultiTaskEvaluator(orbit.AbstractEvaluator):
"""Implements the common trainer shared for TensorFlow models.""" """Implements the common trainer shared for TensorFlow models."""
def __init__(self, def __init__(
task: multitask.MultiTask, self,
model: Union[tf.keras.Model, base_model.MultiTaskBaseModel], task: multitask.MultiTask,
global_step: Optional[tf.Variable] = None): model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
global_step: Optional[tf.Variable] = None,
checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None):
"""Initialize common trainer for TensorFlow models. """Initialize common trainer for TensorFlow models.
Args: Args:
task: A multitask.MultiTask instance. task: A multitask.MultiTask instance.
model: tf.keras.Model instance. model: tf.keras.Model instance.
global_step: the global step variable. global_step: the global step variable.
checkpoint_exporter: an object that has the `maybe_export_checkpoint`
interface.
""" """
# Gets the current distribution strategy. If not inside any strategy scope, # Gets the current distribution strategy. If not inside any strategy scope,
# it gets a single-replica no-op strategy. # it gets a single-replica no-op strategy.
...@@ -46,19 +51,10 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator): ...@@ -46,19 +51,10 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
self._task = task self._task = task
self._model = model self._model = model
self._global_step = global_step or orbit.utils.create_global_step() self._global_step = global_step or orbit.utils.create_global_step()
# TODO(hongkuny): Define a more robust way to handle the training/eval self._checkpoint_exporter = checkpoint_exporter
# checkpoint loading.
if hasattr(self.model, "checkpoint_items"):
# Each evaluation task can have different models and load a subset of
# components from the training checkpoint. This is assuming the
# checkpoint items are able to load the weights of the evaluation model.
checkpoint_items = self.model.checkpoint_items
else:
# This is assuming the evaluation model is exactly the training model.
checkpoint_items = dict(model=self.model)
self._checkpoint = tf.train.Checkpoint( self._checkpoint = tf.train.Checkpoint(
global_step=self.global_step, global_step=self.global_step,
**checkpoint_items) model=self.model)
self._validation_losses = None self._validation_losses = None
self._validation_metrics = None self._validation_metrics = None
...@@ -168,4 +164,8 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator): ...@@ -168,4 +164,8 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
metrics = task.reduce_aggregated_logs(outputs) metrics = task.reduce_aggregated_logs(outputs)
logs.update(metrics) logs.update(metrics)
results[name] = logs results[name] = logs
if self._checkpoint_exporter:
self._checkpoint_exporter.maybe_export_checkpoint(
self.checkpoint, results, self.global_step.numpy())
return results return results
...@@ -20,6 +20,7 @@ import orbit ...@@ -20,6 +20,7 @@ import orbit
import tensorflow as tf import tensorflow as tf
from official.core import base_task from official.core import base_task
from official.core import base_trainer as core_lib from official.core import base_trainer as core_lib
from official.core import train_utils
from official.modeling.multitask import configs from official.modeling.multitask import configs
from official.modeling.multitask import evaluator as evaluator_lib from official.modeling.multitask import evaluator as evaluator_lib
from official.modeling.multitask import multitask from official.modeling.multitask import multitask
...@@ -73,7 +74,9 @@ def run_experiment_with_multitask_eval( ...@@ -73,7 +74,9 @@ def run_experiment_with_multitask_eval(
evaluator = evaluator_lib.MultiTaskEvaluator( evaluator = evaluator_lib.MultiTaskEvaluator(
task=eval_tasks, task=eval_tasks,
model=model, model=model,
global_step=trainer.global_step if is_training else None) global_step=trainer.global_step if is_training else None,
checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
params, model_dir))
else: else:
evaluator = None evaluator = None
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A script to export the ALBERT core model as a TF-Hub SavedModel."""
# Import libraries
from absl import app
from absl import flags
import tensorflow as tf
from typing import Text, Tuple
from official.nlp.albert import configs
from official.nlp.bert import bert_models
FLAGS = flags.FLAGS
flags.DEFINE_string("albert_config_file", None,
"Albert configuration file to define core albert layers.")
flags.DEFINE_string("model_checkpoint_path", None,
"File path to TF model checkpoint.")
flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.")
flags.DEFINE_string(
"sp_model_file", None,
"The sentence piece model file that the ALBERT model was trained on.")
def create_albert_model(
albert_config: configs.AlbertConfig) -> Tuple[tf.keras.Model, tf.keras.Model]:
"""Creates an ALBERT keras core model from ALBERT configuration.
Args:
albert_config: An `AlbertConfig` to create the core model.
Returns:
A tuple of the core keras model and its transformer encoder.
"""
# Adds input layers just as placeholders.
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_mask")
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_type_ids")
transformer_encoder = bert_models.get_transformer_encoder(
albert_config, sequence_length=None)
sequence_output, pooled_output = transformer_encoder(
[input_word_ids, input_mask, input_type_ids])
# To stay consistent with legacy hub modules, the outputs are
# "pooled_output" and "sequence_output".
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[pooled_output, sequence_output]), transformer_encoder
def export_albert_tfhub(albert_config: configs.AlbertConfig,
model_checkpoint_path: Text, hub_destination: Text,
sp_model_file: Text):
"""Restores a tf.keras.Model and saves for TF-Hub."""
core_model, encoder = create_albert_model(albert_config)
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.restore(model_checkpoint_path).assert_consumed()
core_model.sp_model_file = tf.saved_model.Asset(sp_model_file)
core_model.save(hub_destination, include_optimizer=False, save_format="tf")
def main(_):
albert_config = configs.AlbertConfig.from_json_file(
FLAGS.albert_config_file)
export_albert_tfhub(albert_config, FLAGS.model_checkpoint_path,
FLAGS.export_path, FLAGS.sp_model_file)
if __name__ == "__main__":
app.run(main)
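A short usage sketch of the script above, calling the export function directly from Python rather than via flags; all paths are illustrative assumptions, not values shipped with the repository:

from official.nlp.albert import configs
from official.nlp.albert import export_albert_tfhub

# Assumed paths; substitute your own config, checkpoint and SentencePiece model.
albert_config = configs.AlbertConfig.from_json_file("/tmp/albert_config.json")
export_albert_tfhub.export_albert_tfhub(
    albert_config,
    model_checkpoint_path="/tmp/albert/ckpt-1",
    hub_destination="/tmp/albert_hub",
    sp_model_file="/tmp/30k-clean.model")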
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests official.nlp.albert.export_albert_tfhub."""
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from official.nlp.albert import configs
from official.nlp.albert import export_albert_tfhub
class ExportAlbertTfhubTest(tf.test.TestCase):
def test_export_albert_tfhub(self):
# Exports a SavedModel for TF-Hub.
albert_config = configs.AlbertConfig(
vocab_size=100,
embedding_size=8,
hidden_size=16,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
bert_model, encoder = export_albert_tfhub.create_albert_model(albert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
sp_model_file = os.path.join(self.get_temp_dir(), "sp_tokenizer.model")
with tf.io.gfile.GFile(sp_model_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_albert_tfhub.export_albert_tfhub(
albert_config,
model_checkpoint_path,
hub_destination,
sp_model_file=sp_model_file)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(hub_destination, trainable=True)
if hasattr(hub_layer, "resolved_object"):
with tf.io.gfile.GFile(
hub_layer.resolved_object.sp_model_file.asset_path.numpy()) as f:
self.assertEqual("dummy content", f.read())
# Checks the hub KerasLayer.
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
dummy_ids = np.zeros((2, 10), dtype=np.int32)
hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids])
source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids])
# The outputs of the hub module are "pooled_output" and "sequence_output",
# while the encoder outputs them in reversed order, i.e.,
# "sequence_output" and "pooled_output".
encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids]))
self.assertEqual(hub_outputs[0].shape, (2, 16))
self.assertEqual(hub_outputs[1].shape, (2, 10, 16))
for source_output, hub_output, encoder_output in zip(
source_outputs, hub_outputs, encoder_outputs):
self.assertAllClose(source_output.numpy(), hub_output.numpy())
self.assertAllClose(source_output.numpy(), encoder_output.numpy())
if __name__ == "__main__":
tf.test.main()
...@@ -65,6 +65,7 @@ ALBERT_NAME_REPLACEMENTS = ( ...@@ -65,6 +65,7 @@ ALBERT_NAME_REPLACEMENTS = (
("ffn_1/intermediate/output/dense", "output"), ("ffn_1/intermediate/output/dense", "output"),
("transformer/LayerNorm_1/", "transformer/output_layer_norm/"), ("transformer/LayerNorm_1/", "transformer/output_layer_norm/"),
("pooler/dense", "pooler_transform"), ("pooler/dense", "pooler_transform"),
("cls/predictions", "bert/cls/predictions"),
("cls/predictions/output_bias", "cls/predictions/output_bias/bias"), ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"),
("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"), ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"),
("cls/seq_relationship/output_weights", ("cls/seq_relationship/output_weights",
...@@ -113,6 +114,8 @@ def _create_pretrainer_model(cfg): ...@@ -113,6 +114,8 @@ def _create_pretrainer_model(cfg):
mlm_activation=tf_utils.get_activation(cfg.hidden_act), mlm_activation=tf_utils.get_activation(cfg.hidden_act),
mlm_initializer=tf.keras.initializers.TruncatedNormal( mlm_initializer=tf.keras.initializers.TruncatedNormal(
stddev=cfg.initializer_range)) stddev=cfg.initializer_range))
# Makes sure the masked_lm layer's variables in the pretrainer are created.
_ = pretrainer(pretrainer.inputs)
return pretrainer return pretrainer
......
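The `("cls/predictions", "bert/cls/predictions")` entry added above extends a table of (old, new) substring rewrites applied to TF1 variable names during conversion. A minimal sketch of how such a table is typically applied; the helper below is illustrative, not the repository's converter utility:

def convert_name(tf1_name, name_replacements):
  # Apply each (old, new) substring rewrite in table order; order matters
  # when one prefix is contained in another.
  for old, new in name_replacements:
    tf1_name = tf1_name.replace(old, new)
  return tf1_name

replacements = (("cls/predictions", "bert/cls/predictions"),)
print(convert_name("cls/predictions/output_weights", replacements))
# -> "bert/cls/predictions/output_weights"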
...@@ -12,14 +12,19 @@ ...@@ -12,14 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""A script to export the BERT core model as a TF-Hub SavedModel.""" """A script to export BERT as a TF-Hub SavedModel.
This script is **DEPRECATED** for exporting BERT encoder models;
see the error message in main() for details.
"""
from typing import Text
# Import libraries # Import libraries
from absl import app from absl import app
from absl import flags from absl import flags
from absl import logging from absl import logging
import tensorflow as tf import tensorflow as tf
from typing import Text
from official.nlp.bert import bert_models from official.nlp.bert import bert_models
from official.nlp.bert import configs from official.nlp.bert import configs
...@@ -112,6 +117,14 @@ def export_bert_squad_tfhub(bert_config: configs.BertConfig, ...@@ -112,6 +117,14 @@ def export_bert_squad_tfhub(bert_config: configs.BertConfig,
def main(_): def main(_):
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
if FLAGS.model_type == "encoder": if FLAGS.model_type == "encoder":
deprecation_note = (
"nlp/bert/export_tfhub is **DEPRECATED** for exporting BERT encoder "
"models. Please switch to nlp/tools/export_tfhub for exporting BERT "
"(and other) encoders with dict inputs/outputs conforming to "
"https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders"
)
logging.error(deprecation_note)
print("\n\nNOTICE:", deprecation_note, "\n")
export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path,
FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case) FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case)
elif FLAGS.model_type == "squad": elif FLAGS.model_type == "squad":
......
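The deprecation note points to the dict-based encoder interface defined by the linked TF-Hub common SavedModel API for transformer encoders. A small sketch of consuming an encoder exported in that style; the export path is an assumption:

import tensorflow as tf
import tensorflow_hub as hub

# Assumed path to an encoder exported with nlp/tools/export_tfhub.
encoder = hub.KerasLayer("/tmp/exported_bert_encoder", trainable=False)
encoder_inputs = dict(
    input_word_ids=tf.zeros((2, 10), tf.int32),
    input_mask=tf.ones((2, 10), tf.int32),
    input_type_ids=tf.zeros((2, 10), tf.int32))
encoder_outputs = encoder(encoder_inputs)
# Per the linked API, outputs include "pooled_output" [batch, width] and
# "sequence_output" [batch, seq_length, width].
pooled = encoder_outputs["pooled_output"]
sequence = encoder_outputs["sequence_output"]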
...@@ -116,7 +116,13 @@ def create_v2_checkpoint(model, ...@@ -116,7 +116,13 @@ def create_v2_checkpoint(model,
"""Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint.""" """Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint."""
# Uses streaming-restore in eager mode to read V1 name-based checkpoints. # Uses streaming-restore in eager mode to read V1 name-based checkpoints.
model.load_weights(src_checkpoint).assert_existing_objects_matched() model.load_weights(src_checkpoint).assert_existing_objects_matched()
checkpoint = tf.train.Checkpoint(**{checkpoint_model_name: model}) if hasattr(model, "checkpoint_items"):
checkpoint_items = model.checkpoint_items
else:
checkpoint_items = {}
checkpoint_items[checkpoint_model_name] = model
checkpoint = tf.train.Checkpoint(**checkpoint_items)
checkpoint.save(output_path) checkpoint.save(output_path)
......
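With this change, a model may expose a `checkpoint_items` property to give sub-objects their own keys in the converted V2 checkpoint. A minimal sketch of such a model, assuming illustrative class and layer names:

import tensorflow as tf

class ToyPretrainer(tf.keras.Model):
  """Illustrative model exposing `checkpoint_items` for checkpointing."""

  def __init__(self):
    super().__init__()
    self.encoder = tf.keras.layers.Dense(4, name="encoder")
    self.masked_lm = tf.keras.layers.Dense(4, name="masked_lm")

  @property
  def checkpoint_items(self):
    # Sub-objects saved under their own names, alongside the full model.
    return dict(encoder=self.encoder, masked_lm=self.masked_lm)

  def call(self, inputs):
    return self.masked_lm(self.encoder(inputs))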
...@@ -16,3 +16,4 @@ ...@@ -16,3 +16,4 @@
"""Experiments definition.""" """Experiments definition."""
# pylint: disable=unused-import # pylint: disable=unused-import
from official.nlp.configs import finetuning_experiments from official.nlp.configs import finetuning_experiments
from official.nlp.configs import pretraining_experiments
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Pretraining experiment configurations."""
# pylint: disable=g-doc-return-or-yield,line-too-long
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.data import pretrain_dataloader
from official.nlp.tasks import masked_lm
@exp_factory.register_config_factory('bert/pretraining')
def bert_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining experiment."""
config = cfg.ExperimentConfig(
task=masked_lm.MaskedLMConfig(
train_data=pretrain_dataloader.BertPretrainDataConfig(),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
is_training=False)),
trainer=cfg.TrainerConfig(
train_steps=1000000,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay': [
'LayerNorm', 'layer_norm', 'bias'
],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 1e-4,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
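Once this module is imported (it is pulled in via the experiments package above), the factory can be looked up by its registered name. A short usage sketch; the override values are illustrative:

from official.core import exp_factory
from official.nlp.configs import pretraining_experiments  # pylint: disable=unused-import

config = exp_factory.get_exp_config('bert/pretraining')
# Override nested fields through the config's dict-based `override` method.
config.override({'trainer': {'train_steps': 10000}})
print(config.trainer.optimizer_config.optimizer.type)  # 'adamw'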