Commit b1025b3b authored by syiming

Merge remote-tracking branch 'upstream/master' into fasterrcnn_fpn_keras_feature_extractor

parents 69ce1c45 e9df75ab
......@@ -32,9 +32,9 @@ from official.vision.segmentation import unet_model as unet_model_lib
UNET3D_MIN_ACCURACY = 0.90
UNET3D_MAX_ACCURACY = 0.98
UNET_TRAINING_FILES = 'unet_training_data_files'
UNET_EVAL_FILES = 'unet_eval_data_files'
UNET_MODEL_CONFIG_FILE = 'unet_model_config'
UNET_TRAINING_FILES = 'gs://mlcompass-data/unet3d/train_data/*'
UNET_EVAL_FILES = 'gs://mlcompass-data/unet3d/eval_data/*'
UNET_MODEL_CONFIG_FILE = 'gs://mlcompass-data/unet3d/config/unet_config.yaml'
FLAGS = flags.FLAGS
......
......@@ -4,64 +4,79 @@
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YN2ACivEPxgD"
"id": "vXLA5InzXydn"
},
"source": [
"## How-to Guide: Using a PIP package for fine-tuning a BERT model\n",
"\n",
"Authors: [Chen Chen](https://github.com/chenGitHuber), [Claire Yao](https://github.com/claireyao-fen)\n",
"\n",
"In this example, we will work through fine-tuning a BERT model using the tensorflow-models PIP package."
"##### Copyright 2019 The TensorFlow Authors."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"cellView": "form",
"colab": {},
"colab_type": "code",
"id": "RuRlpLL-X0R_"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "T7BBEc1-RNCQ"
"id": "1mLJmVotXs64"
},
"source": [
"## License\n",
"\n",
"Copyright 2020 The TensorFlow Authors. All Rights Reserved.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"you may not use this file except in compliance with the License.\n",
"You may obtain a copy of the License at\n",
"\n",
" http://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software\n",
"distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"See the License for the specific language governing permissions and\n",
"limitations under the License."
"# Fine-tuning a BERT model"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Pf6xzoKjywY_"
"id": "hYEwGTeCXnnX"
},
"source": [
"## Learning objectives\n",
"\n",
"In this Colab notebook, you will learn how to fine-tune a BERT model using the TensorFlow Model Garden PIP package."
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/fine_tune_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YHkmV89jRWkS"
"id": "YN2ACivEPxgD"
},
"source": [
"## Enable the GPU acceleration\n",
"Please enable GPU for better performance.\n",
"* Navigate to Edit.\n",
"* Find Notebook settings.\n",
"* Select GPU from the \"Hardware Accelerator\" drop-down list, save it."
"In this example, we will work through fine-tuning a BERT model using the tensorflow-models PIP package.\n",
"\n",
"The pretrained BERT model this tutorial is based on is also available on [TensorFlow Hub](https://tensorflow.org/hub); to see how to use it, refer to the [Hub Appendix](#hub_bert)."
]
},
{
......@@ -71,7 +86,7 @@
"id": "s2d9S2CSSO1z"
},
"source": [
"##Install and import"
"## Setup"
]
},
{
......@@ -83,7 +98,7 @@
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* tf-models-nightly is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* pip will install all models and dependencies automatically."
]
},
......@@ -97,7 +112,8 @@
},
"outputs": [],
"source": [
"!pip install tf-models-nightly"
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
]
},
{
......@@ -107,7 +123,7 @@
"id": "U-7qPCjWUAyy"
},
"source": [
"### Import Tensorflow and other libraries"
"### Imports"
]
},
{
......@@ -123,67 +139,176 @@
"import os\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import tensorflow as tf\n",
"\n",
"import tensorflow_hub as hub\n",
"import tensorflow_datasets as tfds\n",
"tfds.disable_progress_bar()\n",
"\n",
"from official.modeling import tf_utils\n",
"from official.nlp import optimization\n",
"from official.nlp.bert import configs as bert_configs\n",
"from official.nlp.bert import tokenization\n",
"from official.nlp.data import classifier_data_lib\n",
"from official.nlp.modeling import losses\n",
"from official.nlp.modeling import models\n",
"from official.nlp.modeling import networks"
"from official import nlp\n",
"from official.nlp import bert\n",
"\n",
"# Load the required submodules\n",
"import official.nlp.optimization\n",
"import official.nlp.bert.bert_models\n",
"import official.nlp.bert.configs\n",
"import official.nlp.bert.run_classifier\n",
"import official.nlp.bert.tokenization\n",
"import official.nlp.data.classifier_data_lib\n",
"import official.nlp.modeling.losses\n",
"import official.nlp.modeling.models\n",
"import official.nlp.modeling.networks"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mbanlzTvJBsz"
},
"source": [
"### Resources"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PpW0x8TpR8DT"
},
"source": [
"This directory contains the configuration, vocabulary, and a pre-trained checkpoint used in this tutorial:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "vzRHOLciR8eq"
},
"outputs": [],
"source": [
"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12\"\n",
"tf.io.gfile.listdir(gs_folder_bert)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9uFskufsR2LT"
},
"source": [
"You can get a pre-trained BERT encoder from TensorFlow Hub here:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "e0dAkUttJAzj"
},
"outputs": [],
"source": [
"hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "C2drjD7OVCmh"
"id": "Qv6abtRvH4xO"
},
"source": [
"## Preprocess the raw data and output tf.record files"
"## The data\n",
"For this example we used the [GLUE MRPC dataset from TFDS](https://www.tensorflow.org/datasets/catalog/glue#gluemrpc).\n",
"\n",
"This dataset is not set up so that it can be directly fed into the BERT model, so this section also handles the necessary preprocessing."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "qfjcKj5FYQOp"
"id": "28DvUhC1YUiB"
},
"source": [
"### Introduction of dataset\n",
"### Get the dataset from TensorFlow Datasets\n",
"\n",
"The Microsoft Research Paraphrase Corpus (Dolan \u0026 Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n",
"\n",
"* Number of labels: 2.\n",
"* Size of training dataset: 3668.\n",
"* Size of evaluation dataset: 408.\n",
"* Maximum sequence length of training and evaluation dataset: 128.\n",
"* Please refer here for details: https://www.tensorflow.org/datasets/catalog/glue#gluemrpc"
"* Maximum sequence length of training and evaluation dataset: 128.\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Ijikx5OsH9AT"
},
"outputs": [],
"source": [
"glue, info = tfds.load('glue/mrpc', with_info=True,\n",
" # It's small, load the whole dataset\n",
" batch_size=-1)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xf9zz4vLYXjr"
},
"outputs": [],
"source": [
"list(glue.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "28DvUhC1YUiB"
"id": "ZgBg2r2nYT-K"
},
"source": [
"### Get dataset from TensorFlow Datasets (TFDS)\n",
"\n",
"For example, we used the GLUE MRPC dataset from TFDS: https://www.tensorflow.org/datasets/catalog/glue#gluemrpc."
"The `info` object describes the dataset and its features:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "IQrHxv7W7jH5"
},
"outputs": [],
"source": [
"info.features"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4PhRLWh9jaXp"
"id": "vhsVWYNxazz5"
},
"source": [
"### Preprocess the data and write to TensorFlow record file\n",
"\n"
"The two classes are:"
]
},
{
......@@ -192,43 +317,21 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "FhcMdzsrjWzG"
"id": "n0gfc_VTayfQ"
},
"outputs": [],
"source": [
"gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12\"\n",
"\n",
"# Set up tokenizer to generate Tensorflow dataset\n",
"tokenizer = tokenization.FullTokenizer(\n",
" vocab_file=os.path.join(gs_folder_bert, \"vocab.txt\"), do_lower_case=True)\n",
"\n",
"# Set up processor to generate Tensorflow dataset\n",
"processor = classifier_data_lib.TfdsProcessor(\n",
" tfds_params=\"dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2\",\n",
" process_text_fn=tokenization.convert_to_unicode)\n",
"\n",
"# Set up output of training and evaluation Tensorflow dataset\n",
"train_data_output_path=\"./mrpc_train.tf_record\"\n",
"eval_data_output_path=\"./mrpc_eval.tf_record\"\n",
"\n",
"# Generate and save training data into a tf record file\n",
"input_meta_data = classifier_data_lib.generate_tf_record_from_data_file(\n",
" processor=processor,\n",
" data_dir=None, # It is `None` because data is from tfds, not local dir.\n",
" tokenizer=tokenizer,\n",
" train_data_output_path=train_data_output_path,\n",
" eval_data_output_path=eval_data_output_path,\n",
" max_seq_length=128)"
"info.features['label'].names"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dbJ76vSJj77j"
"id": "38zJcap6xkbC"
},
"source": [
"### Create tf.dataset for training and evaluation\n"
"Here is one example from the training set:"
]
},
{
......@@ -237,82 +340,38 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gCvaLLAxPuMc"
"id": "xON_i6SkwApW"
},
"outputs": [],
"source": [
"def create_classifier_dataset(file_path, seq_length, batch_size, is_training):\n",
" \"\"\"Creates input dataset from (tf)records files for train/eval.\"\"\"\n",
" dataset = tf.data.TFRecordDataset(file_path)\n",
" if is_training:\n",
" dataset = dataset.shuffle(100)\n",
" dataset = dataset.repeat()\n",
"\n",
" def decode_record(record):\n",
" name_to_features = {\n",
" 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'label_ids': tf.io.FixedLenFeature([], tf.int64),\n",
" }\n",
" return tf.io.parse_single_example(record, name_to_features)\n",
"\n",
" def _select_data_from_record(record):\n",
" x = {\n",
" 'input_word_ids': record['input_ids'],\n",
" 'input_mask': record['input_mask'],\n",
" 'input_type_ids': record['segment_ids']\n",
" }\n",
" y = record['label_ids']\n",
" return (x, y)\n",
"\n",
" dataset = dataset.map(decode_record,\n",
" num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
" dataset = dataset.map(\n",
" _select_data_from_record,\n",
" num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
" dataset = dataset.batch(batch_size, drop_remainder=is_training)\n",
" dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)\n",
" return dataset\n",
"\n",
"# Set up batch sizes\n",
"batch_size = 32\n",
"eval_batch_size = 32\n",
"\n",
"# Return Tensorflow dataset\n",
"training_dataset = create_classifier_dataset(\n",
" train_data_output_path,\n",
" input_meta_data['max_seq_length'],\n",
" batch_size,\n",
" is_training=True)\n",
"glue_train = glue['train']\n",
"\n",
"evaluation_dataset = create_classifier_dataset(\n",
" eval_data_output_path,\n",
" input_meta_data['max_seq_length'],\n",
" eval_batch_size,\n",
" is_training=False)\n"
"for key, value in glue_train.items():\n",
" print(f\"{key:9s}: {value[0].numpy()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Efrj3Cn1kLAp"
"id": "9fbTyfJpNr7x"
},
"source": [
"## Create, compile and train the model"
"### The BERT tokenizer"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "96ldxDSwkVkj"
"id": "wqeN54S61ZKQ"
},
"source": [
"### Construct a Bert Model\n",
"To fine tune a pre-trained model you need to be sure that you're using exactly the same tokenization, vocabulary, and index mapping as you used during training.\n",
"\n",
"Here, a Bert Model is constructed from the json file with parameters. The bert_config defines the core Bert Model, which is a Keras model to predict the outputs of *num_classes* from the inputs with maximum sequence length *max_seq_length*. "
"The BERT tokenizer used in this tutorial is written in pure Python (It's not built out of TensorFlow ops). So you can't just plug it into your model as a `keras.layer` like you can with `preprocessing.TextVectorization`.\n",
"\n",
"The following code rebuilds the tokenizer that was used by the base model:"
]
},
{
......@@ -321,44 +380,26 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Qgajw8WPYzJZ"
"id": "idxyhmrCQcw5"
},
"outputs": [],
"source": [
"bert_config_file = os.path.join(gs_folder_bert, \"bert_config.json\")\n",
"bert_config = bert_configs.BertConfig.from_json_file(bert_config_file)\n",
"\n",
"bert_encoder = networks.TransformerEncoder(vocab_size=bert_config.vocab_size,\n",
" hidden_size=bert_config.hidden_size,\n",
" num_layers=bert_config.num_hidden_layers,\n",
" num_attention_heads=bert_config.num_attention_heads,\n",
" intermediate_size=bert_config.intermediate_size,\n",
" activation=tf_utils.get_activation(bert_config.hidden_act),\n",
" dropout_rate=bert_config.hidden_dropout_prob,\n",
" attention_dropout_rate=bert_config.attention_probs_dropout_prob,\n",
" sequence_length=input_meta_data['max_seq_length'],\n",
" max_sequence_length=bert_config.max_position_embeddings,\n",
" type_vocab_size=bert_config.type_vocab_size,\n",
" embedding_width=bert_config.embedding_size,\n",
" initializer=tf.keras.initializers.TruncatedNormal(\n",
" stddev=bert_config.initializer_range))\n",
"\n",
"classifier_model = models.BertClassifier(\n",
" bert_encoder,\n",
" num_classes=input_meta_data['num_labels'],\n",
" dropout_rate=bert_config.hidden_dropout_prob,\n",
" initializer=tf.keras.initializers.TruncatedNormal(\n",
" stddev=bert_config.initializer_range))"
"# Set up tokenizer to generate Tensorflow dataset\n",
"tokenizer = bert.tokenization.FullTokenizer(\n",
" vocab_file=os.path.join(gs_folder_bert, \"vocab.txt\"),\n",
" do_lower_case=True)\n",
"\n",
"print(\"Vocab size:\", len(tokenizer.vocab))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "pkSq1wbNXBaa"
"id": "zYHDSquU2lDU"
},
"source": [
"### Initialize the encoder from a pretrained model"
"Tokenize a sentence:"
]
},
{
......@@ -367,26 +408,40 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "X6N9NEqfXJCx"
"id": "L_OfOYPg853R"
},
"outputs": [],
"source": [
"checkpoint = tf.train.Checkpoint(model=bert_encoder)\n",
"checkpoint.restore(\n",
" os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
"tokens = tokenizer.tokenize(\"Hello TensorFlow!\")\n",
"print(tokens)\n",
"ids = tokenizer.convert_tokens_to_ids(tokens)\n",
"print(ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "115caFLMk-_l"
"id": "kkAXLtuyWWDI"
},
"source": [
"### Set up an optimizer for the model\n",
"### Preprocess the data\n",
"\n",
"BERT model adopts the Adam optimizer with weight decay.\n",
"It also employs a learning rate schedule that firstly warms up from 0 and then decays to 0."
"This section manually preprocesses the dataset into the format expected by the model.\n",
"\n",
"This dataset is small, so preprocessing can be done quickly and easily in memory. For larger datasets, the `tf_models` library includes some tools for preprocessing and re-serializing a dataset. See [Appendix: Re-encoding a large dataset](#re_encoding_tools) for details."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "62UTWLQd9-LB"
},
"source": [
"#### Encode the sentences\n",
"\n",
"The model expects its two input sentences to be concatenated together. This input is expected to start with a `[CLS]` \"This is a classification problem\" token, and each sentence should end with a `[SEP]` \"Separator\" token:"
]
},
{
......@@ -395,45 +450,21 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2Hf2rpRXk89N"
"id": "bdL-dRNRBRJT"
},
"outputs": [],
"source": [
"# Set up epochs and steps\n",
"epochs = 3\n",
"train_data_size = input_meta_data['train_data_size']\n",
"steps_per_epoch = int(train_data_size / batch_size)\n",
"num_train_steps = steps_per_epoch * epochs\n",
"warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)\n",
"\n",
"# Create learning rate schedule that firstly warms up from 0 and they decy to 0.\n",
"lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(\n",
" initial_learning_rate=2e-5,\n",
" decay_steps=num_train_steps,\n",
" end_learning_rate=0)\n",
"lr_schedule = optimization.WarmUp(\n",
" initial_learning_rate=2e-5,\n",
" decay_schedule_fn=lr_schedule,\n",
" warmup_steps=warmup_steps)\n",
"optimizer = optimization.AdamWeightDecay(\n",
" learning_rate=lr_schedule,\n",
" weight_decay_rate=0.01,\n",
" beta_1=0.9,\n",
" beta_2=0.999,\n",
" epsilon=1e-6,\n",
" exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])"
"tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OTNcA0O0nSq9"
"id": "UrPktnqpwqie"
},
"source": [
"### Define metric_fn and loss_fn\n",
"\n",
"The metric is accuracy and we use sparse categorical cross-entropy as loss."
"Start by encoding all the sentences while appending a `[SEP]` token, and packing them into ragged-tensors:"
]
},
{
......@@ -442,27 +473,43 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ELHjRp87nVNH"
"id": "BR7BmtU498Bh"
},
"outputs": [],
"source": [
"def metric_fn():\n",
" return tf.keras.metrics.SparseCategoricalAccuracy(\n",
" 'accuracy', dtype=tf.float32)\n",
"def encode_sentence(s):\n",
" tokens = list(tokenizer.tokenize(s.numpy()))\n",
" tokens.append('[SEP]')\n",
" return tokenizer.convert_tokens_to_ids(tokens)\n",
"\n",
"def classification_loss_fn(labels, logits):\n",
" return losses.weighted_sparse_categorical_crossentropy_loss(\n",
" labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n"
"sentence1 = tf.ragged.constant([\n",
" encode_sentence(s) for s in glue_train[\"sentence1\"]])\n",
"sentence2 = tf.ragged.constant([\n",
" encode_sentence(s) for s in glue_train[\"sentence2\"]])"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "has42aUdfky-"
},
"outputs": [],
"source": [
"print(\"Sentence1 shape:\", sentence1.shape.as_list())\n",
"print(\"Sentence2 shape:\", sentence2.shape.as_list())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "78FEUOOEkoP0"
"id": "MU9lTWy_xXbb"
},
"source": [
"### Compile and train the model"
"Now prepend a `[CLS]` token, and concatenate the ragged tensors to form a single `input_word_ids` tensor for each example. `RaggedTensor.to_tensor()` zero pads to the longest sequence."
]
},
{
......@@ -471,29 +518,46 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nzi8hjeTQTRs"
"id": "USD8uihw-g4J"
},
"outputs": [],
"source": [
"classifier_model.compile(optimizer=optimizer,\n",
" loss=classification_loss_fn,\n",
" metrics=[metric_fn()])\n",
"classifier_model.fit(\n",
" x=training_dataset,\n",
" validation_data=evaluation_dataset,\n",
" steps_per_epoch=steps_per_epoch,\n",
" epochs=epochs,\n",
" validation_steps=int(input_meta_data['eval_data_size'] / eval_batch_size))"
"cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]\n",
"input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)\n",
"_ = plt.pcolormesh(input_word_ids.to_tensor())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fVo_AnT0l26j"
"id": "xmNv4l4k-dBZ"
},
"source": [
"#### Mask and input type"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "DIWjNIKq-ldh"
},
"source": [
"The model expects two additional inputs:\n",
"\n",
"* The input mask\n",
"* The input type"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ulNZ4U96-8JZ"
},
"source": [
"### Save the model"
"The mask allows the model to cleanly differentiate between the content and the padding. The mask has the same shape as the `input_word_ids`, and contains a `1` anywhere the `input_word_ids` is not padding."
]
},
{
......@@ -502,21 +566,23 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Nl5x6nElZqkP"
"id": "EezOO9qj91kP"
},
"outputs": [],
"source": [
"classifier_model.save('./saved_model', include_optimizer=False, save_format='tf')"
"input_mask = tf.ones_like(input_word_ids).to_tensor()\n",
"\n",
"plt.pcolormesh(input_mask)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nWsE6yeyfW00"
"id": "rxLenwAvCkBf"
},
"source": [
"## Use the trained model to predict\n"
"The \"input type\" also has the same shape, but inside the non-padded region, contains a `0` or a `1` indicating which sentence the token is a part of. "
]
},
{
......@@ -525,13 +591,1223 @@
"metadata": {
"colab": {},
"colab_type": "code",
"id": "vz7YJY2QYAjP"
"id": "2CetH_5C9P2m"
},
"outputs": [],
"source": [
"eval_predictions = classifier_model.predict(evaluation_dataset)\n",
"for prediction in eval_predictions:\n",
" print(\"Predicted label id: %s\" % np.argmax(prediction))"
"type_cls = tf.zeros_like(cls)\n",
"type_s1 = tf.zeros_like(sentence1)\n",
"type_s2 = tf.ones_like(sentence2)\n",
"input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()\n",
"\n",
"plt.pcolormesh(input_type_ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "P5UBnCn8Ii6s"
},
"source": [
"#### Put it all together\n",
"\n",
"Collect the above text parsing code into a single function, and apply it to each split of the `glue/mrpc` dataset."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "sDGiWYPLEd5a"
},
"outputs": [],
"source": [
"def encode_sentence(s, tokenizer):\n",
" tokens = list(tokenizer.tokenize(s))\n",
" tokens.append('[SEP]')\n",
" return tokenizer.convert_tokens_to_ids(tokens)\n",
"\n",
"def bert_encode(glue_dict, tokenizer):\n",
" num_examples = len(glue_dict[\"sentence1\"])\n",
" \n",
" sentence1 = tf.ragged.constant([\n",
" encode_sentence(s, tokenizer)\n",
" for s in np.array(glue_dict[\"sentence1\"])])\n",
" sentence2 = tf.ragged.constant([\n",
" encode_sentence(s, tokenizer)\n",
" for s in np.array(glue_dict[\"sentence2\"])])\n",
"\n",
" cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]\n",
" input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)\n",
"\n",
" input_mask = tf.ones_like(input_word_ids).to_tensor()\n",
"\n",
" type_cls = tf.zeros_like(cls)\n",
" type_s1 = tf.zeros_like(sentence1)\n",
" type_s2 = tf.ones_like(sentence2)\n",
" input_type_ids = tf.concat(\n",
" [type_cls, type_s1, type_s2], axis=-1).to_tensor()\n",
"\n",
" inputs = {\n",
" 'input_word_ids': input_word_ids.to_tensor(),\n",
" 'input_mask': input_mask,\n",
" 'input_type_ids': input_type_ids}\n",
"\n",
" return inputs"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yuLKxf6zHxw-"
},
"outputs": [],
"source": [
"glue_train = bert_encode(glue['train'], tokenizer)\n",
"glue_train_labels = glue['train']['label']\n",
"\n",
"glue_validation = bert_encode(glue['validation'], tokenizer)\n",
"glue_validation_labels = glue['validation']['label']\n",
"\n",
"glue_test = bert_encode(glue['test'], tokenizer)\n",
"glue_test_labels = glue['test']['label']"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7FC5aLVxKVKK"
},
"source": [
"Each subset of the data has been converted to a dictionary of features, and a set of labels. Each feature in the input dictionary has the same shape, and the number of labels should match:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jyjTdGpFhO_1"
},
"outputs": [],
"source": [
"for key, value in glue_train.items():\n",
" print(f'{key:15s} shape: {value.shape}')\n",
"\n",
"print(f'glue_train_labels shape: {glue_train_labels.shape}')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FSwymsbkbLDA"
},
"source": [
"## The model"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Efrj3Cn1kLAp"
},
"source": [
"### Build the model\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xxpOY5r2Ayq6"
},
"source": [
"The first step is to download the configuration for the pre-trained model.\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ujapVfZ_AKW7"
},
"outputs": [],
"source": [
"import json\n",
"\n",
"bert_config_file = os.path.join(gs_folder_bert, \"bert_config.json\")\n",
"config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())\n",
"\n",
"bert_config = bert.configs.BertConfig.from_dict(config_dict)\n",
"\n",
"config_dict"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "96ldxDSwkVkj"
},
"source": [
"The `config` defines the core BERT Model, which is a Keras model to predict the outputs of `num_classes` from the inputs with maximum sequence length `max_seq_length`.\n",
"\n",
"This function returns both the encoder and the classifier."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "cH682__U0FBv"
},
"outputs": [],
"source": [
"bert_classifier, bert_encoder = bert.bert_models.classifier_model(\n",
" bert_config, num_labels=2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "XqKp3-5GIZlw"
},
"source": [
"The classifier has three inputs and one output:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "bAQblMIjwkvx"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sFmVG4SKZAw8"
},
"source": [
"Run it on a test batch of 10 examples from the training set. The output is the logits for the two classes:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "VTjgPbp4ZDKo"
},
"outputs": [],
"source": [
"glue_batch = {key: val[:10] for key, val in glue_train.items()}\n",
"\n",
"bert_classifier(\n",
" glue_batch, training=True\n",
").numpy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Q0NTdwZsQK8n"
},
"source": [
"The `TransformerEncoder` in the center of the classifier above **is** the `bert_encoder`.\n",
"\n",
"Inspecting the encoder, we see its stack of `Transformer` layers connected to those same three inputs:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8L__-erBwLIQ"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_encoder, show_shapes=True, dpi=48)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mKAvkQc3heSy"
},
"source": [
"### Restore the encoder weights\n",
"\n",
"When built the encoder is randomly initialized. Restore the encoder's weights from the checkpoint:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "97Ll2Gichd_Y"
},
"outputs": [],
"source": [
"checkpoint = tf.train.Checkpoint(model=bert_encoder)\n",
"checkpoint.restore(\n",
" os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2oHOql35k3Dd"
},
"source": [
"Note: The pretrained `TransformerEncoder` is also available on [TensorFlow Hub](https://tensorflow.org/hub). See the [Hub appendix](#hub_bert) for details. "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "115caFLMk-_l"
},
"source": [
"### Set up the optimizer\n",
"\n",
"BERT adopts the Adam optimizer with weight decay (aka \"[AdamW](https://arxiv.org/abs/1711.05101)\").\n",
"It also employs a learning rate schedule that firstly warms up from 0 and then decays to 0."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "w8qXKRZuCwW4"
},
"outputs": [],
"source": [
"# Set up epochs and steps\n",
"epochs = 3\n",
"batch_size = 32\n",
"eval_batch_size = 32\n",
"\n",
"train_data_size = len(glue_train_labels)\n",
"steps_per_epoch = int(train_data_size / batch_size)\n",
"num_train_steps = steps_per_epoch * epochs\n",
"warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)\n",
"\n",
"# creates an optimizer with learning rate schedule\n",
"optimizer = nlp.optimization.create_optimizer(\n",
" 2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "pXRGxiRNEHS2"
},
"source": [
"This returns an `AdamWeightDecay` optimizer with the learning rate schedule set:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eQNA16bhDpky"
},
"outputs": [],
"source": [
"type(optimizer)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xqu_K71fJQB8"
},
"source": [
"To see an example of how to customize the optimizer and its schedule, see the [Optimizer schedule appendix](#optiizer_schedule)."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "78FEUOOEkoP0"
},
"source": [
"### Train the model"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "OTNcA0O0nSq9"
},
"source": [
"The metric is accuracy and we use sparse categorical cross-entropy as loss."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nzi8hjeTQTRs"
},
"outputs": [],
"source": [
"metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]\n",
"loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
"\n",
"bert_classifier.compile(\n",
" optimizer=optimizer,\n",
" loss=loss,\n",
" metrics=metrics)\n",
"\n",
"bert_classifier.fit(\n",
" glue_train, glue_train_labels,\n",
" validation_data=(glue_validation, glue_validation_labels),\n",
" batch_size=32,\n",
" epochs=epochs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IFtKFWbNKb0u"
},
"source": [
"Now run the fine-tuned model on a custom example to see that it works.\n",
"\n",
"Start by encoding some sentence pairs:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9ZoUgDUNJPz3"
},
"outputs": [],
"source": [
"my_examples = bert_encode(\n",
" glue_dict = {\n",
" 'sentence1':[\n",
" 'The rain in Spain falls mainly on the plain.',\n",
" 'Look I fine tuned BERT.'],\n",
" 'sentence2':[\n",
" 'It mostly rains on the flat lands of Spain.',\n",
" 'Is it working? This does not match.']\n",
" },\n",
" tokenizer=tokenizer)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7ynJibkBRTJF"
},
"source": [
"The model should report class `1` \"match\" for the first example and class `0` \"no-match\" for the second:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "umo0ttrgRYIM"
},
"outputs": [],
"source": [
"result = bert_classifier(my_examples, training=False)\n",
"\n",
"result = tf.argmax(result).numpy()\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "utGl0M3aZCE4"
},
"outputs": [],
"source": [
"np.array(info.features['label'].names)[result]"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fVo_AnT0l26j"
},
"source": [
"### Save the model\n",
"\n",
"Often the goal of training a model is to _use_ it for something, so export the model and then restore it to be sure that it works."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Nl5x6nElZqkP"
},
"outputs": [],
"source": [
"export_dir='./saved_model'\n",
"tf.saved_model.save(bert_classifier, export_dir=export_dir)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "y_ACvKPsVUXC"
},
"outputs": [],
"source": [
"reloaded = tf.saved_model.load(export_dir)\n",
"reloaded_result = reloaded([my_examples['input_word_ids'],\n",
" my_examples['input_mask'],\n",
" my_examples['input_type_ids']], training=False)\n",
"\n",
"original_result = bert_classifier(my_examples, training=False)\n",
"\n",
"# The results are (nearly) identical:\n",
"print(original_result.numpy())\n",
"print()\n",
"print(reloaded_result.numpy())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "eQceYqRFT_Eg"
},
"source": [
"## Appendix"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "SaC1RlFawUpc"
},
"source": [
"\u003ca id=re_encoding_tools\u003e\u003c/a\u003e\n",
"### Re-encoding a large dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "CwUdjFBkzUgh"
},
"source": [
"In this tutorial you re-encoded the dataset in memory, for clarity.\n",
"\n",
"This was only possible because `glue/mrpc` is a very small dataset. To deal with larger datasets, the `tf_models` library includes some tools for processing and re-encoding a dataset for efficient training."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2UTQrkyOT5wD"
},
"source": [
"The first step is to describe which features of the dataset should be transformed:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XQeDFOzYR9Z9"
},
"outputs": [],
"source": [
"processor = nlp.data.classifier_data_lib.TfdsProcessor(\n",
" tfds_params=\"dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2\",\n",
" process_text_fn=bert.tokenization.convert_to_unicode)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "XrFQbfErUWxa"
},
"source": [
"Then apply the transformation to generate new TFRecord files."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ymw7GOHpSHKU"
},
"outputs": [],
"source": [
"# Set up output of training and evaluation Tensorflow dataset\n",
"train_data_output_path=\"./mrpc_train.tf_record\"\n",
"eval_data_output_path=\"./mrpc_eval.tf_record\"\n",
"\n",
"max_seq_length = 128\n",
"batch_size = 32\n",
"eval_batch_size = 32\n",
"\n",
"# Generate and save training data into a tf record file\n",
"input_meta_data = (\n",
" nlp.data.classifier_data_lib.generate_tf_record_from_data_file(\n",
" processor=processor,\n",
" data_dir=None, # It is `None` because data is from tfds, not local dir.\n",
" tokenizer=tokenizer,\n",
" train_data_output_path=train_data_output_path,\n",
" eval_data_output_path=eval_data_output_path,\n",
" max_seq_length=max_seq_length))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "uX_Sp-wTUoRm"
},
"source": [
"Finally create `tf.data` input pipelines from those TFRecord files:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rkHxIK57SQ_r"
},
"outputs": [],
"source": [
"training_dataset = bert.run_classifier.get_dataset_fn(\n",
" train_data_output_path,\n",
" max_seq_length,\n",
" batch_size,\n",
" is_training=True)()\n",
"\n",
"evaluation_dataset = bert.run_classifier.get_dataset_fn(\n",
" eval_data_output_path,\n",
" max_seq_length,\n",
" eval_batch_size,\n",
" is_training=False)()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "stbaVouogvzS"
},
"source": [
"The resulting `tf.data.Datasets` return `(features, labels)` pairs, as expected by `keras.Model.fit`:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gwhrlQl4gxVF"
},
"outputs": [],
"source": [
"training_dataset.element_spec"
]
},
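{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is only a minimal usage sketch (it is not required by the rest of this appendix): the TFRecord-backed pipelines can be passed straight to `keras.Model.fit`. It reuses `bert_classifier`, `steps_per_epoch`, and `epochs` from above, and takes the evaluation size from `input_meta_data`, as the earlier in-memory version did."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: the training dataset repeats indefinitely, so explicit\n",
"# `steps_per_epoch` and `validation_steps` bound each epoch and evaluation.\n",
"bert_classifier.fit(\n",
"    training_dataset,\n",
"    validation_data=evaluation_dataset,\n",
"    steps_per_epoch=steps_per_epoch,\n",
"    validation_steps=int(input_meta_data['eval_data_size'] / eval_batch_size),\n",
"    epochs=epochs)"
]
},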
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dbJ76vSJj77j"
},
"source": [
"#### Create tf.data.Dataset for training and evaluation\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9J95LFRohiYw"
},
"source": [
"If you need to modify the data loading, here is some code to get you started:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gCvaLLAxPuMc"
},
"outputs": [],
"source": [
"def create_classifier_dataset(file_path, seq_length, batch_size, is_training):\n",
" \"\"\"Creates input dataset from (tf)records files for train/eval.\"\"\"\n",
" dataset = tf.data.TFRecordDataset(file_path)\n",
" if is_training:\n",
" dataset = dataset.shuffle(100)\n",
" dataset = dataset.repeat()\n",
"\n",
" def decode_record(record):\n",
" name_to_features = {\n",
" 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n",
" 'label_ids': tf.io.FixedLenFeature([], tf.int64),\n",
" }\n",
" return tf.io.parse_single_example(record, name_to_features)\n",
"\n",
" def _select_data_from_record(record):\n",
" x = {\n",
" 'input_word_ids': record['input_ids'],\n",
" 'input_mask': record['input_mask'],\n",
" 'input_type_ids': record['segment_ids']\n",
" }\n",
" y = record['label_ids']\n",
" return (x, y)\n",
"\n",
" dataset = dataset.map(decode_record,\n",
" num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
" dataset = dataset.map(\n",
" _select_data_from_record,\n",
" num_parallel_calls=tf.data.experimental.AUTOTUNE)\n",
" dataset = dataset.batch(batch_size, drop_remainder=is_training)\n",
" dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)\n",
" return dataset"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rutkBadrhzdR"
},
"outputs": [],
"source": [
"# Set up batch sizes\n",
"batch_size = 32\n",
"eval_batch_size = 32\n",
"\n",
"# Return Tensorflow dataset\n",
"training_dataset = create_classifier_dataset(\n",
" train_data_output_path,\n",
" input_meta_data['max_seq_length'],\n",
" batch_size,\n",
" is_training=True)\n",
"\n",
"evaluation_dataset = create_classifier_dataset(\n",
" eval_data_output_path,\n",
" input_meta_data['max_seq_length'],\n",
" eval_batch_size,\n",
" is_training=False)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "59TVgt4Z7fuU"
},
"outputs": [],
"source": [
"training_dataset.element_spec"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QbklKt-w_CiI"
},
"source": [
"\u003ca id=\"hub_bert\"\u003e\u003c/a\u003e\n",
"\n",
"### TFModels BERT on TFHub\n",
"\n",
"You can get [the BERT model](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2) off the shelf from [TFHub](https://tensorflow.org/hub). It would not be hard to add a classification head on top of this `hub.KerasLayer`"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "lo6479At4sP1"
},
"outputs": [],
"source": [
"# Note: 350MB download.\n",
"import tensorflow_hub as hub\n",
"hub_encoder = hub.KerasLayer(hub_url_bert, trainable=True)\n",
"\n",
"print(f\"The Hub encoder has {len(hub_encoder.trainable_variables)} trainable variables\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iTzF574wivQv"
},
"source": [
"Test run it on a batch of data:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "XEcYrCR45Uwo"
},
"outputs": [],
"source": [
"result = hub_encoder(\n",
" inputs=[glue_train['input_word_ids'][:10],\n",
" glue_train['input_mask'][:10],\n",
" glue_train['input_type_ids'][:10],],\n",
" training=False,\n",
")\n",
"\n",
"print(\"Pooled output shape:\", result[0].shape)\n",
"print(\"Sequence output shape:\", result[1].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "cjojn8SmLSRI"
},
"source": [
"At this point it would be simple to add a classification head yourself.\n",
"\n",
"The `bert_models.classifier_model` function can also build a classifier onto the encoder from TensorFlow Hub:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9nTDaApyLR70"
},
"outputs": [],
"source": [
"hub_classifier, hub_encoder = bert.bert_models.classifier_model(\n",
" # Caution: Most of `bert_config` is ignored if you pass a hub url.\n",
" bert_config=bert_config, hub_module_url=hub_url_bert, num_labels=2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xMJX3wV0_v7I"
},
"source": [
"The one downside to loading this model from TFHub is that the structure of internal keras layers is not restored. So it's more difficult to inspect or modify the model. The `TransformerEncoder` model is now a single layer:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "pD71dnvhM2QS"
},
"outputs": [],
"source": [
"tf.keras.utils.plot_model(hub_classifier, show_shapes=True, dpi=64)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "nLZD-isBzNKi"
},
"outputs": [],
"source": [
"try:\n",
" tf.keras.utils.plot_model(hub_encoder, show_shapes=True, dpi=64)\n",
" assert False\n",
"except Exception as e:\n",
" print(f\"{type(e).__name__}: {e}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZxSqH0dNAgXV"
},
"source": [
"\u003ca id=\"model_builder_functions\"\u003e\u003c/a\u003e\n",
"\n",
"### Low-level model building\n",
"\n",
"If you need more control over the construction of the model, it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.TransformerEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture, it may not be correct or possible to reload the pre-trained checkpoint, so you'll need to retrain from scratch."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0cgABEwDj06P"
},
"source": [
"Build the encoder:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5r_yqhBFSVEM"
},
"outputs": [],
"source": [
"transformer_config = config_dict.copy()\n",
"\n",
"# You need to rename a few fields to make this work:\n",
"transformer_config['attention_dropout_rate'] = transformer_config.pop('attention_probs_dropout_prob')\n",
"transformer_config['activation'] = tf_utils.get_activation(transformer_config.pop('hidden_act'))\n",
"transformer_config['dropout_rate'] = transformer_config.pop('hidden_dropout_prob')\n",
"transformer_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n",
" stddev=transformer_config.pop('initializer_range'))\n",
"transformer_config['max_sequence_length'] = transformer_config.pop('max_position_embeddings')\n",
"transformer_config['num_layers'] = transformer_config.pop('num_hidden_layers')\n",
"\n",
"transformer_config"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rIO8MI7LLijh"
},
"outputs": [],
"source": [
"manual_encoder = nlp.modeling.networks.TransformerEncoder(**transformer_config)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4a4tFSg9krRi"
},
"source": [
"Restore the weights:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "X6N9NEqfXJCx"
},
"outputs": [],
"source": [
"checkpoint = tf.train.Checkpoint(model=manual_encoder)\n",
"checkpoint.restore(\n",
" os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1BPiPO4ykuwM"
},
"source": [
"Test run it:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "hlVdgJKmj389"
},
"outputs": [],
"source": [
"result = manual_encoder(my_examples, training=True)\n",
"\n",
"print(\"Sequence output shape:\", result[0].shape)\n",
"print(\"Pooled output shape:\", result[1].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nJMXvVgJkyBv"
},
"source": [
"Wrap it in a classifier:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tQX57GJ6wkAb"
},
"outputs": [],
"source": [
"manual_classifier = nlp.modeling.models.BertClassifier(\n",
" bert_encoder,\n",
" num_classes=2,\n",
" dropout_rate=transformer_config['dropout_rate'],\n",
" initializer=tf.keras.initializers.TruncatedNormal(\n",
" stddev=bert_config.initializer_range))"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "kB-nBWhQk0dS"
},
"outputs": [],
"source": [
"manual_classifier(my_examples, training=True).numpy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "E6AJlOSyIO1L"
},
"source": [
"\u003ca id=\"optiizer_schedule\"\u003e\u003c/a\u003e\n",
"\n",
"### Optimizers and schedules\n",
"\n",
"The optimizer used to train the model was created using the `nlp.optimization.create_optimizer` function:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "28Dv3BPRlFTD"
},
"outputs": [],
"source": [
"optimizer = nlp.optimization.create_optimizer(\n",
" 2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LRjcHr0UlT8c"
},
"source": [
"That high level wrapper sets up the learning rate schedules and the optimizer.\n",
"\n",
"The base learning rate schedule used here is a linear decay to zero over the training run:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "MHY8K6kDngQn"
},
"outputs": [],
"source": [
"epochs = 3\n",
"batch_size = 32\n",
"eval_batch_size = 32\n",
"\n",
"train_data_size = len(glue_train_labels)\n",
"steps_per_epoch = int(train_data_size / batch_size)\n",
"num_train_steps = steps_per_epoch * epochs"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "wKIcSprulu3P"
},
"outputs": [],
"source": [
"decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(\n",
" initial_learning_rate=2e-5,\n",
" decay_steps=num_train_steps,\n",
" end_learning_rate=0)\n",
"\n",
"plt.plot([decay_schedule(n) for n in range(num_train_steps)])"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "IMTC_gfAl_PZ"
},
"source": [
"This, in turn, is wrapped in a `WarmUp` schedule that linearly increases the learning rate to the target value over the first 10% of training:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "YRt3VTmBmCBY"
},
"outputs": [],
"source": [
"warmup_steps = num_train_steps * 0.1\n",
"\n",
"warmup_schedule = nlp.optimization.WarmUp(\n",
" initial_learning_rate=2e-5,\n",
" decay_schedule_fn=decay_schedule,\n",
" warmup_steps=warmup_steps)\n",
"\n",
"# The warmup overshoots, because it warms up to the `initial_learning_rate`\n",
"# following the original implementation. You can set\n",
"# `initial_learning_rate=decay_schedule(warmup_steps)` if you don't like the\n",
"# overshoot.\n",
"plt.plot([warmup_schedule(n) for n in range(num_train_steps)])"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "l8D9Lv3Bn740"
},
"source": [
"Then create the `nlp.optimization.AdamWeightDecay` using that schedule, configured for the BERT model:"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2Hf2rpRXk89N"
},
"outputs": [],
"source": [
"optimizer = nlp.optimization.AdamWeightDecay(\n",
" learning_rate=warmup_schedule,\n",
" weight_decay_rate=0.01,\n",
" epsilon=1e-6,\n",
" exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])"
]
}
],
......@@ -539,8 +1815,10 @@
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "How-to Guide: Using a PIP package for fine-tuning a BERT model",
"provenance": []
"name": "fine_tuning_bert.ipynb",
"private_outputs": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
......
......@@ -14,15 +14,18 @@
# limitations under the License.
# ==============================================================================
"""Defines the base task abstraction."""
import abc
import functools
from typing import Any, Callable, Optional
import six
import tensorflow as tf
from official.modeling.hyperparams import config_definitions as cfg
from official.utils import registry
@six.add_metaclass(abc.ABCMeta)
class Task(tf.Module):
"""A single-replica view of training procedure.
......@@ -54,14 +57,13 @@ class Task(tf.Module):
"""
pass
@abc.abstractmethod
def build_model(self) -> tf.keras.Model:
"""Creates the model architecture.
Returns:
A model instance.
"""
# TODO(hongkuny): the base task should call network factory.
pass
def compile_model(self,
model: tf.keras.Model,
......@@ -98,6 +100,7 @@ class Task(tf.Module):
model.test_step = functools.partial(validation_step, model=model)
return model
@abc.abstractmethod
def build_inputs(self,
params: cfg.DataConfig,
input_context: Optional[tf.distribute.InputContext] = None):
......@@ -112,20 +115,19 @@ class Task(tf.Module):
Returns:
A nested structure of per-replica input functions.
"""
pass
def build_losses(self, features, model_outputs, aux_losses=None) -> tf.Tensor:
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
"""Standard interface to compute losses.
Args:
features: optional feature/labels tensors.
labels: optional label tensors.
model_outputs: a nested structure of output tensors.
aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model.
Returns:
The total loss tensor.
"""
del model_outputs, features
del model_outputs, labels
if aux_losses is None:
losses = [tf.constant(0.0, dtype=tf.float32)]
......@@ -139,29 +141,29 @@ class Task(tf.Module):
del training
return []
def process_metrics(self, metrics, labels, outputs):
def process_metrics(self, metrics, labels, model_outputs):
"""Process and update metrics. Called when using custom training loop API.
Args:
metrics: a nested structure of metrics objects.
The return of function self.build_metrics.
labels: a tensor or a nested structure of tensors.
outputs: a tensor or a nested structure of tensors.
model_outputs: a tensor or a nested structure of tensors.
For example, output of the keras model built by self.build_model.
"""
for metric in metrics:
metric.update_state(labels, outputs)
metric.update_state(labels, model_outputs)
def process_compiled_metrics(self, compiled_metrics, labels, outputs):
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
"""Process and update compiled_metrics. Called when using compile/fit API.
Args:
compiled_metrics: the compiled metrics (model.compiled_metrics).
labels: a tensor or a nested structure of tensors.
outputs: a tensor or a nested structure of tensors.
model_outputs: a tensor or a nested structure of tensors.
For example, output of the keras model built by self.build_model.
"""
compiled_metrics.update_state(labels, outputs)
compiled_metrics.update_state(labels, model_outputs)
def train_step(self,
inputs,
......@@ -187,7 +189,7 @@ class Task(tf.Module):
outputs = model(features, training=True)
# Computes per-replica loss.
loss = self.build_losses(
features=labels, model_outputs=outputs, aux_losses=model.losses)
labels=labels, model_outputs=outputs, aux_losses=model.losses)
# Scales loss as the default gradients allreduce performs sum inside the
# optimizer.
scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
......@@ -231,7 +233,7 @@ class Task(tf.Module):
features, labels = inputs, inputs
outputs = self.inference_step(features, model)
loss = self.build_losses(
features=labels, model_outputs=outputs, aux_losses=model.losses)
labels=labels, model_outputs=outputs, aux_losses=model.losses)
logs = {self.loss: loss}
if metrics:
self.process_metrics(metrics, labels, outputs)
......@@ -250,11 +252,44 @@ _REGISTERED_TASK_CLS = {}
# TODO(b/158268740): Move these outside the base class file.
def register_task_cls(task_config: cfg.TaskConfig) -> Task:
"""Register ExperimentConfig factory method."""
return registry.register(_REGISTERED_TASK_CLS, task_config)
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def register_task_cls(task_config_cls):
"""Decorates a factory of Tasks for lookup by a subclass of TaskConfig.
This decorator supports registration of tasks as follows:
```
@dataclasses.dataclass
class MyTaskConfig(TaskConfig):
# Add fields here.
pass
@register_task_cls(MyTaskConfig)
class MyTask(Task):
# Inherits def __init__(self, task_config).
pass
my_task_config = MyTaskConfig()
my_task = get_task(my_task_config) # Returns MyTask(my_task_config).
```
Besides a class itself, other callables that create a Task from a TaskConfig
can be decorated by the result of this function, as long as there is at most
one registration for each config class.
Args:
task_config_cls: a subclass of TaskConfig (*not* an instance of TaskConfig).
Each task_config_cls can only be used for a single registration.
Returns:
A callable for use as class decorator that registers the decorated class
for creation from an instance of task_config_cls.
"""
return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
def get_task_cls(task_config: cfg.TaskConfig) -> Task:
task_cls = registry.lookup(_REGISTERED_TASK_CLS, task_config)
# The user-visible get_task() is defined after classes have been registered.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def get_task_cls(task_config_cls):
task_cls = registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
return task_cls
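A minimal usage sketch of the renamed registration/lookup pair and the newly abstract `build_model`/`build_inputs` methods. `MyTaskConfig` and `MyTask` are hypothetical, and the module path `official.core.base_task` for this file is an assumption:
```
import dataclasses

import tensorflow as tf

from official.core import base_task  # Assumed module path for this file.
from official.modeling.hyperparams import config_definitions as cfg


@dataclasses.dataclass
class MyTaskConfig(cfg.TaskConfig):
  """Hypothetical config; add task-specific fields here."""


@base_task.register_task_cls(MyTaskConfig)
class MyTask(base_task.Task):
  """Hypothetical task; build_model and build_inputs are now abstract."""

  def build_model(self):
    # Toy stand-in model; a real task would build its own architecture.
    return tf.keras.Sequential([tf.keras.layers.Dense(2)])

  def build_inputs(self, params, input_context=None):
    # Toy stand-in input pipeline.
    del params, input_context
    return tf.data.Dataset.from_tensors(
        (tf.zeros([1, 4]), tf.zeros([1], tf.int32))).repeat()


# Lookup is keyed by the config class (not an instance) under the new API.
my_config = MyTaskConfig()
task_cls = base_task.get_task_cls(type(my_config))  # Returns MyTask.
my_task = task_cls(my_config)  # Task inherits __init__(self, task_config).
```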
......@@ -162,6 +162,21 @@ class CallbacksConfig(base_config.Config):
@dataclasses.dataclass
class TrainerConfig(base_config.Config):
"""Configuration for trainer.
Attributes:
optimizer_config: optimizer config, it includes optimizer, learning rate,
and warmup schedule configs.
train_tf_while_loop: whether or not to use tf while loop.
train_tf_function: whether or not to use tf_function for training loop.
eval_tf_function: whether or not to use tf_function for eval.
steps_per_loop: number of steps per loop.
summary_interval: number of steps between each summary.
checkpoint_interval: number of steps between checkpoints.
max_to_keep: max checkpoints to keep.
continuous_eval_timeout: maximum number of seconds to wait between
checkpoints. If set to None, continuous eval will wait indefinitely.
"""
optimizer_config: OptimizationConfig = OptimizationConfig()
train_tf_while_loop: bool = True
train_tf_function: bool = True
......@@ -170,6 +185,7 @@ class TrainerConfig(base_config.Config):
summary_interval: int = 1000
checkpoint_interval: int = 1000
max_to_keep: int = 5
continuous_eval_timeout: Optional[int] = None
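Since the attributes documented above are plain dataclass fields, overriding the defaults is just keyword construction; the values below are illustrative, and `steps_per_loop` is assumed to be a field as the attribute list suggests.
```
# Hedged example of overriding trainer defaults; values are illustrative.
trainer = TrainerConfig(
    steps_per_loop=500,
    summary_interval=500,
    checkpoint_interval=2000,
    max_to_keep=3,
    continuous_eval_timeout=3600)
```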
@dataclasses.dataclass
......
......@@ -230,9 +230,10 @@ def pretrain_model(bert_config,
initializer=initializer,
output='predictions')
lm_output, sentence_output = pretrainer_model(
outputs = pretrainer_model(
[input_word_ids, input_mask, input_type_ids, masked_lm_positions])
lm_output = outputs['masked_lm']
sentence_output = outputs['classification']
pretrain_loss_layer = BertPretrainLossAndMetricLayer(
vocab_size=bert_config.vocab_size)
output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
......
......@@ -111,6 +111,7 @@ def run_customized_training_loop(
model_dir=None,
train_input_fn=None,
steps_per_epoch=None,
num_eval_per_epoch=1,
steps_per_loop=None,
epochs=1,
eval_input_fn=None,
......@@ -144,6 +145,7 @@ def run_customized_training_loop(
steps_per_epoch: Number of steps to run per epoch. At the end of each
epoch, a model checkpoint will be saved and evaluation will be conducted
if an evaluation dataset is provided.
num_eval_per_epoch: Number of evaluations per epoch.
steps_per_loop: Number of steps per graph-mode loop. In order to reduce
communication in eager context, training logs are printed every
steps_per_loop steps.
......@@ -158,16 +160,17 @@ def run_customized_training_loop(
init_checkpoint: Optional checkpoint to load to `sub_model` returned by
`model_fn`.
custom_callbacks: A list of Keras Callbacks objects to run during
training. More specifically, `on_batch_begin()`, `on_batch_end()`,
`on_epoch_begin()`, `on_epoch_end()` methods are invoked during
training. Note that some metrics may be missing from `logs`.
training. More specifically, the `on_train_begin()`, `on_train_end()`,
`on_batch_begin()`, `on_batch_end()`, `on_epoch_begin()`, and
`on_epoch_end()` methods are invoked during training.
Note that some metrics may be missing from `logs`.
run_eagerly: Whether to run model training in pure eager execution. This
should be disabled for TPUStrategy.
sub_model_export_name: If not None, will export `sub_model` returned by
`model_fn` into checkpoint files. The name of intermediate checkpoint
file is {sub_model_export_name}_step_{step}.ckpt and the last
checkpoint's name is {sub_model_export_name}.ckpt;
if None, `sub_model` will not be exported as checkpoint.
checkpoint's name is {sub_model_export_name}.ckpt; if None, `sub_model`
will not be exported as checkpoint.
explicit_allreduce: Whether to explicitly perform gradient allreduce,
instead of relying on implicit allreduce in optimizer.apply_gradients().
Default is False. For now, when training with FP16 mixed precision,
......@@ -177,10 +180,10 @@ def run_customized_training_loop(
pre_allreduce_callbacks: A list of callback functions that take gradient
and model variable pairs as input, manipulate them, and return new
gradient and model variable pairs. The callback functions will be
invoked in the list order and before gradients are allreduced.
With mixed precision training, the pre_allreduce_callbacks will be
applied on scaled_gradients. Default is no callbacks.
Only used when explicit_allreduce=True.
invoked in the list order and before gradients are allreduced. With
mixed precision training, the pre_allreduce_callbacks will be applied on
scaled_gradients. Default is no callbacks. Only used when
explicit_allreduce=True.
post_allreduce_callbacks: A list of callback functions that take
gradient and model variable pairs as input, manipulate them, and
return new gradient and model variable pairs. The callback
......@@ -208,6 +211,8 @@ def run_customized_training_loop(
required_arguments = [
strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
]
steps_between_evals = int(steps_per_epoch / num_eval_per_epoch)
if [arg for arg in required_arguments if arg is None]:
raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
'`steps_per_epoch` and `train_input_fn` are required '
......@@ -216,17 +221,17 @@ def run_customized_training_loop(
if tf.config.list_logical_devices('TPU'):
# One can't fully utilize a TPU with steps_per_loop=1, so in this case
# we default to a more useful value.
steps_per_loop = min(1000, steps_per_epoch)
steps_per_loop = min(1000, steps_between_evals)
else:
steps_per_loop = 1
logging.info('steps_per_loop not specified. Using steps_per_loop=%d',
steps_per_loop)
if steps_per_loop > steps_per_epoch:
if steps_per_loop > steps_between_evals:
logging.warning(
'steps_per_loop: %d is specified to be greater than '
' steps_per_epoch: %d, we will use steps_per_epoch as'
' steps_per_loop.', steps_per_loop, steps_per_epoch)
steps_per_loop = steps_per_epoch
' steps_between_evals: %d, we will use steps_between_evals as'
' steps_per_loop.', steps_per_loop, steps_between_evals)
steps_per_loop = steps_between_evals
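Pulling the defaulting and capping rules above into one place, a hedged standalone version of the calculation looks like this:
```
# Standalone sketch of the steps_per_loop defaulting/capping logic above.
def resolve_steps_per_loop(steps_per_loop, steps_per_epoch, num_eval_per_epoch,
                           on_tpu):
  steps_between_evals = int(steps_per_epoch / num_eval_per_epoch)
  if steps_per_loop is None:
    steps_per_loop = min(1000, steps_between_evals) if on_tpu else 1
  return min(steps_per_loop, steps_between_evals)


# e.g. steps_per_epoch=20, num_eval_per_epoch=4 -> 5 steps per host loop.
assert resolve_steps_per_loop(10, 20, 4, on_tpu=False) == 5
```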
assert tf.executing_eagerly()
if run_eagerly:
......@@ -242,12 +247,9 @@ def run_customized_training_loop(
raise ValueError(
'if `metric_fn` is specified, metric_fn must be a callable.')
callback_list = tf.keras.callbacks.CallbackList(custom_callbacks)
total_training_steps = steps_per_epoch * epochs
train_iterator = _get_input_iterator(train_input_fn, strategy)
eval_loss_metric = tf.keras.metrics.Mean(
'training_loss', dtype=tf.float32)
eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
with distribution_utils.get_strategy_scope(strategy):
# To correctly place the model weights on accelerators,
......@@ -260,6 +262,9 @@ def run_customized_training_loop(
raise ValueError('sub_model_export_name is specified as %s, but '
'sub_model is None.' % sub_model_export_name)
callback_list = tf.keras.callbacks.CallbackList(
callbacks=custom_callbacks, model=model)
optimizer = model.optimizer
if init_checkpoint:
......@@ -270,8 +275,7 @@ def run_customized_training_loop(
checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
logging.info('Loading from checkpoint file completed')
train_loss_metric = tf.keras.metrics.Mean(
'training_loss', dtype=tf.float32)
train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
eval_metrics = [metric_fn()] if metric_fn else []
# If evaluation is required, make a copy of metric as it will be used by
# both train and evaluation.
......@@ -440,18 +444,20 @@ def run_customized_training_loop(
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
if latest_checkpoint_file:
logging.info(
'Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(latest_checkpoint_file)
logging.info('Loading from checkpoint file completed')
current_step = optimizer.iterations.numpy()
checkpoint_name = 'ctl_step_{step}.ckpt'
while current_step < total_training_steps:
logs = {}
callback_list.on_train_begin()
while current_step < total_training_steps and not model.stop_training:
if current_step % steps_per_epoch == 0:
callback_list.on_epoch_begin(int(current_step / steps_per_epoch) + 1)
callback_list.on_epoch_begin(
int(current_step / steps_per_epoch) + 1)
# Training loss/metrics are averaged over the steps inside the micro
# training loop. We reset their values before each round.
......@@ -461,7 +467,7 @@ def run_customized_training_loop(
callback_list.on_batch_begin(current_step)
# Runs several steps in the host while loop.
steps = steps_to_run(current_step, steps_per_epoch, steps_per_loop)
steps = steps_to_run(current_step, steps_between_evals, steps_per_loop)
if tf.config.list_physical_devices('GPU'):
# TODO(zongweiz): merge with train_steps once tf.while_loop
......@@ -470,11 +476,9 @@ def run_customized_training_loop(
train_single_step(train_iterator)
else:
# Converts steps to a Tensor to avoid tf.function retracing.
train_steps(train_iterator,
tf.convert_to_tensor(steps, dtype=tf.int32))
train_steps(train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32))
train_loss = _float_metric_value(train_loss_metric)
current_step += steps
callback_list.on_batch_end(current_step - 1, {'loss': train_loss})
# Updates training logging.
training_status = 'Train Step: %d/%d / loss = %s' % (
......@@ -492,8 +496,7 @@ def run_customized_training_loop(
'learning_rate',
optimizer.learning_rate(current_step),
step=current_step)
tf.summary.scalar(
train_loss_metric.name, train_loss, step=current_step)
tf.summary.scalar(train_loss_metric.name, train_loss, step=current_step)
for metric in train_metrics + model.metrics:
metric_value = _float_metric_value(metric)
training_status += ' %s = %f' % (metric.name, metric_value)
......@@ -501,7 +504,11 @@ def run_customized_training_loop(
summary_writer.flush()
logging.info(training_status)
if current_step % steps_per_epoch == 0:
# If no evaluation is needed, we only call on_batch_end with train_loss;
# this ensures we get granular global_step/sec on TensorBoard.
if current_step % steps_between_evals:
callback_list.on_batch_end(current_step - 1, {'loss': train_loss})
else:
# Save a submodel with the step in the file name after each epoch.
if sub_model_export_name:
_save_checkpoint(
......@@ -514,7 +521,6 @@ def run_customized_training_loop(
if current_step < total_training_steps:
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
logs = None
if eval_input_fn:
logging.info('Running evaluation after step: %s.', current_step)
logs = _run_evaluation(current_step,
......@@ -523,8 +529,15 @@ def run_customized_training_loop(
eval_loss_metric.reset_states()
for metric in eval_metrics + model.metrics:
metric.reset_states()
# We add train_loss here rather than call on_batch_end twice to make
# sure that no duplicated values are generated.
logs['loss'] = train_loss
callback_list.on_batch_end(current_step - 1, logs)
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
# Calls on_epoch_end after each real epoch ends to prevent miscalculation
# of training steps.
if current_step % steps_per_epoch == 0:
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
if sub_model_export_name:
_save_checkpoint(strategy, sub_model_checkpoint, model_dir,
......@@ -532,14 +545,11 @@ def run_customized_training_loop(
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
logs = None
if eval_input_fn:
logging.info('Running final evaluation after training is complete.')
logs = _run_evaluation(current_step,
_get_input_iterator(eval_input_fn, strategy))
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
training_summary = {
'total_training_steps': total_training_steps,
'train_loss': _float_metric_value(train_loss_metric),
......@@ -557,4 +567,6 @@ def run_customized_training_loop(
if not _should_export_summary(strategy):
tf.io.gfile.rmtree(summary_dir)
callback_list.on_train_end()
return model
......@@ -258,6 +258,7 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
loss_fn=tf.keras.losses.categorical_crossentropy,
model_dir=model_dir,
steps_per_epoch=20,
num_eval_per_epoch=4,
steps_per_loop=10,
epochs=2,
train_input_fn=input_fn,
......@@ -269,14 +270,15 @@ class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
run_eagerly=False)
self.assertEqual(callback.epoch_begin, [(1, {}), (2, {})])
epoch_ends, epoch_end_infos = zip(*callback.epoch_end)
self.assertEqual(list(epoch_ends), [1, 2])
self.assertEqual(list(epoch_ends), [1, 2, 2])
for info in epoch_end_infos:
self.assertIn('accuracy', info)
self.assertEqual(callback.batch_begin,
[(0, {}), (10, {}), (20, {}), (30, {})])
self.assertEqual(callback.batch_begin, [(0, {}), (5, {}), (10, {}),
(15, {}), (20, {}), (25, {}),
(30, {}), (35, {})])
batch_ends, batch_end_infos = zip(*callback.batch_end)
self.assertEqual(list(batch_ends), [9, 19, 29, 39])
self.assertEqual(list(batch_ends), [4, 9, 14, 19, 24, 29, 34, 39])
for info in batch_end_infos:
self.assertIn('loss', info)
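The expectations in this test follow from the loop arithmetic: with steps_per_epoch=20, epochs=2 and num_eval_per_epoch=4, steps_between_evals is 5, so the requested steps_per_loop=10 is capped to 5; epoch_ends is [1, 2, 2] because on_epoch_end fires after each real epoch and once more after the final evaluation. A hedged re-derivation of the batch boundaries:
```
# Hedged re-derivation of the expected callback boundaries (not part of the
# test itself).
steps_per_epoch, epochs, num_eval_per_epoch = 20, 2, 4
steps_between_evals = steps_per_epoch // num_eval_per_epoch   # 5
steps_per_loop = min(10, steps_between_evals)                 # capped to 5
batch_begins = list(range(0, steps_per_epoch * epochs, steps_per_loop))
batch_ends = [begin + steps_per_loop - 1 for begin in batch_begins]
assert batch_begins == [0, 5, 10, 15, 20, 25, 30, 35]
assert batch_ends == [4, 9, 14, 19, 24, 29, 34, 39]
```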
......
......@@ -45,6 +45,9 @@ assemble new layers, networks, or models.
should be masked), the output will have masked positions set to
approximately zero.
* [`MaskedLM`](masked_lm.py) implements a masked language model. It assumes the
embedding table variable is passed to it.
* [`ClassificationHead`](cls_head.py) implements a pooling head over a sequence
of embeddings, commonly used by classification tasks.
......
......@@ -18,6 +18,7 @@ from official.nlp.modeling.layers.attention import *
from official.nlp.modeling.layers.cls_head import *
from official.nlp.modeling.layers.dense_einsum import DenseEinsum
from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
from official.nlp.modeling.layers.masked_lm import MaskedLM
from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.modeling.layers.position_embedding import PositionEmbedding
......
......@@ -25,91 +25,74 @@ from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.Model):
class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling.
This network implements a masked language model based on the provided network.
It assumes that the network being passed has a "get_embedding_table()" method.
Arguments:
input_width: The innermost dimension of the input tensor to this network.
num_predictions: The number of predictions to make per sequence.
source_network: The network with the embedding layer to use for the
embedding layer.
embedding_table: The embedding table of a source network, If None, the
`source_network.get_embedding_table()` method is used.
activation: The activation, if any, for the dense layer in this network.
initializer: The initializer for the dense layer in this network. Defaults to
a Glorot uniform initializer.
embedding_table: The embedding table of the targets.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this network. Can be either 'logits' or
'predictions'.
"""
def __init__(self,
input_width,
num_predictions,
source_network,
embedding_table=None,
embedding_table,
activation=None,
initializer='glorot_uniform',
output='logits',
name='cls/predictions',
**kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if embedding_table is None:
embedding_table = source_network.get_embedding_table()
vocab_size, hidden_size = embedding_table.shape
sequence_data = tf.keras.layers.Input(
shape=(None, input_width), name='sequence_data', dtype=tf.float32)
masked_lm_positions = tf.keras.layers.Input(
shape=(num_predictions,), name='masked_lm_positions', dtype=tf.int32)
masked_lm_input = tf.keras.layers.Lambda(
lambda x: self._gather_indexes(x[0], x[1]))(
[sequence_data, masked_lm_positions])
lm_data = (
tf.keras.layers.Dense(
hidden_size,
activation=activation,
kernel_initializer=initializer,
name='cls/predictions/transform/dense')(masked_lm_input))
lm_data = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='cls/predictions/transform/LayerNorm')(
lm_data)
lm_data = tf.keras.layers.Lambda(
lambda x: tf.matmul(x, embedding_table, transpose_b=True))(
lm_data)
logits = Bias(
initializer=tf.keras.initializers.Zeros(),
name='cls/predictions/output_bias')(
lm_data)
# We can't use the standard Keras reshape layer here, since it expects
# the input and output batch size to be the same.
reshape_layer = tf.keras.layers.Lambda(
lambda x: tf.reshape(x, [-1, num_predictions, vocab_size]))
self.logits = reshape_layer(logits)
predictions = tf.keras.layers.Activation(tf.nn.log_softmax)(self.logits)
if output == 'logits':
output_tensors = self.logits
elif output == 'predictions':
output_tensors = predictions
else:
if output not in ('predictions', 'logits'):
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
self._output_type = output
super(MaskedLM, self).__init__(
inputs=[sequence_data, masked_lm_positions],
outputs=output_tensors,
**kwargs)
def build(self, input_shape):
self._vocab_size, hidden_size = self.embedding_table.shape
self.dense = tf.keras.layers.Dense(
hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_shape = tf_utils.get_shape_list(
masked_positions, name='masked_positions_tensor')
logits = tf.reshape(logits,
[-1, masked_positions_shape[1], self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized at this '
'time. Please use it only in Layers or '
'functionally subclassed Models/Networks.')
raise NotImplementedError('MaskedLM cannot be directly serialized because '
'it has variable sharing logic.')
def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions.
......@@ -139,51 +122,3 @@ class MaskedLM(tf.keras.Model):
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
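For orientation, here is a hedged usage sketch of the layer defined above; the embedding table is a plain variable here, whereas in practice it would come from an encoder's `get_embedding_table()`, and all sizes are illustrative.
```
# Hedged usage sketch of MaskedLM.
import tensorflow as tf

vocab_size, hidden_size, num_predictions = 100, 16, 4
embedding_table = tf.Variable(tf.random.normal([vocab_size, hidden_size]))
layer = MaskedLM(embedding_table=embedding_table, output='predictions')

sequence_output = tf.random.normal([2, 10, hidden_size])  # (batch, seq, hidden)
masked_positions = tf.constant([[1, 3, 5, 7], [0, 2, 4, 6]])
log_probs = layer(sequence_output, masked_positions=masked_positions)
# log_probs has shape (2, num_predictions, vocab_size).
```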
@tf.keras.utils.register_keras_serializable(package='Text')
# Temporary until we can create a Dense layer that ties the embedding.
class Bias(tf.keras.layers.Layer):
"""Adds a bias term to an input."""
def __init__(self,
initializer='zeros',
regularizer=None,
constraint=None,
activation=None,
**kwargs):
super(Bias, self).__init__(**kwargs)
self._initializer = tf.keras.initializers.get(initializer)
self._regularizer = tf.keras.regularizers.get(regularizer)
self._constraint = tf.keras.constraints.get(constraint)
self._activation = tf.keras.activations.get(activation)
def build(self, input_shape):
input_shape = tf.TensorShape(input_shape)
self._bias = self.add_weight(
'bias',
shape=input_shape[1:],
initializer=self._initializer,
regularizer=self._regularizer,
constraint=self._constraint,
dtype=self._dtype,
trainable=True)
super(Bias, self).build(input_shape)
def get_config(self):
config = {
'activation': tf.keras.activations.serialize(self._activation),
'initializer': tf.keras.initializers.serialize(self._initializer),
'regularizer': tf.keras.regularizers.serialize(self._regularizer),
'constraint': tf.keras.constraints.serialize(self._constraint)
}
base_config = super(Bias, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
outputs = tf.nn.bias_add(inputs, self._bias)
if self._activation is not None:
return self._activation(outputs) # pylint: disable=not-callable
else:
return outputs
......@@ -23,7 +23,7 @@ import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import masked_lm
from official.nlp.modeling.layers import masked_lm
from official.nlp.modeling.networks import transformer_encoder
......@@ -32,13 +32,12 @@ from official.nlp.modeling.networks import transformer_encoder
@keras_parameterized.run_all_keras_modes
class MaskedLMTest(keras_parameterized.TestCase):
def create_network(self,
vocab_size,
sequence_length,
hidden_size,
num_predictions,
output='predictions',
xformer_stack=None):
def create_layer(self,
vocab_size,
sequence_length,
hidden_size,
output='predictions',
xformer_stack=None):
# First, create a transformer stack that we can use to get the LM's
# vocabulary weight.
if xformer_stack is None:
......@@ -49,82 +48,32 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size=hidden_size,
num_attention_heads=4,
)
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])
# Create a maskedLM from the transformer stack.
test_network = masked_lm.MaskedLM(
num_predictions=num_predictions,
input_width=lm_outputs.shape[-1],
source_network=xformer_stack,
test_layer = masked_lm.MaskedLM(
embedding_table=xformer_stack.get_embedding_table(),
output=output)
return test_network
return test_layer
def test_network_creation(self):
def test_layer_creation(self):
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
test_network = self.create_network(
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
hidden_size=hidden_size)
# Make sure that the output tensor of the masked LM is the right shape.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
output = test_layer(lm_input_tensor, masked_positions=masked_positions)
expected_output_shape = [None, num_predictions, vocab_size]
self.assertEqual(expected_output_shape, output.shape.as_list())
def test_network_invocation_with_internal_logits(self):
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
test_network = self.create_network(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
logits_model = tf.keras.Model(test_network.inputs, test_network.logits)
# Invoke the masked LM on some fake data to make sure there are no runtime
# errors in the code.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
outputs = model.predict([lm_input_data, masked_position_data])
logits = logits_model.predict([lm_input_data, masked_position_data])
# Ensure that the tensor shapes are correct.
expected_output_shape = (batch_size, num_predictions, vocab_size)
self.assertEqual(expected_output_shape, outputs.shape)
self.assertEqual(expected_output_shape, logits.shape)
# Ensure that the logits, when softmaxed, create the outputs.
input_tensor = tf.keras.Input(expected_output_shape[1:])
output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
softmax_model = tf.keras.Model(input_tensor, output_tensor)
calculated_softmax = softmax_model.predict(logits)
self.assertAllClose(outputs, calculated_softmax)
def test_network_invocation_with_external_logits(self):
def test_layer_invocation_with_external_logits(self):
vocab_size = 100
sequence_length = 32
hidden_size = 64
......@@ -136,31 +85,28 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size=hidden_size,
num_attention_heads=4,
)
test_network = self.create_network(
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions,
xformer_stack=xformer_stack,
output='predictions')
logit_network = self.create_network(
logit_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions,
xformer_stack=xformer_stack,
output='logits')
logit_network.set_weights(test_network.get_weights())
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
logit_output = logit_network([lm_input_tensor, masked_lm_positions])
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
logits_model = tf.keras.Model(([lm_input_tensor, masked_lm_positions]),
masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
output = test_layer(lm_input_tensor, masked_positions)
logit_output = logit_layer(lm_input_tensor, masked_positions)
logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
logit_layer.set_weights(test_layer.get_weights())
model = tf.keras.Model([lm_input_tensor, masked_positions], output)
logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
logit_output)
# Invoke the masked LM on some fake data to make sure there are no runtime
......@@ -169,40 +115,33 @@ class MaskedLMTest(keras_parameterized.TestCase):
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
outputs = model.predict([lm_input_data, masked_position_data])
logits = logits_model.predict([lm_input_data, masked_position_data])
sequence_length, size=(batch_size, num_predictions))
# ref_outputs = model.predict([lm_input_data, masked_position_data])
# outputs = logits_model.predict([lm_input_data, masked_position_data])
ref_outputs = model([lm_input_data, masked_position_data])
outputs = logits_model([lm_input_data, masked_position_data])
# Ensure that the tensor shapes are correct.
expected_output_shape = (batch_size, num_predictions, vocab_size)
self.assertEqual(expected_output_shape, ref_outputs.shape)
self.assertEqual(expected_output_shape, outputs.shape)
self.assertEqual(expected_output_shape, logits.shape)
self.assertAllClose(ref_outputs, outputs)
# Ensure that the logits, when softmaxed, create the outputs.
input_tensor = tf.keras.Input(expected_output_shape[1:])
output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
softmax_model = tf.keras.Model(input_tensor, output_tensor)
calculated_softmax = softmax_model.predict(logits)
self.assertAllClose(outputs, calculated_softmax)
def test_network_invocation(self):
def test_layer_invocation(self):
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
test_network = self.create_network(
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
hidden_size=hidden_size)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
output = test_layer(lm_input_tensor, masked_positions)
model = tf.keras.Model([lm_input_tensor, masked_positions], output)
# Invoke the masked LM on some fake data to make sure there are no runtime
# errors in the code.
......@@ -215,12 +154,8 @@ class MaskedLMTest(keras_parameterized.TestCase):
def test_unknown_output_type_fails(self):
with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
_ = self.create_network(
vocab_size=8,
sequence_length=8,
hidden_size=8,
num_predictions=8,
output='bad')
_ = self.create_layer(
vocab_size=8, sequence_length=8, hidden_size=8, output='bad')
if __name__ == '__main__':
......
......@@ -23,6 +23,7 @@ import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy
......@@ -48,20 +49,18 @@ class ClassificationLossTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])
_ = xformer_stack([word_ids, mask, type_ids])
# Create a maskedLM from the transformer stack.
test_network = networks.MaskedLM(
num_predictions=num_predictions,
input_width=lm_outputs.shape[-1],
source_network=xformer_stack,
test_layer = layers.MaskedLM(
embedding_table=xformer_stack.get_embedding_table(),
output=output)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
output = test_layer(lm_input_tensor, masked_positions=masked_lm_positions)
return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
def create_classification_model(self, input_width, num_classes):
......
......@@ -25,6 +25,7 @@ from typing import List, Optional
import gin
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
......@@ -47,8 +48,8 @@ class BertPretrainer(tf.keras.Model):
num_token_predictions: Number of tokens to predict from the masked LM.
embedding_table: Embedding table of a network. If None, the
"network.get_embedding_table()" is used.
activation: The activation (if any) to use in the masked LM network.
If None, no activation will be used.
activation: The activation (if any) to use in the masked LM network. If
None, no activation will be used.
initializer: The initializer (if any) to use in the masked LM and
classification networks. Defaults to a Glorot uniform initializer.
output: The output style for this network. Can be either 'logits' or
......@@ -106,16 +107,16 @@ class BertPretrainer(tf.keras.Model):
dtype=tf.int32)
inputs.append(masked_lm_positions)
self.masked_lm = networks.MaskedLM(
num_predictions=num_token_predictions,
input_width=sequence_output.shape[-1],
source_network=network,
if embedding_table is None:
embedding_table = self.encoder.get_embedding_table()
self.masked_lm = layers.MaskedLM(
embedding_table=embedding_table,
activation=activation,
initializer=initializer,
output=output,
name='masked_lm')
lm_outputs = self.masked_lm([sequence_output, masked_lm_positions])
name='cls/predictions')
lm_outputs = self.masked_lm(
sequence_output, masked_positions=masked_lm_positions)
self.classification = networks.Classification(
input_width=cls_output.shape[-1],
......@@ -126,7 +127,9 @@ class BertPretrainer(tf.keras.Model):
sentence_outputs = self.classification(cls_output)
super(BertPretrainer, self).__init__(
inputs=inputs, outputs=[lm_outputs, sentence_outputs], **kwargs)
inputs=inputs,
outputs=dict(masked_lm=lm_outputs, classification=sentence_outputs),
**kwargs)
def get_config(self):
return self._config
......@@ -151,8 +154,8 @@ class BertPretrainerV2(tf.keras.Model):
num_masked_tokens: Number of tokens to predict from the masked LM.
encoder_network: A transformer network. This network should output a
sequence output and a classification output.
mlm_activation: The activation (if any) to use in the masked LM network.
If None, no activation will be used.
mlm_activation: The activation (if any) to use in the masked LM network. If
None, no activation will be used.
mlm_initializer: The initializer (if any) to use in the masked LM. Default
to a Glorot uniform initializer.
classification_heads: A list of optional head layers to transform on encoder
......@@ -193,17 +196,18 @@ class BertPretrainerV2(tf.keras.Model):
outputs = dict()
if num_masked_tokens > 0:
self.masked_lm = networks.MaskedLM(
num_predictions=num_masked_tokens,
input_width=sequence_output.shape[-1],
source_network=self.encoder_network,
self.masked_lm = layers.MaskedLM(
embedding_table=self.encoder_network.get_embedding_table(),
activation=mlm_activation,
initializer=mlm_initializer,
name='masked_lm')
masked_lm_positions = copy.copy(self.masked_lm.inputs[-1])
name='cls/predictions')
masked_lm_positions = tf.keras.layers.Input(
shape=(num_masked_tokens,),
name='masked_lm_positions',
dtype=tf.int32)
inputs.append(masked_lm_positions)
outputs['lm_output'] = self.masked_lm(
[sequence_output, masked_lm_positions])
sequence_output, masked_positions=masked_lm_positions)
for cls_head in self.classification_heads:
outputs[cls_head.name] = cls_head(sequence_output)
......
......@@ -50,16 +50,19 @@ class BertPretrainerTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
masked_lm_positions = tf.keras.Input(
shape=(num_token_predictions,), dtype=tf.int32)
# Invoke the trainer model on the inputs. This causes the layer to be built.
lm_outs, cls_outs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
outputs = bert_trainer_model(
[word_ids, mask, type_ids, masked_lm_positions])
# Validate that the outputs are of the expected shape.
expected_lm_shape = [None, num_token_predictions, vocab_size]
expected_classification_shape = [None, num_classes]
self.assertAllEqual(expected_lm_shape, lm_outs.shape.as_list())
self.assertAllEqual(expected_classification_shape, cls_outs.shape.as_list())
self.assertAllEqual(expected_lm_shape, outputs['masked_lm'].shape.as_list())
self.assertAllEqual(expected_classification_shape,
outputs['classification'].shape.as_list())
def test_bert_trainer_tensor_call(self):
"""Validate that the Keras object can be invoked."""
......@@ -81,7 +84,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
# Invoke the trainer model on the tensors. In Eager mode, this does the
# actual calculation. (We can't validate the outputs, since the network is
# too complex: this simply ensures we're not hitting runtime errors.)
_, _ = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
_ = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
def test_serialize_deserialize(self):
"""Validate that the BERT trainer can be serialized and deserialized."""
......@@ -123,7 +126,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(num_token_predictions,), dtype=tf.int32)
# Invoke the trainer model on the inputs. This causes the layer to be built.
outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
......
......@@ -16,8 +16,6 @@ Self-supervised Learning of Language Representations]
(https://arxiv.org/abs/1909.11942). Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT factorizes embedding parameters
into two smaller matrices and shares parameters across layers.
* [`MaskedLM`](masked_lm.py) implements a masked language model for BERT pretraining. It assumes that the network being passed has a `get_embedding_table()` method.
* [`Classification`](classification.py) contains a single hidden layer, and is
intended for use as a classification or regression (if number of classes is set
to 1) head.
......
......@@ -16,7 +16,6 @@
from official.nlp.modeling.networks.albert_transformer_encoder import AlbertTransformerEncoder
from official.nlp.modeling.networks.classification import Classification
from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
from official.nlp.modeling.networks.masked_lm import MaskedLM
from official.nlp.modeling.networks.span_labeling import SpanLabeling
from official.nlp.modeling.networks.token_classification import TokenClassification
from official.nlp.modeling.networks.transformer_encoder import TransformerEncoder
......@@ -43,25 +43,25 @@ class MaskedLMTask(base_task.Task):
return bert.instantiate_from_cfg(self.task_config.network)
def build_losses(self,
features,
labels,
model_outputs,
metrics,
aux_losses=None) -> tf.Tensor:
metrics = dict([(metric.name, metric) for metric in metrics])
lm_output = tf.nn.log_softmax(model_outputs['lm_output'], axis=-1)
mlm_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=features['masked_lm_ids'],
labels=labels['masked_lm_ids'],
predictions=lm_output,
weights=features['masked_lm_weights'])
weights=labels['masked_lm_weights'])
metrics['lm_example_loss'].update_state(mlm_loss)
if 'next_sentence_labels' in features:
if 'next_sentence_labels' in labels:
policy = tf.keras.mixed_precision.experimental.global_policy()
if policy.name == 'mixed_bfloat16': # b/158514794: bf16 is not stable.
policy = tf.float32
predictions = tf.keras.layers.Activation(
tf.nn.log_softmax, dtype=policy)(model_outputs['next_sentence'])
sentence_labels = features['next_sentence_labels']
sentence_labels = labels['next_sentence_labels']
sentence_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=sentence_labels,
predictions=predictions)
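For reference, the loss above consumes a label dictionary with the keys used in this method; the batch below is a hypothetical illustration of that structure (shapes are illustrative only).
```
# Hypothetical batch illustrating the label keys consumed by build_losses.
import tensorflow as tf

labels = {
    'masked_lm_ids': tf.constant([[5, 7, 9]]),             # target token ids
    'masked_lm_weights': tf.constant([[1.0, 1.0, 0.0]]),   # 0.0 marks padding
    'next_sentence_labels': tf.constant([[1]]),            # optional NSP label
}
```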
......@@ -112,15 +112,15 @@ class MaskedLMTask(base_task.Task):
metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss'))
return metrics
def process_metrics(self, metrics, inputs, outputs):
def process_metrics(self, metrics, labels, model_outputs):
metrics = dict([(metric.name, metric) for metric in metrics])
if 'masked_lm_accuracy' in metrics:
metrics['masked_lm_accuracy'].update_state(inputs['masked_lm_ids'],
outputs['lm_output'],
inputs['masked_lm_weights'])
metrics['masked_lm_accuracy'].update_state(labels['masked_lm_ids'],
model_outputs['lm_output'],
labels['masked_lm_weights'])
if 'next_sentence_accuracy' in metrics:
metrics['next_sentence_accuracy'].update_state(
inputs['next_sentence_labels'], outputs['next_sentence'])
labels['next_sentence_labels'], model_outputs['next_sentence'])
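Both hooks above rely on addressing metrics by name; a minimal standalone sketch of that pattern:
```
# Minimal sketch of the name-keyed metric lookup used above.
import tensorflow as tf

metrics = [tf.keras.metrics.Mean(name='lm_example_loss'),
           tf.keras.metrics.Mean(name='next_sentence_loss')]
by_name = dict([(metric.name, metric) for metric in metrics])
by_name['lm_example_loss'].update_state(0.42)
```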
def train_step(self, inputs, model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer, metrics):
......@@ -139,7 +139,7 @@ class MaskedLMTask(base_task.Task):
outputs = model(inputs, training=True)
# Computes per-replica loss.
loss = self.build_losses(
features=inputs,
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
......@@ -166,7 +166,7 @@ class MaskedLMTask(base_task.Task):
"""
outputs = self.inference_step(inputs, model)
loss = self.build_losses(
features=inputs,
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
......
......@@ -29,9 +29,9 @@ from official.nlp.modeling import losses as loss_lib
@dataclasses.dataclass
class SentencePredictionConfig(cfg.TaskConfig):
"""The model config."""
# At most one of `pretrain_checkpoint_dir` and `hub_module_url` can
# At most one of `init_checkpoint` and `hub_module_url` can
# be specified.
pretrain_checkpoint_dir: str = ''
init_checkpoint: str = ''
hub_module_url: str = ''
network: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
num_masked_tokens=0,
......@@ -52,7 +52,7 @@ class SentencePredictionTask(base_task.Task):
def __init__(self, params=cfg.TaskConfig):
super(SentencePredictionTask, self).__init__(params)
if params.hub_module_url and params.pretrain_checkpoint_dir:
if params.hub_module_url and params.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
if params.hub_module_url:
......@@ -79,12 +79,11 @@ class SentencePredictionTask(base_task.Task):
else:
return bert.instantiate_from_cfg(self.task_config.network)
def build_losses(self, features, model_outputs, aux_losses=None) -> tf.Tensor:
labels = features
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=labels,
predictions=tf.nn.log_softmax(model_outputs['sentence_prediction'],
axis=-1))
predictions=tf.nn.log_softmax(
model_outputs['sentence_prediction'], axis=-1))
if aux_losses:
loss += tf.add_n(aux_losses)
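As a sanity check of the pattern above, here is a hedged standalone sketch; the project's `weighted_sparse_categorical_crossentropy_loss` is assumed to reduce to a plain negative log-likelihood when labels are unweighted.
```
# Hedged sketch: sparse categorical NLL over log-softmax predictions.
import tensorflow as tf

labels = tf.constant([1, 0])
logits = tf.random.normal([2, 3])
log_probs = tf.nn.log_softmax(logits, axis=-1)
per_example_nll = -tf.gather(log_probs, labels, batch_dims=1)
loss = tf.reduce_mean(per_example_nll)
```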
......@@ -93,6 +92,7 @@ class SentencePredictionTask(base_task.Task):
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
......@@ -113,22 +113,22 @@ class SentencePredictionTask(base_task.Task):
def build_metrics(self, training=None):
del training
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')
]
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
return metrics
def process_metrics(self, metrics, labels, outputs):
def process_metrics(self, metrics, labels, model_outputs):
for metric in metrics:
metric.update_state(labels, outputs['sentence_prediction'])
metric.update_state(labels, model_outputs['sentence_prediction'])
def process_compiled_metrics(self, compiled_metrics, labels, outputs):
compiled_metrics.update_state(labels, outputs['sentence_prediction'])
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
compiled_metrics.update_state(labels, model_outputs['sentence_prediction'])
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
pretrain_ckpt_dir = self.task_config.pretrain_checkpoint_dir
if not pretrain_ckpt_dir:
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
pretrain2finetune_mapping = {
......@@ -138,10 +138,7 @@ class SentencePredictionTask(base_task.Task):
model.checkpoint_items['sentence_prediction.pooler_dense'],
}
ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
latest_pretrain_ckpt = tf.train.latest_checkpoint(pretrain_ckpt_dir)
if latest_pretrain_ckpt is None:
raise FileNotFoundError(
'Cannot find pretrain checkpoint under {}'.format(pretrain_ckpt_dir))
status = ckpt.restore(latest_pretrain_ckpt)
status = ckpt.restore(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('finished loading pretrained checkpoint.')
logging.info('finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
......@@ -43,8 +43,10 @@ class SentencePredictionTaskTest(tf.test.TestCase):
def test_task(self):
config = sentence_prediction.SentencePredictionConfig(
init_checkpoint=self.get_temp_dir(),
network=bert.BertPretrainerConfig(
encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
num_masked_tokens=0,
cls_heads=[
bert.ClsHeadConfig(
......@@ -62,6 +64,21 @@ class SentencePredictionTaskTest(tf.test.TestCase):
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
num_masked_tokens=20,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="next_sentence")
])
pretrain_model = bert.instantiate_from_cfg(pretrain_cfg)
ckpt = tf.train.Checkpoint(
model=pretrain_model, **pretrain_model.checkpoint_items)
ckpt.save(config.init_checkpoint)
task.initialize(model)
def _export_bert_tfhub(self):
bert_config = configs.BertConfig(
vocab_size=30522,
......
......@@ -218,7 +218,7 @@ def get_callbacks():
time_callback = keras_utils.TimeHistory(
FLAGS.batch_size,
FLAGS.log_steps,
FLAGS.model_dir if FLAGS.enable_tensorboard else None)
logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
callbacks.append(time_callback)
if FLAGS.enable_tensorboard:
......