Unverified Commit dfcc691c authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'master' into panoptic-deeplab

parents 83b87f05 a9d9e633
......@@ -8,7 +8,20 @@
"source": [
"# MoViNet Tutorial\n",
"\n",
"This notebook provides basic example code to create, build, and run [MoViNets (Mobile Video Networks)](https://arxiv.org/pdf/2103.11511.pdf). Models use TF Keras and support inference in TF 1 and TF 2. Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification."
"This notebook provides basic example code to build, run, and fine-tune [MoViNets (Mobile Video Networks)](https://arxiv.org/pdf/2103.11511.pdf).\n",
"\n",
"Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/) and the [TensorFlow Model Garden](https://github.com/tensorflow/models/tree/master/official/projects/movinet), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification. All Models use TensorFlow 2 with Keras for inference and training.\n",
"\n",
"The following steps will be performed:\n",
"\n",
"1. [Running base model inference with TensorFlow Hub](#scrollTo=6g0tuFvf71S9\u0026line=8\u0026uniqifier=1)\n",
"2. [Running streaming model inference with TensorFlow Hub and plotting predictions](#scrollTo=ADrHPmwGcBZ5\u0026line=4\u0026uniqifier=1)\n",
"3. [Exporting a streaming model to TensorFlow Lite for mobile](#scrollTo=W3CLHvubvdSI\u0026line=3\u0026uniqifier=1)\n",
"4. [Fine-Tuning a base Model with the TensorFlow Model Garden](#scrollTo=_s-7bEoa3f8g\u0026line=11\u0026uniqifier=1)\n",
"\n",
"![jumping jacks plot](https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/jumpingjacks_plot.gif)\n",
"\n",
"To generate video plots like the one above, see [section 2](#scrollTo=ADrHPmwGcBZ5\u0026line=4\u0026uniqifier=1)."
]
},
{
......@@ -19,17 +32,9 @@
"source": [
"## Setup\n",
"\n",
"It is recommended to run the models using GPUs or TPUs.\n",
"\n",
"To select a GPU/TPU in Colab, select `Runtime \u003e Change runtime type \u003e Hardware accelerator` dropdown in the top menu.\n",
"\n",
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"- tf-models-official is the stable Model Garden package. Note that it may not include the latest changes in the tensorflow_models github repo.\n",
"- To include latest changes, you may install tf-models-nightly, which is the nightly Model Garden package created daily automatically.\n",
"pip will install all models and dependencies automatically.\n",
"For inference on smaller models (A0-A2), CPU is sufficient for this Colab. For fine-tuning, it is recommended to run the models using GPUs.\n",
"\n",
"Install the [mediapy](https://github.com/google/mediapy) package for visualizing images/videos."
"To select a GPU in Colab, select `Runtime \u003e Change runtime type \u003e Hardware accelerator \u003e GPU` dropdown in the top menu."
]
},
{
......@@ -40,10 +45,24 @@
},
"outputs": [],
"source": [
"!pip install -q tf-models-nightly tfds-nightly\n",
"# Install packages\n",
"\n",
"# tf-models-official is the stable Model Garden package\n",
"# tf-models-nightly includes latest changes\n",
"!pip install -q tf-models-nightly\n",
"\n",
"# Install tfds nightly to download ucf101\n",
"!pip install -q tfds-nightly\n",
"\n",
"# Install the mediapy package for visualizing images/videos.\n",
"# See https://github.com/google/mediapy\n",
"!command -v ffmpeg \u003e/dev/null || (apt update \u0026\u0026 apt install -y ffmpeg)\n",
"!pip install -q mediapy"
"!pip install -q mediapy\n",
"\n",
"# Due to a bug, we reinstall opencv\n",
"# See https://stackoverflow.com/q/70537488\n",
"!pip uninstall -q -y opencv-python-headless\n",
"!pip install -q \"opencv-python-headless\u003c4.3\""
]
},
{
......@@ -54,22 +73,268 @@
},
"outputs": [],
"source": [
"# Run imports\n",
"import os\n",
"from six.moves import urllib\n",
"\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import mediapy as media\n",
"import numpy as np\n",
"from PIL import Image\n",
"import PIL\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_datasets as tfds\n",
"import tensorflow_hub as hub\n",
"import tqdm\n",
"\n",
"from official.vision.beta.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model"
"mpl.rcParams.update({\n",
" 'font.size': 10,\n",
"})"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OnFqOXazoWgy"
},
"source": [
"Run the cell below to define helper functions and create variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "dx55NK3ZoZeh"
},
"outputs": [],
"source": [
"#@title Run this cell to set up some helper code.\n",
"\n",
"# Download Kinetics 600 label map\n",
"!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q\n",
"\n",
"with tf.io.gfile.GFile('labels.txt') as f:\n",
" lines = f.readlines()\n",
" KINETICS_600_LABELS_LIST = [line.strip() for line in lines]\n",
" KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)\n",
"\n",
"def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]\n",
" top_labels = tf.gather(label_map, top_predictions, axis=-1)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
" top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()\n",
" return tuple(zip(top_labels, top_probs))\n",
"\n",
"def predict_top_k(model, video, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" outputs = model.predict(video[tf.newaxis])[0]\n",
" probs = tf.nn.softmax(outputs)\n",
" return get_top_k(probs, k=k, label_map=label_map)\n",
"\n",
"def load_movinet_from_hub(model_id, model_mode, hub_version=3):\n",
" \"\"\"Loads a MoViNet model from TF Hub.\"\"\"\n",
" hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/{model_mode}/kinetics-600/classification/{hub_version}'\n",
"\n",
" encoder = hub.KerasLayer(hub_url, trainable=True)\n",
"\n",
" inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"\n",
" if model_mode == 'base':\n",
" inputs = dict(image=inputs)\n",
" else:\n",
" # Define the state inputs, which is a dict that maps state names to tensors.\n",
" init_states_fn = encoder.resolved_object.signatures['init_states']\n",
" state_shapes = {\n",
" name: ([s if s \u003e 0 else None for s in state.shape], state.dtype)\n",
" for name, state in init_states_fn(tf.constant([0, 0, 0, 0, 3])).items()\n",
" }\n",
" states_input = {\n",
" name: tf.keras.Input(shape[1:], dtype=dtype, name=name)\n",
" for name, (shape, dtype) in state_shapes.items()\n",
" }\n",
"\n",
" # The inputs to the model are the states and the video\n",
" inputs = {**states_input, 'image': inputs}\n",
"\n",
" # Output shape: [batch_size, 600]\n",
" outputs = encoder(inputs)\n",
"\n",
" model = tf.keras.Model(inputs, outputs)\n",
" model.build([1, 1, 1, 1, 3])\n",
"\n",
" return model\n",
"\n",
"# Download example gif\n",
"!wget https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif -O jumpingjack.gif -q\n",
"\n",
"def load_gif(file_path, image_size=(224, 224)):\n",
" \"\"\"Loads a gif file into a TF tensor.\"\"\"\n",
" with tf.io.gfile.GFile(file_path, 'rb') as f:\n",
" video = tf.io.decode_gif(f.read())\n",
" video = tf.image.resize(video, image_size)\n",
" video = tf.cast(video, tf.float32) / 255.\n",
" return video\n",
"\n",
"def get_top_k_streaming_labels(probs, k=5, label_map=KINETICS_600_LABELS_LIST):\n",
" \"\"\"Returns the top-k labels over an entire video sequence.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" k: the number of top predictions to select.\n",
" label_map: a list of labels to map logit indices to label strings.\n",
"\n",
" Returns:\n",
" a tuple of the top-k probabilities, labels, and logit indices\n",
" \"\"\"\n",
" top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[-1, :1]\n",
" categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]\n",
" categories = tf.reshape(categories, [-1])\n",
"\n",
" counts = sorted([\n",
" (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())\n",
" for i in tf.unique(categories)[0]\n",
" ], key=lambda x: x[1], reverse=True)\n",
"\n",
" top_probs_idx = tf.constant([i for i, _ in counts[:k]])\n",
" top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)\n",
" top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]\n",
"\n",
" top_probs = tf.gather(probs, top_probs_idx, axis=-1)\n",
" top_probs = tf.transpose(top_probs, perm=(1, 0))\n",
" top_labels = tf.gather(label_map, top_probs_idx, axis=0)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
"\n",
" return top_probs, top_labels, top_probs_idx\n",
"\n",
"def plot_streaming_top_preds_at_step(\n",
" top_probs,\n",
" top_labels,\n",
" step=None,\n",
" image=None,\n",
" legend_loc='lower left',\n",
" duration_seconds=10,\n",
" figure_height=500,\n",
" playhead_scale=0.8,\n",
" grid_alpha=0.3):\n",
" \"\"\"Generates a plot of the top video model predictions at a given time step.\n",
"\n",
" Args:\n",
" top_probs: a tensor of shape (k, num_frames) representing the top-k\n",
" probabilities over all frames.\n",
" top_labels: a list of length k that represents the top-k label strings.\n",
" step: the current time step in the range [0, num_frames].\n",
" image: the image frame to display at the current time step.\n",
" legend_loc: the placement location of the legend.\n",
" duration_seconds: the total duration of the video.\n",
" figure_height: the output figure height.\n",
" playhead_scale: scale value for the playhead.\n",
" grid_alpha: alpha value for the gridlines.\n",
"\n",
" Returns:\n",
" A tuple of the output numpy image, figure, and axes.\n",
" \"\"\"\n",
" num_labels, num_frames = top_probs.shape\n",
" if step is None:\n",
" step = num_frames\n",
"\n",
" fig = plt.figure(figsize=(6.5, 7), dpi=300)\n",
" gs = mpl.gridspec.GridSpec(8, 1)\n",
" ax2 = plt.subplot(gs[:-3, :])\n",
" ax = plt.subplot(gs[-3:, :])\n",
"\n",
" if image is not None:\n",
" ax2.imshow(image, interpolation='nearest')\n",
" ax2.axis('off')\n",
"\n",
" preview_line_x = tf.linspace(0., duration_seconds, num_frames)\n",
" preview_line_y = top_probs\n",
"\n",
" line_x = preview_line_x[:step+1]\n",
" line_y = preview_line_y[:, :step+1]\n",
"\n",
" for i in range(num_labels):\n",
" ax.plot(preview_line_x, preview_line_y[i], label=None, linewidth='1.5',\n",
" linestyle=':', color='gray')\n",
" ax.plot(line_x, line_y[i], label=top_labels[i], linewidth='2.0')\n",
"\n",
"\n",
" ax.grid(which='major', linestyle=':', linewidth='1.0', alpha=grid_alpha)\n",
" ax.grid(which='minor', linestyle=':', linewidth='0.5', alpha=grid_alpha)\n",
"\n",
" min_height = tf.reduce_min(top_probs) * playhead_scale\n",
" max_height = tf.reduce_max(top_probs)\n",
" ax.vlines(preview_line_x[step], min_height, max_height, colors='red')\n",
" ax.scatter(preview_line_x[step], max_height, color='red')\n",
"\n",
" ax.legend(loc=legend_loc)\n",
"\n",
" plt.xlim(0, duration_seconds)\n",
" plt.ylabel('Probability')\n",
" plt.xlabel('Time (s)')\n",
" plt.yscale('log')\n",
"\n",
" fig.tight_layout()\n",
" fig.canvas.draw()\n",
"\n",
" data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)\n",
" data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))\n",
" plt.close()\n",
"\n",
" figure_width = int(figure_height * data.shape[1] / data.shape[0])\n",
" image = PIL.Image.fromarray(data).resize([figure_width, figure_height])\n",
" image = np.array(image)\n",
"\n",
" return image, (fig, ax, ax2)\n",
"\n",
"def plot_streaming_top_preds(\n",
" probs,\n",
" video,\n",
" top_k=5,\n",
" video_fps=25.,\n",
" figure_height=500,\n",
" use_progbar=True):\n",
" \"\"\"Generates a video plot of the top video model predictions.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" video: the video to display in the plot.\n",
" top_k: the number of top predictions to select.\n",
" video_fps: the input video fps.\n",
" figure_fps: the output video fps.\n",
" figure_height: the height of the output video.\n",
" use_progbar: display a progress bar.\n",
"\n",
" Returns:\n",
" A numpy array representing the output video.\n",
" \"\"\"\n",
" video_fps = 8.\n",
" figure_height = 500\n",
" steps = video.shape[0]\n",
" duration = steps / video_fps\n",
"\n",
" top_probs, top_labels, _ = get_top_k_streaming_labels(probs, k=top_k)\n",
"\n",
" images = []\n",
" step_generator = tqdm.trange(steps) if use_progbar else range(steps)\n",
" for i in step_generator:\n",
" image, _ = plot_streaming_top_preds_at_step(\n",
" top_probs=top_probs,\n",
" top_labels=top_labels,\n",
" step=i,\n",
" image=video[i],\n",
" duration_seconds=duration,\n",
" figure_height=figure_height,\n",
" )\n",
" images.append(image)\n",
"\n",
" return np.array(images)"
]
},
{
......@@ -78,95 +343,335 @@
"id": "6g0tuFvf71S9"
},
"source": [
"## Example Usage with TensorFlow Hub\n",
"## Running Base Model Inference with TensorFlow Hub\n",
"\n",
"Load MoViNet-A2-Base from TensorFlow Hub, as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"We will load MoViNet-A2-Base from TensorFlow Hub as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load a MoViNet KerasLayer from [tfhub.dev](https://tfhub.dev).\n",
"- Wrap the layer in a [Keras Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model).\n",
"- Load an example image, and reshape it to a single frame video.\n",
"- Classify the video"
"- Load an example gif as a video.\n",
"- Classify the video and print the top-5 predicted classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nTUdhlRJzl2o"
"id": "KZKKNZVBpglJ"
},
"outputs": [],
"source": [
"movinet_a2_hub_url = 'https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/1'\n",
"\n",
"inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"model = load_movinet_from_hub('a2', 'base', hub_version=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7kU1_pL10l0B"
},
"source": [
"To provide a simple example video for classification, we can load a short gif of jumping jacks being performed.\n",
"\n",
"encoder = hub.KerasLayer(movinet_a2_hub_url, trainable=True)\n",
"![jumping jacks](https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif)\n",
"\n",
"# Important: To use tf.nn.conv3d on CPU, we must compile with tf.function.\n",
"encoder.call = tf.function(encoder.call, experimental_compile=True)\n",
"Attribution: Footage shared by [Coach Bobby Bluford](https://www.youtube.com/watch?v=-AxHpj-EuPg) on YouTube under the CC-BY license."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Iy0rKRrT723_"
},
"outputs": [],
"source": [
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"\n",
"# [batch_size, 600]\n",
"outputs = encoder(dict(image=inputs))\n",
"# Show video\n",
"print(video.shape)\n",
"media.show_video(video.numpy(), fps=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "P0bZfrAsqPv2",
"outputId": "bd82571f-8dfd-4faf-ed10-e34708b0405d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jumping jacks 0.9166437\n",
"zumba 0.016020728\n",
"doing aerobics 0.008053946\n",
"dancing charleston 0.006083599\n",
"lunge 0.0035062772\n"
]
}
],
"source": [
"# Run the model on the video and output the top 5 predictions\n",
"outputs = predict_top_k(model, video)\n",
"\n",
"model = tf.keras.Model(inputs, outputs)"
"for label, prob in outputs:\n",
" print(label, prob)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7kU1_pL10l0B"
"id": "ADrHPmwGcBZ5"
},
"source": [
"To provide a simple example video for classification, we can load a static image and reshape it to produce a video with a single frame."
"## Run Streaming Model Inference with TensorFlow Hub and Plot Predictions\n",
"\n",
"We will load MoViNet-A0-Stream from TensorFlow Hub as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load a MoViNet model from [tfhub.dev](https://tfhub.dev).\n",
"- Classify an example video and plot the streaming predictions over time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Iy0rKRrT723_"
"id": "tXWR13wthnK5"
},
"outputs": [],
"source": [
"image_url = 'https://upload.wikimedia.org/wikipedia/commons/8/84/Ski_Famille_-_Family_Ski_Holidays.jpg'\n",
"image_height = 224\n",
"image_width = 224\n",
"model = load_movinet_from_hub('a2', 'stream', hub_version=3)\n",
"\n",
"# Create initial states for the stream model\n",
"init_states_fn = model.layers[-1].resolved_object.signatures['init_states']\n",
"init_states = init_states_fn(tf.shape(video[tf.newaxis]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YqSkt7l8ltwt",
"outputId": "6ccf1dd6-95d1-43b1-efdb-2e931dd3a19d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 13/13 [00:08\u003c00:00, 1.58it/s]\n",
"jumping jacks 0.9998123\n",
"zumba 0.00011835508\n",
"doing aerobics 3.3375818e-05\n",
"dancing charleston 4.9819987e-06\n",
"finger snapping 3.8673647e-06\n"
]
}
],
"source": [
"# Insert your video clip here\n",
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
"all_logits = []\n",
"\n",
"with urllib.request.urlopen(image_url) as f:\n",
" image = Image.open(f).resize((image_height, image_width))\n",
"video = tf.reshape(np.array(image), [1, 1, image_height, image_width, 3])\n",
"video = tf.cast(video, tf.float32) / 255.\n",
"# To run on a video, pass in one frame at a time\n",
"states = init_states\n",
"for clip in tqdm.tqdm(clips):\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" logits, states = model.predict({**states, 'image': clip}, verbose=0)\n",
" all_logits.append(logits)\n",
"\n",
"image"
"logits = tf.concat(all_logits, 0)\n",
"probs = tf.nn.softmax(logits)\n",
"\n",
"final_probs = probs[-1]\n",
"top_k = get_top_k(final_probs)\n",
"print()\n",
"for label, prob in top_k:\n",
" print(label, prob)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Xdox556CtMRb"
},
"outputs": [],
"source": [
"# Generate a plot and output to a video tensor\n",
"plot_video = plot_streaming_top_preds(probs, video, video_fps=8.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NSStKE9klCs3"
},
"outputs": [],
"source": [
"# For gif format, set codec='gif'\n",
"media.show_video(plot_video, fps=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Yf6EefHuWfxC"
"id": "W3CLHvubvdSI"
},
"source": [
"Run the model and output the predicted label. Expected output should be skiing (labels 464-467). E.g., 465 = \"skiing crosscountry\".\n",
"## Export a Streaming Model to TensorFlow Lite for Mobile\n",
"\n",
"See [here](https://gist.github.com/willprice/f19da185c9c5f32847134b87c1960769#file-kinetics_600_labels-csv) for a full list of all labels."
"We will convert a MoViNet-A0-Stream model to [TensorFlow Lite](https://www.tensorflow.org/lite).\n",
"\n",
"The following code will:\n",
"- Load a MoViNet-A0-Stream model.\n",
"- Convert the model to TF Lite.\n",
"- Run inference on an example video using the Python interpreter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OOpEKuqH8sH7"
"id": "KH0j-07KVh06"
},
"outputs": [],
"source": [
"output = model(video)\n",
"output_label_index = tf.argmax(output, -1)[0].numpy()\n",
"# Run imports\n",
"from official.vision.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model\n",
"from official.projects.movinet.tools import export_saved_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RLkV0xtPvfkY"
},
"outputs": [],
"source": [
"# Export to saved model\n",
"saved_model_dir = 'model'\n",
"tflite_filename = 'model.tflite'\n",
"input_shape = [1, 1, 172, 172, 3]\n",
"batch_size, num_frames, image_size, = input_shape[:3]\n",
"\n",
"print(output_label_index)"
"tf.keras.backend.clear_session()\n",
"\n",
"# Create the model\n",
"input_specs = tf.keras.layers.InputSpec(shape=input_shape)\n",
"backbone = movinet.Movinet(\n",
" model_id='a0',\n",
" causal=True,\n",
" conv_type='2plus1d',\n",
" se_type='2plus3d',\n",
" input_specs=input_specs,\n",
" activation='hard_swish',\n",
" gating_activation='hard_sigmoid',\n",
" use_sync_bn=False,\n",
" use_external_states=True)\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" activation='hard_swish',\n",
" num_classes=600,\n",
" output_states=True,\n",
" input_specs=dict(image=input_specs))\n",
"model.build([1, 1, 1, 1, 3])\n",
"\n",
"# Extract pretrained weights\n",
"!wget https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tar.gz -O movinet_a0_stream.tar.gz -q\n",
"!tar -xvf movinet_a0_stream.tar.gz\n",
"\n",
"checkpoint_dir = 'movinet_a0_stream'\n",
"checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)\n",
"\n",
"# Convert to saved model\n",
"export_saved_model.export_saved_model(\n",
" model=model,\n",
" input_shape=input_shape,\n",
" export_path=saved_model_dir,\n",
" causal=True,\n",
" bundle_input_init_states_fn=False,\n",
" checkpoint_path=checkpoint_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gPg_6eMC8IwF"
},
"outputs": [],
"source": [
"# Convert to TF Lite\n",
"converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n",
"tflite_model = converter.convert()\n",
"\n",
"with open(tflite_filename, 'wb') as f:\n",
" f.write(tflite_model)\n",
"\n",
"# Create the interpreter and signature runner\n",
"interpreter = tf.lite.Interpreter(model_path=tflite_filename)\n",
"runner = interpreter.get_signature_runner()\n",
"\n",
"init_states = {\n",
" name: tf.zeros(x['shape'], dtype=x['dtype'])\n",
" for name, x in runner.get_input_details().items()\n",
"}\n",
"del init_states['image']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-TQ-7oSJIlTA",
"outputId": "a15519ff-d08c-40bc-fbea-d3a58169450c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jumping jacks 0.9791285\n",
"jogging 0.0019550633\n",
"riding unicycle 0.0017429002\n",
"passing soccer ball 0.0016952101\n",
"stretching arm 0.0014458151\n"
]
}
],
"source": [
"# Insert your video clip here\n",
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
"# To run on a video, pass in one frame at a time\n",
"states = init_states\n",
"for clip in clips:\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" outputs = runner(**states, image=clip)\n",
" logits = outputs.pop('logits')[0]\n",
" states = outputs\n",
"\n",
"probs = tf.nn.softmax(logits)\n",
"top_k = get_top_k(probs)\n",
"print()\n",
"for label, prob in top_k:\n",
" print(label, prob)"
]
},
{
......@@ -175,17 +680,17 @@
"id": "_s-7bEoa3f8g"
},
"source": [
"## Example Usage with the TensorFlow Model Garden\n",
"## Fine-Tune a Base Model with the TensorFlow Model Garden\n",
"\n",
"Fine-tune MoViNet-A0-Base on [UCF-101](https://www.crcv.ucf.edu/research/data-sets/ucf101/).\n",
"We will Fine-tune MoViNet-A0-Base on [UCF-101](https://www.crcv.ucf.edu/research/data-sets/ucf101/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load the UCF-101 dataset with [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/ucf101).\n",
"- Create a [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) pipeline for training and evaluation.\n",
"- Create a simple [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) pipeline for training and evaluation.\n",
"- Display some example videos from the dataset.\n",
"- Build a MoViNet model and load pretrained weights.\n",
"- Fine-tune the final classifier layers on UCF-101."
"- Fine-tune the final classifier layers on UCF-101 and evaluate accuracy on the validation set."
]
},
{
......@@ -196,7 +701,25 @@
"source": [
"### Load the UCF-101 Dataset with TensorFlow Datasets\n",
"\n",
"Calling `download_and_prepare()` will automatically download the dataset. After downloading, this cell will output information about the dataset."
"Calling `download_and_prepare()` will automatically download the dataset. This step may take up to 1 hour depending on the download and extraction speed. After downloading, the next cell will output information about the dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2IHLbPAfrs5P"
},
"outputs": [],
"source": [
"# Run imports\n",
"import tensorflow_datasets as tfds\n",
"\n",
"from official.vision.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model"
]
},
{
......@@ -288,7 +811,7 @@
")"
]
},
"execution_count": 0,
"execution_count": null,
"metadata": {
"tags": []
},
......@@ -310,15 +833,6 @@
"builder.info"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -327,6 +841,8 @@
},
"outputs": [],
"source": [
"# Build the training and evaluation datasets.\n",
"\n",
"batch_size = 8\n",
"num_frames = 8\n",
"frame_stride = 10\n",
......@@ -392,16 +908,9 @@
"id": "R3RHeuHdsd_3"
},
"source": [
"### Build MoViNet-A0-Base and Load Pretrained Weights"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JXVQOP9Rqk0I"
},
"source": [
"Here we create a MoViNet model using the open source code provided in [tensorflow/models](https://github.com/tensorflow/models) and load the pretrained weights. Here we freeze the all layers except the final classifier head to speed up fine-tuning."
"### Build MoViNet-A0-Base and Load Pretrained Weights\n",
"\n",
"Here we create a MoViNet model using the open source code provided in [official/projects/movinet](https://github.com/tensorflow/models/tree/master/official/projects/movinet) and load the pretrained weights. Here we freeze the all layers except the final classifier head to speed up fine-tuning."
]
},
{
......@@ -416,32 +925,38 @@
"\n",
"tf.keras.backend.clear_session()\n",
"\n",
"backbone = movinet.Movinet(\n",
" model_id=model_id)\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=600)\n",
"model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"backbone = movinet.Movinet(model_id=model_id)\n",
"model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600)\n",
"model.build([1, 1, 1, 1, 3])\n",
"\n",
"# Load pretrained weights from TF Hub\n",
"movinet_hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/base/kinetics-600/classification/1'\n",
"movinet_hub_model = hub.KerasLayer(movinet_hub_url, trainable=True)\n",
"pretrained_weights = {w.name: w for w in movinet_hub_model.weights}\n",
"model_weights = {w.name: w for w in model.weights}\n",
"for name in pretrained_weights:\n",
" model_weights[name].assign(pretrained_weights[name])\n",
"# Load pretrained weights\n",
"!wget https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_base.tar.gz -O movinet_a0_base.tar.gz -q\n",
"!tar -xvf movinet_a0_base.tar.gz\n",
"\n",
"# Wrap the backbone with a new classifier to create a new classifier head\n",
"# with num_classes outputs\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=num_classes)\n",
"model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"checkpoint_dir = 'movinet_a0_base'\n",
"checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)\n",
"checkpoint = tf.train.Checkpoint(model=model)\n",
"status = checkpoint.restore(checkpoint_path)\n",
"status.assert_existing_objects_matched()\n",
"\n",
"def build_classifier(backbone, num_classes, freeze_backbone=False):\n",
" \"\"\"Builds a classifier on top of a backbone model.\"\"\"\n",
" model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=num_classes)\n",
" model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"\n",
"# Freeze all layers except for the final classifier head\n",
"for layer in model.layers[:-1]:\n",
" layer.trainable = False\n",
"model.layers[-1].trainable = True"
" if freeze_backbone:\n",
" for layer in model.layers[:-1]:\n",
" layer.trainable = False\n",
" model.layers[-1].trainable = True\n",
"\n",
" return model\n",
"\n",
"# Wrap the backbone with a new classifier to create a new classifier head\n",
"# with num_classes outputs (101 classes for UCF101).\n",
"# Freeze all layers except for the final classifier head.\n",
"model = build_classifier(backbone, num_classes, freeze_backbone=True)"
]
},
{
......@@ -500,7 +1015,7 @@
"id": "0IyAOOlcpHna"
},
"source": [
"Run the fine-tuning with Keras compile/fit. After fine-tuning the model, we should be able to achieve \u003e70% accuracy on the test set."
"Run the fine-tuning with Keras compile/fit. After fine-tuning the model, we should be able to achieve \u003e85% accuracy on the test set."
]
},
{
......@@ -527,11 +1042,11 @@
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"1192/1192 [==============================] - 348s 286ms/step - loss: 3.4914 - top_1: 0.3639 - top_5: 0.6294 - val_loss: 2.5153 - val_top_1: 0.5975 - val_top_5: 0.8565\n",
"1192/1192 [==============================] - 551s 451ms/step - loss: 2.5050 - top_1: 0.6692 - top_5: 0.8753 - val_loss: 1.6310 - val_top_1: 0.8109 - val_top_5: 0.9701\n",
"Epoch 2/3\n",
"1192/1192 [==============================] - 286s 240ms/step - loss: 2.1397 - top_1: 0.6794 - top_5: 0.9231 - val_loss: 2.0695 - val_top_1: 0.6838 - val_top_5: 0.9070\n",
"1192/1192 [==============================] - 533s 447ms/step - loss: 1.3336 - top_1: 0.9024 - top_5: 0.9906 - val_loss: 1.4576 - val_top_1: 0.8451 - val_top_5: 0.9740\n",
"Epoch 3/3\n",
"1192/1192 [==============================] - 348s 292ms/step - loss: 1.8925 - top_1: 0.7660 - top_5: 0.9454 - val_loss: 1.9848 - val_top_1: 0.7116 - val_top_5: 0.9227\n"
"1192/1192 [==============================] - 531s 446ms/step - loss: 1.2298 - top_1: 0.9329 - top_5: 0.9943 - val_loss: 1.4351 - val_top_1: 0.8514 - val_top_5: 0.9762\n"
]
}
],
......@@ -573,7 +1088,7 @@
"colab": {
"collapsed_sections": [],
"last_runtime": {
"build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
"build_target": "//learning/deepmind/dm_python:dm_notebook3",
"kind": "private"
},
"name": "movinet_tutorial.ipynb",
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -51,6 +51,8 @@ python3 export_saved_model.py \
To use an exported saved_model, refer to export_saved_model_test.py.
"""
from typing import Optional, Tuple
from absl import app
from absl import flags
import tensorflow as tf
......@@ -113,62 +115,50 @@ flags.DEFINE_string(
FLAGS = flags.FLAGS
def export_saved_model(
model: tf.keras.Model,
input_shape: Tuple[int, int, int, int, int],
export_path: str = '/tmp/movinet/',
causal: bool = False,
bundle_input_init_states_fn: bool = True,
checkpoint_path: Optional[str] = None) -> None:
"""Exports a MoViNet model to a saved model.
Args:
model: the tf.keras.Model to export.
input_shape: The 5D spatiotemporal input shape of size
[batch_size, num_frames, image_height, image_width, num_channels].
Set the field or a shape position in the field to None for dynamic input.
export_path: Export path to save the saved_model file.
causal: Run the model in causal mode.
bundle_input_init_states_fn: Add init_states as a function signature to the
saved model. This is not necessary if the input shape is static (e.g.,
for TF Lite).
checkpoint_path: Checkpoint path to load. Leave blank to keep the model's
initialization.
"""
# Use dimensions of 1 except the channels to export faster,
# since we only really need the last dimension to build and get the output
# states. These dimensions can be set to `None` once the model is built.
input_shape_concrete = [1 if s is None else s for s in input_shape]
model.build(input_shape_concrete)
# Compile model to generate some internal Keras variables.
model.compile()
if checkpoint_path:
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(checkpoint_path)
status.assert_existing_objects_matched()
if causal:
# Call the model once to get the output states. Call again with `states`
# input to ensure that the inputs with the `states` argument is built
# with the full output state shapes.
input_image = tf.ones(input_shape_concrete)
_, states = model({
**model.init_states(input_shape_concrete), 'image': input_image})
_ = model({**states, 'image': input_image})
# Create a function to explicitly set the names of the outputs
......@@ -179,10 +169,10 @@ def main(_) -> None:
specs = {
name: tf.TensorSpec(spec.shape, name=name, dtype=spec.dtype)
for name, spec in model.initial_state_specs(
input_shape).items()
}
specs['image'] = tf.TensorSpec(
input_shape, dtype=model.dtype, name='image')
predict_fn = tf.function(predict, jit_compile=True)
predict_fn = predict_fn.get_concrete_function(specs)
......@@ -191,17 +181,118 @@ def main(_) -> None:
init_states_fn = init_states_fn.get_concrete_function(
tf.TensorSpec([5], dtype=tf.int32))
if bundle_input_init_states_fn:
signatures = {'call': predict_fn, 'init_states': init_states_fn}
else:
signatures = predict_fn
tf.keras.models.save_model(
model, export_path, signatures=signatures)
else:
_ = model(tf.ones(input_shape_concrete))
tf.keras.models.save_model(model, export_path)
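# Example usage (a sketch mirroring the tutorial notebook; paths and the
# checkpoint directory are illustrative):
#
#   input_specs = tf.keras.layers.InputSpec(shape=[1, 1, 172, 172, 3])
#   backbone = movinet.Movinet(
#       model_id='a0', causal=True, conv_type='2plus1d', se_type='2plus3d',
#       input_specs=input_specs, use_external_states=True)
#   model = movinet_model.MovinetClassifier(
#       backbone=backbone, num_classes=600, output_states=True,
#       input_specs=dict(image=input_specs))
#   model.build([1, 1, 1, 1, 3])
#   export_saved_model(
#       model=model,
#       input_shape=[1, 1, 172, 172, 3],
#       export_path='/tmp/movinet_a0_stream',
#       causal=True,
#       bundle_input_init_states_fn=False,
#       checkpoint_path=tf.train.latest_checkpoint('movinet_a0_stream'))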
def build_and_export_saved_model(
export_path: str = '/tmp/movinet/',
model_id: str = 'a0',
causal: bool = False,
conv_type: str = '3d',
se_type: str = '3d',
activation: str = 'swish',
classifier_activation: str = 'swish',
gating_activation: str = 'sigmoid',
use_positional_encoding: bool = False,
num_classes: int = 600,
input_shape: Optional[Tuple[int, int, int, int, int]] = None,
bundle_input_init_states_fn: bool = True,
checkpoint_path: Optional[str] = None) -> None:
"""Builds and exports a MoViNet model to a saved model.
Args:
export_path: Export path to save the saved_model file.
model_id: MoViNet model name.
causal: Run the model in causal mode.
conv_type: 3d, 2plus1d, or 3d_2plus1d. 3d configures the network
to use the default 3D convolution. 2plus1d uses (2+1)D convolution
with Conv2D operations and 2D reshaping (e.g., a 5x3x3 kernel becomes
3x3 followed by 5x1 conv). 3d_2plus1d uses (2+1)D convolution with
Conv3D and no 2D reshaping (e.g., a 5x3x3 kernel becomes 1x3x3
followed by 5x1x1 conv).
se_type:
3d, 2d, or 2plus3d. 3d uses the default 3D spatiotemporal global average
pooling for squeeze excitation. 2d uses 2D spatial global average pooling
on each frame. 2plus3d concatenates both 3D and 2D global average
pooling.
activation: The main activation to use across layers.
classifier_activation: The classifier activation to use.
gating_activation: The gating activation to use in squeeze-excitation
layers.
use_positional_encoding: Whether to use positional encoding (only applied
when causal=True).
num_classes: The number of classes for prediction.
input_shape: The 5D spatiotemporal input shape of size
[batch_size, num_frames, image_height, image_width, num_channels].
Set the field or a shape position in the field to None for dynamic input.
bundle_input_init_states_fn: Add init_states as a function signature to the
saved model. This is not necessary if the input shape is static (e.g.,
for TF Lite).
checkpoint_path: Checkpoint path to load. Leave blank for default
initialization.
"""
input_specs = tf.keras.layers.InputSpec(shape=input_shape)
# Override swish activation implementation to remove custom gradients
if activation == 'swish':
activation = 'simple_swish'
if classifier_activation == 'swish':
classifier_activation = 'simple_swish'
backbone = movinet.Movinet(
model_id=model_id,
causal=causal,
use_positional_encoding=use_positional_encoding,
conv_type=conv_type,
se_type=se_type,
input_specs=input_specs,
activation=activation,
gating_activation=gating_activation,
use_sync_bn=False,
use_external_states=causal)
model = movinet_model.MovinetClassifier(
backbone,
num_classes=num_classes,
output_states=causal,
input_specs=dict(image=input_specs),
activation=classifier_activation)
export_saved_model(
model=model,
input_shape=input_shape,
export_path=export_path,
causal=causal,
bundle_input_init_states_fn=bundle_input_init_states_fn,
checkpoint_path=checkpoint_path)
def main(_) -> None:
input_shape = (
FLAGS.batch_size, FLAGS.num_frames, FLAGS.image_size, FLAGS.image_size, 3)
build_and_export_saved_model(
export_path=FLAGS.export_path,
model_id=FLAGS.model_id,
causal=FLAGS.causal,
conv_type=FLAGS.conv_type,
se_type=FLAGS.se_type,
activation=FLAGS.activation,
classifier_activation=FLAGS.classifier_activation,
gating_activation=FLAGS.gating_activation,
use_positional_encoding=FLAGS.use_positional_encoding,
num_classes=FLAGS.num_classes,
input_shape=input_shape,
bundle_input_init_states_fn=FLAGS.bundle_input_init_states_fn,
checkpoint_path=FLAGS.checkpoint_path)
print(' ----- Done. Saved Model is saved at {}'.format(FLAGS.export_path))
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "qwBHHt-XvPqn"
},
"source": [
"# Plot MoViNet Video Stream Predictions\n",
"\n",
"This notebook uses [MoViNets (Mobile Video Networks)](https://github.com/tensorflow/models/tree/master/official/projects/movinet) to predict a human action in a streaming video and outputs a visualization of predictions on each frame.\n",
"\n",
"Provide a video URL or upload your own to see how predictions change over time. All models can be run on CPU.\n",
"\n",
"Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/) and the [TensorFlow Model Garden](https://github.com/tensorflow/models/tree/master/official/projects/movinet), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification. All Models use TensorFlow 2 with Keras for inference and training. See the [research paper](https://arxiv.org/pdf/2103.11511.pdf) for more details.\n",
"\n",
"Example output using [this gif](https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif) as input:\n",
"\n",
"![jumping jacks plot](https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/jumpingjacks_plot.gif)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "ElvELd9mIfZe"
},
"outputs": [],
"source": [
"#@title Run this cell to initialize and setup a [MoViNet](https://github.com/tensorflow/models/tree/master/official/projects/movinet) model.\n",
"\n",
"\n",
"# Install the mediapy package for visualizing images/videos.\n",
"# See https://github.com/google/mediapy\n",
"!pip install -q mediapy\n",
"\n",
"# Run imports\n",
"import os\n",
"import io\n",
"\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import mediapy as media\n",
"import numpy as np\n",
"import PIL\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_datasets as tfds\n",
"import tensorflow_hub as hub\n",
"import tqdm\n",
"from google.colab import files\n",
"import urllib.request\n",
"\n",
"mpl.rcParams.update({\n",
" 'font.size': 10,\n",
"})\n",
"\n",
"\n",
"# Download Kinetics 600 label map\n",
"!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q\n",
"\n",
"with tf.io.gfile.GFile('labels.txt') as f:\n",
" lines = f.readlines()\n",
" KINETICS_600_LABELS_LIST = [line.strip() for line in lines]\n",
" KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)\n",
"\n",
"def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]\n",
" top_labels = tf.gather(label_map, top_predictions, axis=-1)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
" top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()\n",
" return tuple(zip(top_labels, top_probs))\n",
"\n",
"def predict_top_k(model, video, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" outputs = model.predict(video[tf.newaxis])[0]\n",
" probs = tf.nn.softmax(outputs)\n",
" return get_top_k(probs, k=k, label_map=label_map)\n",
"\n",
"def load_movinet_from_hub(model_id, model_mode, hub_version=3):\n",
" \"\"\"Loads a MoViNet model from TF Hub.\"\"\"\n",
" hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/{model_mode}/kinetics-600/classification/{hub_version}'\n",
"\n",
" encoder = hub.KerasLayer(hub_url, trainable=True)\n",
"\n",
" inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"\n",
" if model_mode == 'base':\n",
" inputs = dict(image=inputs)\n",
" else:\n",
" # Define the state inputs, which is a dict that maps state names to tensors.\n",
" init_states_fn = encoder.resolved_object.signatures['init_states']\n",
" state_shapes = {\n",
" name: ([s if s \u003e 0 else None for s in state.shape], state.dtype)\n",
" for name, state in init_states_fn(tf.constant([0, 0, 0, 0, 3])).items()\n",
" }\n",
" states_input = {\n",
" name: tf.keras.Input(shape[1:], dtype=dtype, name=name)\n",
" for name, (shape, dtype) in state_shapes.items()\n",
" }\n",
"\n",
" # The inputs to the model are the states and the video\n",
" inputs = {**states_input, 'image': inputs}\n",
"\n",
" # Output shape: [batch_size, 600]\n",
" outputs = encoder(inputs)\n",
"\n",
" model = tf.keras.Model(inputs, outputs)\n",
" model.build([1, 1, 1, 1, 3])\n",
"\n",
" return model\n",
"\n",
"# Download example gif\n",
"!wget https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif -O jumpingjack.gif -q\n",
"\n",
"def load_gif(file_path, image_size=(224, 224)):\n",
" \"\"\"Loads a gif file into a TF tensor.\"\"\"\n",
" with tf.io.gfile.GFile(file_path, 'rb') as f:\n",
" video = tf.io.decode_gif(f.read())\n",
" video = tf.image.resize(video, image_size)\n",
" video = tf.cast(video, tf.float32) / 255.\n",
" return video\n",
"\n",
"def get_top_k_streaming_labels(probs, k=5, label_map=KINETICS_600_LABELS_LIST):\n",
" \"\"\"Returns the top-k labels over an entire video sequence.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" k: the number of top predictions to select.\n",
" label_map: a list of labels to map logit indices to label strings.\n",
"\n",
" Returns:\n",
" a tuple of the top-k probabilities, labels, and logit indices\n",
" \"\"\"\n",
" top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[-1, :1]\n",
" categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]\n",
" categories = tf.reshape(categories, [-1])\n",
"\n",
" counts = sorted([\n",
" (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())\n",
" for i in tf.unique(categories)[0]\n",
" ], key=lambda x: x[1], reverse=True)\n",
"\n",
" top_probs_idx = tf.constant([i for i, _ in counts[:k]])\n",
" top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)\n",
" top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]\n",
"\n",
" top_probs = tf.gather(probs, top_probs_idx, axis=-1)\n",
" top_probs = tf.transpose(top_probs, perm=(1, 0))\n",
" top_labels = tf.gather(label_map, top_probs_idx, axis=0)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
"\n",
" return top_probs, top_labels, top_probs_idx\n",
"\n",
"def plot_streaming_top_preds_at_step(\n",
" top_probs,\n",
" top_labels,\n",
" step=None,\n",
" image=None,\n",
" legend_loc='lower left',\n",
" duration_seconds=10,\n",
" figure_height=500,\n",
" playhead_scale=0.8,\n",
" grid_alpha=0.3):\n",
" \"\"\"Generates a plot of the top video model predictions at a given time step.\n",
"\n",
" Args:\n",
" top_probs: a tensor of shape (k, num_frames) representing the top-k\n",
" probabilities over all frames.\n",
" top_labels: a list of length k that represents the top-k label strings.\n",
" step: the current time step in the range [0, num_frames].\n",
" image: the image frame to display at the current time step.\n",
" legend_loc: the placement location of the legend.\n",
" duration_seconds: the total duration of the video.\n",
" figure_height: the output figure height.\n",
" playhead_scale: scale value for the playhead.\n",
" grid_alpha: alpha value for the gridlines.\n",
"\n",
" Returns:\n",
" A tuple of the output numpy image, figure, and axes.\n",
" \"\"\"\n",
" num_labels, num_frames = top_probs.shape\n",
" if step is None:\n",
" step = num_frames\n",
"\n",
" fig = plt.figure(figsize=(6.5, 7), dpi=300)\n",
" gs = mpl.gridspec.GridSpec(8, 1)\n",
" ax2 = plt.subplot(gs[:-3, :])\n",
" ax = plt.subplot(gs[-3:, :])\n",
"\n",
" if image is not None:\n",
" ax2.imshow(image, interpolation='nearest')\n",
" ax2.axis('off')\n",
"\n",
" preview_line_x = tf.linspace(0., duration_seconds, num_frames)\n",
" preview_line_y = top_probs\n",
"\n",
" line_x = preview_line_x[:step+1]\n",
" line_y = preview_line_y[:, :step+1]\n",
"\n",
" for i in range(num_labels):\n",
" ax.plot(preview_line_x, preview_line_y[i], label=None, linewidth='1.5',\n",
" linestyle=':', color='gray')\n",
" ax.plot(line_x, line_y[i], label=top_labels[i], linewidth='2.0')\n",
"\n",
"\n",
" ax.grid(which='major', linestyle=':', linewidth='1.0', alpha=grid_alpha)\n",
" ax.grid(which='minor', linestyle=':', linewidth='0.5', alpha=grid_alpha)\n",
"\n",
" min_height = tf.reduce_min(top_probs) * playhead_scale\n",
" max_height = tf.reduce_max(top_probs)\n",
" ax.vlines(preview_line_x[step], min_height, max_height, colors='red')\n",
" ax.scatter(preview_line_x[step], max_height, color='red')\n",
"\n",
" ax.legend(loc=legend_loc)\n",
"\n",
" plt.xlim(0, duration_seconds)\n",
" plt.ylabel('Probability')\n",
" plt.xlabel('Time (s)')\n",
" plt.yscale('log')\n",
"\n",
" fig.tight_layout()\n",
" fig.canvas.draw()\n",
"\n",
" data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)\n",
" data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))\n",
" plt.close()\n",
"\n",
" figure_width = int(figure_height * data.shape[1] / data.shape[0])\n",
" image = PIL.Image.fromarray(data).resize([figure_width, figure_height])\n",
" image = np.array(image)\n",
"\n",
" return image, (fig, ax, ax2)\n",
"\n",
"def plot_streaming_top_preds(\n",
" probs,\n",
" video,\n",
" top_k=5,\n",
" video_fps=25.,\n",
" figure_height=500,\n",
" use_progbar=True):\n",
" \"\"\"Generates a video plot of the top video model predictions.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" video: the video to display in the plot.\n",
" top_k: the number of top predictions to select.\n",
" video_fps: the input video fps.\n",
" figure_fps: the output video fps.\n",
" figure_height: the height of the output video.\n",
" use_progbar: display a progress bar.\n",
"\n",
" Returns:\n",
" A numpy array representing the output video.\n",
" \"\"\"\n",
" video_fps = 8.\n",
" figure_height = 500\n",
" steps = video.shape[0]\n",
" duration = steps / video_fps\n",
"\n",
" top_probs, top_labels, _ = get_top_k_streaming_labels(probs, k=top_k)\n",
"\n",
" images = []\n",
" step_generator = tqdm.trange(steps) if use_progbar else range(steps)\n",
" for i in step_generator:\n",
" image, _ = plot_streaming_top_preds_at_step(\n",
" top_probs=top_probs,\n",
" top_labels=top_labels,\n",
" step=i,\n",
" image=video[i],\n",
" duration_seconds=duration,\n",
" figure_height=figure_height,\n",
" )\n",
" images.append(image)\n",
"\n",
" return np.array(images)\n",
"\n",
"def generate_plot(\n",
" model,\n",
" video_url=None,\n",
" resolution=224,\n",
" video_fps=25,\n",
" display_fps=25):\n",
" # Load the video\n",
" if not video_url:\n",
" video_bytes = list(files.upload().values())[0]\n",
" with open('video', 'wb') as f:\n",
" f.write(video_bytes)\n",
" else:\n",
" urllib.request.urlretrieve(video_url, \"video\")\n",
"\n",
" video = tf.cast(media.read_video('video'), tf.float32) / 255.\n",
" video = tf.image.resize(video, [resolution, resolution], preserve_aspect_ratio=True)\n",
"\n",
" # Create initial states for the stream model\n",
" init_states_fn = model.layers[-1].resolved_object.signatures['init_states']\n",
" init_states = init_states_fn(tf.shape(video[tf.newaxis]))\n",
"\n",
" clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
" all_logits = []\n",
"\n",
" print('Running the model on the video...')\n",
"\n",
" # To run on a video, pass in one frame at a time\n",
" states = init_states\n",
" for clip in tqdm.tqdm(clips):\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" logits, states = model.predict({**states, 'image': clip}, verbose=0)\n",
" all_logits.append(logits)\n",
"\n",
" logits = tf.concat(all_logits, 0)\n",
" probs = tf.nn.softmax(logits)\n",
"\n",
" print('Generating the plot...')\n",
"\n",
" # Generate a plot and output to a video tensor\n",
" plot_video = plot_streaming_top_preds(probs, video, video_fps=video_fps)\n",
" media.show_video(plot_video, fps=display_fps, codec='gif')\n",
"\n",
"model_size = 'm' #@param [\"xs\", \"s\", \"m\", \"l\", \"xl\", \"xxl\"]\n",
"\n",
"model_map = {\n",
" 'xs': 'a0',\n",
" 's': 'a1',\n",
" 'm': 'a2',\n",
" 'l': 'a3',\n",
" 'xl': 'a4',\n",
" 'xxl': 'a5',\n",
"}\n",
"movinet_model_id = model_map[model_size]\n",
"\n",
"model = load_movinet_from_hub(\n",
" movinet_model_id, 'stream', hub_version=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "jO6HrPk8pqo8"
},
"outputs": [],
"source": [
"#@title Generate a video plot.\n",
"\n",
"#@markdown You may add a video URL (gif or mp4) or leave the video_url field blank to upload your own file.\n",
"video_url = \"https://i.pinimg.com/originals/33/5e/31/335e31bc8ed52511da0cfb4bc44e95c7.gif\" #@param {type:\"string\"}\n",
"\n",
"#@markdown The base input resolution to the model. A good value is 224, but can change based on model size.\n",
"resolution = 224 #@param\n",
"#@markdown The fps of the input video.\n",
"video_fps = 12 #@param\n",
"#@markdown The fps to display the output plot. Depending on the duration of the input video, it may help to use a lower fps.\n",
"display_fps = 12 #@param\n",
"\n",
"generate_plot(\n",
" model,\n",
" video_url=video_url,\n",
" resolution=resolution,\n",
" video_fps=video_fps,\n",
" display_fps=display_fps)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"last_runtime": {
"build_target": "//learning/deepmind/dm_python:dm_notebook3",
"kind": "private"
},
"name": "plot_movinet_video_stream_predictions.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -21,12 +21,6 @@ task:
global_batch_size: 4096
dtype: 'float32'
aug_rand_hflip: true
drop_remainder: true
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
......
# Token Dropping for Efficient BERT Pretraining
This is the official implementation of the token dropping method
[Pang et al. Token Dropping for Efficient BERT Pretraining. ACL 2022](#reference).
Token dropping aims to accelerate the pretraining of transformer
models such as BERT without degrading its performance on downstream tasks. In
particular, we drop unimportant tokens starting from an intermediate layer in
the model, to make the model focus on important tokens more efficiently with its
limited computational resources. The dropped tokens are later picked up by the
last layer of the model, so that the model still produces full-length sequences.
We leverage the already built-in masked language modeling (MLM) loss and its
dynamics to identify unimportant tokens with practically no computational
overhead. In our experiments, this simple approach reduces the pretraining cost
of BERT by 25% while achieving slightly better overall fine-tuning performance
on standard downstream tasks.
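As a rough sketch of the core idea (the helper below is hypothetical, not this repository's API), the gather-and-scatter pattern looks like the following, where `importance` is a per-token score derived from the running MLM loss:
```python
import tensorflow as tf

def drop_and_restore(hidden, importance, keep_k):
  """Illustrative token dropping: middle layers see only the top-k tokens.

  hidden: [batch, seq_len, width] activations at the intermediate layer.
  importance: [batch, seq_len] per-token scores (e.g., running MLM loss).
  keep_k: number of tokens kept for the middle layers.
  """
  # Keep the k most important tokens, so the expensive middle layers
  # operate on a shorter sequence.
  top_idx = tf.math.top_k(importance, k=keep_k).indices    # [batch, keep_k]
  kept = tf.gather(hidden, top_idx, batch_dims=1)          # [batch, keep_k, width]

  processed = kept  # stand-in for the middle transformer layers

  # The last layer picks the dropped tokens back up: scatter the processed
  # tokens into the full-length sequence so the output stays full length.
  batch = tf.shape(hidden)[0]
  batch_idx = tf.broadcast_to(tf.range(batch)[:, None], tf.shape(top_idx))
  scatter_idx = tf.stack([batch_idx, top_idx], axis=-1)    # [batch, keep_k, 2]
  return tf.tensor_scatter_nd_update(hidden, scatter_idx, processed)
```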
A BERT model pretrained using this token dropping method is no different from
a BERT model pretrained in the conventional way: a BERT checkpoint pretrained
with token dropping can be viewed and used as a normal BERT checkpoint, for
fine-tuning, etc. Thus, this README file only illustrates how to run token
dropping for pretraining.
### Requirements
The starter code requires TensorFlow. If you haven't installed it yet, follow
the instructions on [tensorflow.org][1].
This code has been tested with TensorFlow 2.5.0. Going forward,
we will continue to target the latest released version of TensorFlow.
Please verify that you have Python 3.6+ and TensorFlow 2.5.0 or higher
installed by running the following commands:
```sh
python --version
python -c 'import tensorflow as tf; print(tf.__version__)'
```
Refer to the [instructions here][2]
for using the model in this repo. Make sure to add the models folder to your
Python path.
[1]: https://www.tensorflow.org/install/
[2]:
https://github.com/tensorflow/models/tree/master/official#running-the-models
Then, you need to generate pretraining data. See
[these instructions](https://github.com/tensorflow/models/blob/27fb855b027ead16d2616dcb59c67409a2176b7f/official/legacy/bert/README.md#pre-training)
on how to do that.
## Train using the config files
After you have generated your pretraining data, run the following command to
start pretraining:
```bash
PARAMS="task.train_data.input_data=/path/to/train/data"
PARAMS="${PARAMS},task.validation_data.input_path=/path/to/validation/data"
PARAMS="${PARAMS},runtime.distribution_strategy=tpu"
python3 train.py \
--experiment=token_drop_bert/pretraining \
--config_file=wiki_books_pretrain_sequence_pack.yaml \
--config_file=bert_en_uncased_base_token_drop.yaml \
--params_override=${PARAMS} \
--tpu=local \
--model_dir=/folder/to/hold/logs/and/models/ \
--mode=train_and_eval
```
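The command above assumes a TPU. To train on GPUs instead, one option is to
replace `runtime.distribution_strategy=tpu` with
`runtime.distribution_strategy=mirrored` in `PARAMS` and drop the
`--tpu=local` flag.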
## Implementation
We implement the encoder and layers using `tf.keras` APIs in NLP
modeling library:
* [masked_lm.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/masked_lm.py)
contains the BERT pretraining task.
* [experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/experiment_configs.py)
registers the token dropping experiment.
* [encoder.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/encoder.py)
contains the BERT encoder that supports token dropping.
* [encoder_config.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/encoder_config.py)
contains the config and method for instantiating the token dropping BERT
encoder.
* [train.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/train.py)
is the program entry.
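For a quick smoke test, the encoder can also be instantiated directly. A
minimal sketch, assuming BERT-base-like hyperparameters (illustrative values,
not the paper's exact settings):
```python
import tensorflow as tf

from official.projects.token_dropping import encoder

# Illustrative configuration; see encoder.py for the full argument list.
token_drop_encoder = encoder.TokenDropBertEncoder(
    vocab_size=30522,
    hidden_size=768,
    num_layers=12,
    num_attention_heads=12,
    token_loss_init_value=10.0,  # default loss for never-masked tokens
    token_loss_beta=0.995,       # moving-average factor for token loss
    token_keep_k=256,            # tokens kept in the intermediate layers
    token_allow_list=(100, 101, 102, 103),  # [UNK], [CLS], [SEP], [MASK]
    token_deny_list=(0,))        # [PAD]

outputs = token_drop_encoder(dict(
    input_word_ids=tf.ones((2, 512), dtype=tf.int32),
    input_mask=tf.ones((2, 512), dtype=tf.int32),
    input_type_ids=tf.zeros((2, 512), dtype=tf.int32)))
print(outputs['sequence_output'].shape)  # (2, 512, 768)
```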
## Reference
Please cite our paper:
```
@inproceedings{pang2022,
title={Token Dropping for Efficient BERT Pretraining},
author={Richard Yuanzhe Pang*, Le Hou*, Tianyi Zhou, Yuexin Wu, Xinying Song, Xiaodan Song, Denny Zhou},
year={2022},
organization={Association for Computational Linguistics}
}
```
task:
model:
encoder:
type: any
any:
token_allow_list: !!python/tuple
- 100 # [UNK]
- 101 # [CLS]
- 102 # [SEP]
- 103 # [MASK]
token_deny_list: !!python/tuple
- 0 # [PAD]
attention_dropout_rate: 0.1
dropout_rate: 0.1
hidden_activation: gelu
hidden_size: 768
initializer_range: 0.02
intermediate_size: 3072
max_position_embeddings: 512
num_attention_heads: 12
num_layers: 12
type_vocab_size: 2
vocab_size: 30522
token_loss_init_value: 10.0
token_loss_beta: 0.995
token_keep_k: 256
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer-based BERT encoder network."""
# pylint: disable=g-classes-have-attributes
from typing import Any, Callable, Optional, Union, Tuple
from absl import logging
import tensorflow as tf
from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
class TokenDropBertEncoder(tf.keras.layers.Layer):
"""Bi-directional Transformer-based encoder network with token dropping.
During pretraining, we drop unimportant tokens starting from an intermediate
layer in the model, to make the model focus on important tokens more
efficiently with its limited computational resources. The dropped tokens are
later picked up by the last layer of the model, so that the model still
produces full-length sequences. This approach reduces the pretraining cost of
BERT by 25% while achieving better overall fine-tuning performance on standard
downstream tasks.
Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers within
the transformer layers.
    token_loss_init_value: The default loss value of a token that has never
      been masked and predicted.
    token_loss_beta: The moving-average factor used to compute the running
      average loss value of a token.
token_keep_k: The number of tokens you want to keep in the intermediate
layers. The rest will be dropped in those layers.
    token_allow_list: The list of token-ids that should never be dropped. In
      the BERT English vocab, token-ids 1 to 998 are special tokens such as
      [CLS] and [SEP]. By default, token_allow_list contains [UNK], [CLS],
      [SEP], and [MASK].
    token_deny_list: The list of token-ids that should always be dropped. In
      the BERT English vocab, token-id 0 is [PAD]. By default, token_deny_list
      contains only [PAD].
    initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to generate
embeddings for the input word IDs.
    norm_first: Whether to normalize inputs to attention and intermediate
      dense layers. If set to False, the output of the attention and
      intermediate dense layers is normalized instead.
with_dense_inputs: Whether to accept dense embeddings as the input.
"""
def __init__(
self,
vocab_size: int,
hidden_size: int = 768,
num_layers: int = 12,
num_attention_heads: int = 12,
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
token_loss_init_value: float = 10.0,
token_loss_beta: float = 0.995,
token_keep_k: int = 256,
token_allow_list: Tuple[int, ...] = (100, 101, 102, 103),
token_deny_list: Tuple[int, ...] = (0,),
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
stddev=0.02),
output_range: Optional[int] = None,
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
with_dense_inputs: bool = False,
**kwargs):
# Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs:
kwargs.pop('dict_outputs')
if 'return_all_encoder_outputs' in kwargs:
kwargs.pop('return_all_encoder_outputs')
if 'intermediate_size' in kwargs:
inner_dim = kwargs.pop('intermediate_size')
if 'activation' in kwargs:
inner_activation = kwargs.pop('activation')
if 'dropout_rate' in kwargs:
output_dropout = kwargs.pop('dropout_rate')
if 'attention_dropout_rate' in kwargs:
attention_dropout = kwargs.pop('attention_dropout_rate')
super().__init__(**kwargs)
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
self._embedding_dropout = tf.keras.layers.Dropout(
rate=output_dropout, name='embedding_dropout')
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
self._embedding_projection = None
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
    # The first 999 token-ids are special tokens such as [PAD], [CLS], [SEP].
    # We want to always drop [PAD], and never drop [CLS] or [SEP].
    init_importance = tf.constant(token_loss_init_value, shape=(vocab_size,))
if token_allow_list:
init_importance = tf.tensor_scatter_nd_update(
tensor=init_importance,
indices=[[x] for x in token_allow_list],
updates=[1.0e4 for x in token_allow_list])
if token_deny_list:
init_importance = tf.tensor_scatter_nd_update(
tensor=init_importance,
indices=[[x] for x in token_deny_list],
updates=[-1.0e4 for x in token_deny_list])
self._token_importance_embed = layers.TokenImportanceWithMovingAvg(
vocab_size=vocab_size,
init_importance=init_importance,
moving_average_beta=token_loss_beta)
self._token_separator = layers.SelectTopK(top_k=token_keep_k)
self._transformer_layers = []
self._num_layers = num_layers
self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask')
for i in range(num_layers):
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
self._config = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'token_loss_init_value': token_loss_init_value,
'token_loss_beta': token_loss_beta,
'token_keep_k': token_keep_k,
'token_allow_list': token_allow_list,
'token_deny_list': token_deny_list,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
'with_dense_inputs': with_dense_inputs,
}
if with_dense_inputs:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_inputs=tf.keras.Input(
shape=(None, embedding_width), dtype=tf.float32),
dense_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
)
else:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs):
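    """Runs the encoder, dropping unimportant tokens in middle layers."""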
if isinstance(inputs, dict):
word_ids = inputs.get('input_word_ids')
mask = inputs.get('input_mask')
type_ids = inputs.get('input_type_ids')
dense_inputs = inputs.get('dense_inputs', None)
dense_mask = inputs.get('dense_mask', None)
dense_type_ids = inputs.get('dense_type_ids', None)
else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__)
word_embeddings = self._embedding_layer(word_ids)
if dense_inputs is not None:
# Concat the dense embeddings at sequence end.
word_embeddings = tf.concat([word_embeddings, dense_inputs], axis=1)
type_ids = tf.concat([type_ids, dense_type_ids], axis=1)
mask = tf.concat([mask, dense_mask], axis=1)
# absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = word_embeddings + position_embeddings + type_embeddings
embeddings = self._embedding_norm_layer(embeddings)
embeddings = self._embedding_dropout(embeddings)
if self._embedding_projection is not None:
embeddings = self._embedding_projection(embeddings)
attention_mask = self._attention_mask_layer(embeddings, mask)
encoder_outputs = []
x = embeddings
# Get token routing.
token_importance = self._token_importance_embed(word_ids)
selected, not_selected = self._token_separator(token_importance)
# For a 12-layer BERT:
    # 1. All tokens first go through 5 transformer layers, then
# 2. Only important tokens go through 1 transformer layer with cross
# attention to unimportant tokens, then
# 3. Only important tokens go through 5 transformer layers without cross
# attention.
# 4. Finally, all tokens go through the last layer.
# Step 1.
for layer in self._transformer_layers[:self._num_layers // 2 - 1]:
x = layer([x, attention_mask])
encoder_outputs.append(x)
# Step 2.
# First, separate important and non-important tokens.
x_selected = tf.gather(x, selected, batch_dims=1, axis=1)
mask_selected = tf.gather(mask, selected, batch_dims=1, axis=1)
attention_mask_token_drop = self._attention_mask_layer(
x_selected, mask_selected)
x_not_selected = tf.gather(x, not_selected, batch_dims=1, axis=1)
mask_not_selected = tf.gather(mask, not_selected, batch_dims=1, axis=1)
attention_mask_token_pass = self._attention_mask_layer(
x_selected, tf.concat([mask_selected, mask_not_selected], axis=1))
x_all = tf.concat([x_selected, x_not_selected], axis=1)
# Then, call transformer layer with cross attention.
x_selected = self._transformer_layers[self._num_layers // 2 - 1](
[x_selected, x_all, attention_mask_token_pass])
encoder_outputs.append(x_selected)
# Step 3.
for layer in self._transformer_layers[self._num_layers // 2:-1]:
x_selected = layer([x_selected, attention_mask_token_drop])
encoder_outputs.append(x_selected)
# Step 4.
# First, merge important and non-important tokens.
x_not_selected = tf.cast(x_not_selected, dtype=x_selected.dtype)
x = tf.concat([x_selected, x_not_selected], axis=1)
indices = tf.concat([selected, not_selected], axis=1)
reverse_indices = tf.argsort(indices)
x = tf.gather(x, reverse_indices, batch_dims=1, axis=1)
# Then, call transformer layer with all tokens.
x = self._transformer_layers[-1]([x, attention_mask])
encoder_outputs.append(x)
last_encoder_output = encoder_outputs[-1]
first_token_tensor = last_encoder_output[:, 0, :]
pooled_output = self._pooler_layer(first_token_tensor)
return dict(
sequence_output=encoder_outputs[-1],
pooled_output=pooled_output,
encoder_outputs=encoder_outputs)
def record_mlm_loss(self, mlm_ids: tf.Tensor, mlm_losses: tf.Tensor):
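    """Updates moving-average token importance from per-token MLM losses."""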
self._token_importance_embed.update_token_importance(
token_ids=mlm_ids, importance=mlm_losses)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config)
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
warn_string = (
'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
'train this model, the embedding layer will no longer be shared. '
'To work around this, load the model outside of the Keras API.')
print('WARNING: ' + warn_string)
logging.warn(warn_string)
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Token dropping encoder configuration and instantiation."""
import dataclasses
from typing import Tuple
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.projects.token_dropping import encoder
@dataclasses.dataclass
class TokenDropBertEncoderConfig(encoders.BertEncoderConfig):
token_loss_init_value: float = 10.0
token_loss_beta: float = 0.995
token_keep_k: int = 256
token_allow_list: Tuple[int, ...] = (100, 101, 102, 103)
token_deny_list: Tuple[int, ...] = (0,)
@base_config.bind(TokenDropBertEncoderConfig)
def get_encoder(encoder_cfg: TokenDropBertEncoderConfig):
"""Instantiates 'TokenDropBertEncoder'.
Args:
encoder_cfg: A 'TokenDropBertEncoderConfig'.
Returns:
    An 'encoder.TokenDropBertEncoder' object.
"""
return encoder.TokenDropBertEncoder(
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
intermediate_size=encoder_cfg.intermediate_size,
activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
dict_outputs=True,
norm_first=encoder_cfg.norm_first,
token_loss_init_value=encoder_cfg.token_loss_init_value,
token_loss_beta=encoder_cfg.token_loss_beta,
token_keep_k=encoder_cfg.token_keep_k,
token_allow_list=encoder_cfg.token_allow_list,
token_deny_list=encoder_cfg.token_deny_list)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import bert_encoder
from official.projects.token_dropping import encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class TokenDropBertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(TokenDropBertEncoderTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_dict_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_dict_outputs_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
dict_outputs=True,
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_dict_outputs_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=4,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # If float_dtype is set to float16, the data output is float32 (from a
    # layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence_encoder", None, 21),
("output_range_encoder", 1, 1),
)
def test_dict_outputs_network_invocation(
self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
inputs = dict(
input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids)
_ = test_network(inputs)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
return_all_encoder_outputs=True,
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=4,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # If float_dtype is set to float16, the data output is float32 (from a
    # layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
class TokenDropCompatibilityTest(tf.test.TestCase):
def tearDown(self):
super().tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_checkpoint_forward_compatible(self):
batch_size = 3
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
kwargs = dict(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=None)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
data = dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data)
old_net = bert_encoder.BertEncoderV2(**kwargs)
old_net_outputs = old_net(data)
ckpt = tf.train.Checkpoint(net=old_net)
path = ckpt.save(self.get_temp_dir())
new_net = encoder.TokenDropBertEncoder(
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=(),
**kwargs)
new_ckpt = tf.train.Checkpoint(net=new_net)
status = new_ckpt.restore(path)
status.assert_existing_objects_matched()
# assert_consumed will fail because the old model has redundant nodes.
new_net_outputs = new_net(data)
self.assertAllEqual(old_net_outputs.keys(), new_net_outputs.keys())
for key in old_net_outputs:
self.assertAllClose(old_net_outputs[key], new_net_outputs[key])
def test_keras_model_checkpoint_forward_compatible(self):
batch_size = 3
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
kwargs = dict(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=None)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
data = dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data)
old_net = bert_encoder.BertEncoderV2(**kwargs)
inputs = old_net.inputs
outputs = old_net(inputs)
old_model = tf.keras.Model(inputs=inputs, outputs=outputs)
old_model_outputs = old_model(data)
ckpt = tf.train.Checkpoint(net=old_model)
path = ckpt.save(self.get_temp_dir())
new_net = encoder.TokenDropBertEncoder(
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=(),
**kwargs)
inputs = new_net.inputs
outputs = new_net(inputs)
new_model = tf.keras.Model(inputs=inputs, outputs=outputs)
new_ckpt = tf.train.Checkpoint(net=new_model)
new_ckpt.restore(path)
new_model_outputs = new_model(data)
self.assertAllEqual(old_model_outputs.keys(), new_model_outputs.keys())
for key in old_model_outputs:
self.assertAllClose(old_model_outputs[key], new_model_outputs[key])
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Token dropping BERT experiment configurations.
Only pretraining configs are defined here. Token dropping BERT checkpoints can
be used directly as regular BERT checkpoints, so you can just use regular BERT
for fine-tuning.
"""
# pylint: disable=g-doc-return-or-yield,line-too-long
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.projects.token_dropping import encoder_config
from official.projects.token_dropping import masked_lm
@exp_factory.register_config_factory('token_drop_bert/pretraining')
def token_drop_bert_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining with token dropping."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.TokenDropMaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
any=encoder_config.TokenDropBertEncoderConfig(
vocab_size=30522, num_layers=1, token_keep_k=64),
type='any')),
train_data=pretrain_dataloader.BertPretrainDataConfig(),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
is_training=False)),
trainer=cfg.TrainerConfig(
train_steps=1000000,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 1e-4,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Masked language task."""
import dataclasses
from typing import Tuple
import tensorflow as tf
from official.core import task_factory
from official.nlp.tasks import masked_lm
@dataclasses.dataclass
class TokenDropMaskedLMConfig(masked_lm.MaskedLMConfig):
"""The model config."""
pass
@task_factory.register_task_cls(TokenDropMaskedLMConfig)
class TokenDropMaskedLMTask(masked_lm.MaskedLMTask):
"""Task object for Mask language modeling."""
def build_losses(self,
labels,
model_outputs,
metrics,
aux_losses=None) -> Tuple[tf.Tensor, tf.Tensor]:
"""Return the final loss, and the masked-lm loss."""
with tf.name_scope('MaskedLMTask/losses'):
metrics = dict([(metric.name, metric) for metric in metrics])
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels['masked_lm_ids'],
tf.cast(model_outputs['mlm_logits'], tf.float32),
from_logits=True)
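      # Keep the unreduced per-prediction losses; train_step feeds them back
      # to the encoder to update token importance for dropping.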
lm_label_weights = labels['masked_lm_weights']
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses *
lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
metrics['lm_example_loss'].update_state(mlm_loss)
if 'next_sentence_labels' in labels:
sentence_labels = labels['next_sentence_labels']
sentence_outputs = tf.cast(
model_outputs['next_sentence'], dtype=tf.float32)
sentence_loss = tf.reduce_mean(
tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels, sentence_outputs, from_logits=True))
metrics['next_sentence_loss'].update_state(sentence_loss)
total_loss = mlm_loss + sentence_loss
else:
total_loss = mlm_loss
if aux_losses:
total_loss += tf.add_n(aux_losses)
return total_loss, lm_prediction_losses
def train_step(self, inputs, model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer, metrics):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
# Computes per-replica loss.
loss, lm_prediction_losses = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
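      # Feed the per-token MLM losses back into the encoder's moving-average
      # token-importance table; this drives which tokens get dropped.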
model.encoder_network.record_mlm_loss(
mlm_ids=inputs['masked_lm_ids'],
mlm_losses=lm_prediction_losses)
if self.task_config.scale_loss:
# Scales loss as the default gradients allreduce performs sum inside the
# optimizer.
scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
tvars = model.trainable_variables
if self.task_config.scale_loss:
grads = tape.gradient(scaled_loss, tvars)
else:
grads = tape.gradient(loss, tvars)
optimizer.apply_gradients(list(zip(grads, tvars)))
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
def validation_step(self, inputs, model: tf.keras.Model, metrics):
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
outputs = self.inference_step(inputs, model)
loss, _ = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.tasks.masked_lm."""
import tensorflow as tf
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.projects.token_dropping import encoder_config
from official.projects.token_dropping import masked_lm
class MLMTaskTest(tf.test.TestCase):
def test_task(self):
config = masked_lm.TokenDropMaskedLMConfig(
init_checkpoint=self.get_temp_dir(),
scale_loss=True,
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
any=encoder_config.TokenDropBertEncoderConfig(
vocab_size=30522, num_layers=1, token_keep_k=64),
type="any"),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=2, name="next_sentence")
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
input_path="dummy",
max_predictions_per_seq=20,
seq_length=128,
global_batch_size=1))
task = masked_lm.TokenDropMaskedLMTask(config)
model = task.build_model()
metrics = task.build_metrics()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
# Saves a checkpoint.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
ckpt.save(config.init_checkpoint)
task.initialize(model)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training binary for running token dropping experiments."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.projects.token_dropping import experiment_configs # pylint: disable=unused-import
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise, a continuous eval
    # job may race against the train job to write the same file.
train_utils.serialize_config(params, model_dir)
  # Sets the mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can significantly speed up models by using float16 on GPUs and bfloat16 on
  # TPUs. loss_scale takes effect only when dtype is float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
task:
init_checkpoint: ''
model:
cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}]
train_data:
drop_remainder: true
global_batch_size: 512
input_path: /path-to-data/wikipedia.tfrecord*,/path-to-data/books.tfrecord*
is_training: true
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: true
validation_data:
drop_remainder: false
global_batch_size: 512
input_path: /path-to-data/wikipedia.tfrecord*,/path-to-data/books.tfrecord*
is_training: false
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: true
trainer:
checkpoint_interval: 20000
max_to_keep: 5
optimizer_config:
learning_rate:
polynomial:
cycle: false
decay_steps: 1000000
end_learning_rate: 0.0
initial_learning_rate: 0.0001
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 10000
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
train_steps: 1000000
validation_interval: 1000
validation_steps: 64
task:
init_checkpoint: ''
model:
cls_heads: []
train_data:
drop_remainder: true
global_batch_size: 512
input_path: /path-to-packed-data/wikipedia.tfrecord*,/path-to-packed-data/books.tfrecord*
is_training: true
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: false
use_position_id: false
use_v2_feature_names: true
validation_data:
drop_remainder: false
global_batch_size: 512
input_path: /path-to-packed-data/wikipedia.tfrecord*,/path-to-packed-data/books.tfrecord*
is_training: false
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: false
use_position_id: false
use_v2_feature_names: true
trainer:
checkpoint_interval: 20000
max_to_keep: 5
optimizer_config:
learning_rate:
polynomial:
cycle: false
decay_steps: 1000000
end_learning_rate: 0.0
initial_learning_rate: 0.0001
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 10000
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
train_steps: 1000000
validation_interval: 1000
validation_steps: 64
......@@ -37,7 +37,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.')
flags.DEFINE_string(
'code_url_prefix',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/nlp',
'The url prefix for links to code.')
flags.DEFINE_bool('search_hints', True,
......@@ -66,9 +66,11 @@ def gen_api_docs(code_url_prefix, site_path, output_dir, project_short_name,
del tfm.nlp.layers.MultiHeadAttention
del tfm.nlp.layers.EinsumDense
branch = code_url_prefix.strip('/').split('/')[-2]
official_url_prefix = (
f'https://github.com/tensorflow/models/blob/{branch}/official/')
url_parts = code_url_prefix.strip('/').split('/')
url_parts = url_parts[:url_parts.index('tensorflow_models')]
url_parts.append('official')
official_url_prefix = '/'.join(url_parts)
nlp_base_dir = pathlib.Path(tfm.nlp.__file__).parent
......
......@@ -38,7 +38,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.')
flags.DEFINE_string(
'code_url_prefix',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/vision',
'The url prefix for links to code.')
flags.DEFINE_bool('search_hints', True,
......@@ -64,9 +64,11 @@ def gen_api_docs(code_url_prefix, site_path, output_dir, project_short_name,
"""Generates api docs for the tensorflow docs package."""
build_api_docs_lib.hide_module_model_and_layer_methods()
branch = code_url_prefix.strip('/').split('/')[-2]
official_url_prefix = (
f'https://github.com/tensorflow/models/blob/{branch}/official/')
url_parts = code_url_prefix.strip('/').split('/')
url_parts = url_parts[:url_parts.index('tensorflow_models')]
url_parts.append('official')
official_url_prefix = '/'.join(url_parts)
vision_base_dir = pathlib.Path(tfm.vision.__file__).parent
......
......@@ -15,5 +15,5 @@
"""Vision package definition."""
# Lint as: python3
# pylint: disable=unused-import
from official.vision.beta import configs
from official.vision.beta import tasks
from official.vision import configs
from official.vision import tasks
......@@ -405,9 +405,9 @@ class SpineNet(tf.keras.Model):
if (block_spec.level < self._min_level or
block_spec.level > self._max_level):
logging.warning(
'SpineNet output level out of range [min_level, max_level] = '
'SpineNet output level %s out of range [min_level, max_level] = '
'[%s, %s] will not be used for further processing.',
self._min_level, self._max_level)
block_spec.level, self._min_level, self._max_level)
endpoints[str(block_spec.level)] = x
return endpoints
......