Commit bb124157 authored by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE

parents 2e9bb539 0edeb7f6
* @tensorflow/tf-garden-team @tensorflow/tf-model-garden-team
/official/ @rachellj218 @saberkun @jaeyounkim
/official/nlp/ @saberkun @chenGitHuber @lehougoogle @rachellj218 @jaeyounkim
-/official/vision/ @pengchongjin @xianzhidu @yeqingli @arashwan @saberkun @rachellj218 @jaeyounkim
+/official/vision/ @xianzhidu @yeqingli @arashwan @saberkun @rachellj218 @jaeyounkim
/research/adversarial_text/ @rsepassi @a-dai
/research/attention_ocr/ @xavigibert
/research/audioset/ @plakal @dpwe
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "vXLA5InzXydn"
},
"source": [
"##### Copyright 2021 The TensorFlow Authors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "RuRlpLL-X0R_"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fsACVQpVSifi"
},
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* pip will install all models and dependencies automatically."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hYEwGTeCXnnX"
},
"source": [
"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
" \u003c/td\u003e\n",
" \u003ctd\u003e\n",
" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
" \u003c/td\u003e\n",
"\u003c/table\u003e"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2j-xhrsVQOQT"
},
"outputs": [],
"source": [
"pip install tf-models-nightly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BjP7zwxmskpY"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import tensorflow as tf\n",
"\n",
"from official import nlp\n",
"from official.nlp.modeling.ops import sampling_module\n",
"from official.nlp.modeling.ops import beam_search"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0AWgyo-IQ5sP"
},
"source": [
"# Decoding API\n",
"This API provides an interface to experiment with different decoding strategies used for auto-regressive models.\n",
"\n",
"1. The following sampling strategies are provided in sampling_module.py, which inherits from the base Decoding class:\n",
" * [top_p](https://arxiv.org/abs/1904.09751) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L65) \n",
"\n",
" This implementation chooses most probable logits with cumulative probabilities upto top_p.\n",
"\n",
" * [top_k](https://arxiv.org/pdf/1805.04833.pdf) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L48)\n",
"\n",
" At each timestep, this implementation samples from top-k logits based on their probability distribution\n",
"\n",
" * Greedy : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L26)\n",
"\n",
" This implementation returns the top logits based on probabilities.\n",
"\n",
"2. Beam search is provided in beam_search.py. [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search.py)\n",
"\n",
" This implementation reduces the risk of missing hidden high probability logits by keeping the most likely num_beams of logits at each time step and eventually choosing the logits that has the overall highest probability."
]
},
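{
"cell_type": "markdown",
"metadata": {},
"source": [
"The toy example below is illustrative only (plain NumPy, not part of the Decoding API): it shows how the top-k and top-p rules select candidate ids from a small next-token distribution."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A toy next-token distribution over a 5-token vocabulary.\n",
"probs = np.array([0.5, 0.2, 0.15, 0.1, 0.05])\n",
"sorted_ids = np.argsort(-probs)  # ids ordered by decreasing probability\n",
"\n",
"# top-k keeps the k most probable ids.\n",
"k = 2\n",
"print(\"top-k ids:\", sorted_ids[:k])\n",
"\n",
"# top-p keeps the smallest set whose cumulative probability reaches p.\n",
"p = 0.8\n",
"cumulative = np.cumsum(probs[sorted_ids])\n",
"cutoff = np.searchsorted(cumulative, p) + 1\n",
"print(\"top-p ids:\", sorted_ids[:cutoff])"
]
},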
{
"cell_type": "markdown",
"metadata": {
"id": "MfOj7oaBRQnS"
},
"source": [
"## Initialize Sampling Module in TF-NLP.\n",
"\n",
"\n",
"\u003e **symbols_to_logits_fn** : This is a closure implemented by the users of the API. The input to this closure will be \n",
"```\n",
"Args:\n",
" 1] ids [batch_size, .. (index + 1 or 1 if padded_decode is True)],\n",
" 2] index [scalar] : current decoded step,\n",
" 3] cache [nested dictionary of tensors].\n",
"Returns:\n",
" 1] tensor for next-step logits [batch_size, vocab]\n",
" 2] the updated_cache [nested dictionary of tensors].\n",
"```\n",
"This closure calls the model to predict the logits for the 'index+1' step. The cache is used for faster decoding.\n",
"Here is a [reference](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search_test.py#L88) implementation for the above closure.\n",
"\n",
"\n",
"\u003e **length_normalization_fn** : Closure for returning length normalization parameter.\n",
"```\n",
"Args: \n",
" 1] length : scalar for decoded step index.\n",
" 2] dtype : data-type of output tensor\n",
"Returns:\n",
" 1] value of length normalization factor.\n",
"Example :\n",
" def _length_norm(length, dtype):\n",
" return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)\n",
"```\n",
"\n",
"\u003e **vocab_size** : Output vocabulary size.\n",
"\n",
"\u003e **max_decode_length** : Scalar for total number of decoding steps.\n",
"\n",
"\u003e **eos_id** : Decoding will stop if all output decoded ids in the batch have this ID.\n",
"\n",
"\u003e **padded_decode** : Set this to True if running on TPU. Tensors are padded to max_decoding_length if this is True.\n",
"\n",
"\u003e **top_k** : top_k is enabled if this value is \u003e 1.\n",
"\n",
"\u003e **top_p** : top_p is enabled if this value is \u003e 0 and \u003c 1.0\n",
"\n",
"\u003e **sampling_temperature** : This is used to re-estimate the softmax output. Temperature skews the distribution towards high probability tokens and lowers the mass in tail distribution. Value has to be positive. Low temperature is equivalent to greedy and makes the distribution sharper, while high temperature makes it more flat.\n",
"\n",
"\u003e **enable_greedy** : By default, this is true and greedy decoding is enabled.\n"
]
},
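{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick, standalone illustration of `sampling_temperature` (plain TensorFlow, not part of the sampling module): dividing the logits by the temperature before the softmax sharpens the distribution for temperatures below 1 and flattens it for temperatures above 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logits = np.array([2.0, 1.0, 0.5])\n",
"for temperature in [0.5, 1.0, 2.0]:\n",
"  # Rescale the logits, then renormalize with a softmax.\n",
"  scaled = tf.nn.softmax(logits / temperature)\n",
"  print(\"temperature\", temperature, \":\", np.round(scaled.numpy(), 3))"
]
},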
{
"cell_type": "markdown",
"metadata": {
"id": "lV1RRp6ihnGX"
},
"source": [
"# Initialize the Model Hyper-parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eTsGp2gaKLdE"
},
"outputs": [],
"source": [
"params = {}\n",
"params['num_heads'] = 2\n",
"params['num_layers'] = 2\n",
"params['batch_size'] = 2\n",
"params['n_dims'] = 256\n",
"params['max_decode_length'] = 4"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UGvmd0_dRFYI"
},
"source": [
"## What is a Cache?\n",
"In auto-regressive architectures like Transformer based [Encoder-Decoder](https://arxiv.org/abs/1706.03762) models, \n",
"Cache is used for fast sequential decoding.\n",
"It is a nested dictionary storing pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) for every layer.\n",
"\n",
"```\n",
"{\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers']),\n",
" 'model_specific_item' : Model specific tensor shape,\n",
"}\n",
"\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CYXkoplAij01"
},
"source": [
"# Initialize cache. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "D6kfZOOKgkm1"
},
"outputs": [],
"source": [
"cache = {\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers'])\n",
" }\n",
"print(\"cache key shape for layer 1 :\", cache['layer_1']['k'].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nNY3Xn8SiblP"
},
"source": [
"# Define closure for length normalization. **optional.**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "T92ccAzlnGqh"
},
"outputs": [],
"source": [
"def length_norm(length, dtype):\n",
" \"\"\"Return length normalization factor.\"\"\"\n",
" return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "syl7I5nURPgW"
},
"source": [
"# Create model_fn\n",
" In practice, this will be replaced by an actual model implementation such as [here](https://github.com/tensorflow/models/blob/master/official/nlp/transformer/transformer.py#L236)\n",
"```\n",
"Args:\n",
"i : Step that is being decoded.\n",
"Returns:\n",
" logit probabilities of size [batch_size, 1, vocab_size]\n",
"```\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AhzSkRisRdB6"
},
"outputs": [],
"source": [
"probabilities = tf.constant([[[0.3, 0.4, 0.3], [0.3, 0.3, 0.4],\n",
" [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],\n",
" [[0.2, 0.5, 0.3], [0.2, 0.7, 0.1],\n",
" [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]])\n",
"def model_fn(i):\n",
" return probabilities[:, i, :]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DBMUkaVmVZBg"
},
"source": [
"# Initialize symbols_to_logits_fn\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FAJ4CpbfVdjr"
},
"outputs": [],
"source": [
"def _symbols_to_logits_fn():\n",
" \"\"\"Calculates logits of the next tokens.\"\"\"\n",
" def symbols_to_logits_fn(ids, i, temp_cache):\n",
" del ids\n",
" logits = tf.cast(tf.math.log(model_fn(i)), tf.float32)\n",
" return logits, temp_cache\n",
" return symbols_to_logits_fn"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R_tV3jyWVL47"
},
"source": [
"# Greedy \n",
"Greedy decoding selects the token id with the highest probability as its next id: $id_t = argmax_{w}P(id | id_{1:t-1})$ at each timestep $t$. The following sketch shows greedy decoding. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGt9idSkVQEJ"
},
"outputs": [],
"source": [
"greedy_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=None,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" padded_decode=False)\n",
"ids, _ = greedy_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"Greedy Decoded Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "s4pTTsQXVz5O"
},
"source": [
"# top_k sampling\n",
"In *Top-K* sampling, the *K* most likely next token ids are filtered and the probability mass is redistributed among only those *K* ids. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pCLWIn6GV5_G"
},
"outputs": [],
"source": [
"top_k_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=length_norm,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" sample_temperature=tf.constant(1.0),\n",
" top_k=tf.constant(3),\n",
" padded_decode=False,\n",
" enable_greedy=False)\n",
"ids, _ = top_k_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"top-k sampled Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Jp3G-eE_WI4Y"
},
"source": [
"# top_p sampling\n",
"Instead of sampling only from the most likely *K* token ids, in *Top-p* sampling chooses from the smallest possible set of ids whose cumulative probability exceeds the probability *p*."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rEGdIWcuWILO"
},
"outputs": [],
"source": [
"top_p_obj = sampling_module.SamplingModule(\n",
" length_normalization_fn=length_norm,\n",
" dtype=tf.float32,\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" vocab_size=3,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" sample_temperature=tf.constant(1.0),\n",
" top_p=tf.constant(0.9),\n",
" padded_decode=False,\n",
" enable_greedy=False)\n",
"ids, _ = top_p_obj.generate(\n",
" initial_ids=tf.constant([9, 1]), initial_cache=cache)\n",
"print(\"top-p sampled Ids:\", ids)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2hcuyJ2VWjDz"
},
"source": [
"# Beam search decoding\n",
"Beam search reduces the risk of missing hidden high probability token ids by keeping the most likely num_beams of hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cJ3WzvSrWmSA"
},
"outputs": [],
"source": [
"beam_size = 2\n",
"params['batch_size'] = 1\n",
"beam_cache = {\n",
" 'layer_%d' % layer: {\n",
" 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32),\n",
" 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32)\n",
" } for layer in range(params['num_layers'])\n",
" }\n",
"print(\"cache key shape for layer 1 :\", beam_cache['layer_1']['k'].shape)\n",
"ids, _ = beam_search.sequence_beam_search(\n",
" symbols_to_logits_fn=_symbols_to_logits_fn(),\n",
" initial_ids=tf.constant([9], tf.int32),\n",
" initial_cache=beam_cache,\n",
" vocab_size=3,\n",
" beam_size=beam_size,\n",
" alpha=0.6,\n",
" max_decode_length=params['max_decode_length'],\n",
" eos_id=10,\n",
" padded_decode=False,\n",
" dtype=tf.float32)\n",
"print(\"Beam search ids:\", ids)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "decoding_api_in_tf_nlp.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
-"nbformat": 4,
-"nbformat_minor": 0,
-"metadata": {
-"colab": {
-"name": "Introduction to the TensorFlow Models NLP library",
-"private_outputs": true,
-"provenance": [],
-"collapsed_sections": [],
-"toc_visible": true
-},
-"kernelspec": {
-"display_name": "Python 3",
-"name": "python3"
-}
-},
"cells": [
{
"cell_type": "markdown",
@@ -26,10 +11,12 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"cellView": "form",
"id": "8nvTnfs6Q692"
},
+"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
@@ -42,9 +29,7 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -61,20 +46,20 @@
"id": "cH-oJ8R6AHMK"
},
"source": [
-"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-" <td>\n",
-" <a target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-" </td>\n",
-" <td>\n",
-" <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-" </td>\n",
-" <td>\n",
-" <a target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-" </td>\n",
-" <td>\n",
-" <a href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-" </td>\n",
-"</table>"
+"\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+" \u003ctd\u003e\n",
+" \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+" \u003c/td\u003e\n",
+" \u003ctd\u003e\n",
+" \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+" \u003c/td\u003e\n",
+" \u003ctd\u003e\n",
+" \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+" \u003c/td\u003e\n",
+" \u003ctd\u003e\n",
+" \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+" \u003c/td\u003e\n",
+"\u003c/table\u003e"
]
},
{
@@ -112,14 +97,14 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "Y-qGkdh6_sZc"
},
+"outputs": [],
"source": [
"!pip install -q tf-models-official==2.4.0"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -132,18 +117,18 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "jqYXqtjBAJd9"
},
+"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"from official.nlp import modeling\n",
"from official.nlp.modeling import layers, losses, models, networks"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -173,18 +158,18 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "EXkcXz-9BwB3"
},
+"outputs": [],
"source": [
"# Build a small transformer network.\n",
"vocab_size = 100\n",
"sequence_length = 16\n",
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=16)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -199,28 +184,28 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "lZNoZkBrIoff"
},
+"outputs": [],
"source": [
"tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "o7eFOZXiIl-b"
},
+"outputs": [],
"source": [
"# Create a BERT pretrainer with the created network.\n",
"num_token_predictions = 8\n",
"bert_pretrainer = modeling.models.BertPretrainer(\n",
" network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -233,20 +218,22 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "2tcNfm03IBF7"
},
+"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "F2oHrXGUIS0M"
},
+"outputs": [],
"source": [
"# We can feed some dummy data to get masked language model and sentence output.\n",
"batch_size = 2\n",
@@ -261,9 +248,7 @@
"sentence_output = outputs[\"classification\"]\n",
"print(lm_output)\n",
"print(sentence_output)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -277,9 +262,11 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "k30H4Q86f52x"
},
+"outputs": [],
"source": [
"masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n",
"masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n",
@@ -294,9 +281,7 @@
" predictions=sentence_output)\n",
"loss = mlm_loss + sentence_loss\n",
"print(loss)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -337,18 +322,18 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "B941M4iUCejO"
},
+"outputs": [],
"source": [
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
"\n",
"# Create a BERT trainer with the created network.\n",
"bert_span_labeler = modeling.models.BertSpanLabeler(network)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -361,20 +346,22 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "RbqRNJCLJu4H"
},
+"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "fUf1vRxZJwio"
},
+"outputs": [],
"source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
@@ -385,9 +372,7 @@
"start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n",
"print(start_logits)\n",
"print(end_logits)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -401,9 +386,11 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "waqs6azNl3Nn"
},
+"outputs": [],
"source": [
"start_positions = np.random.randint(sequence_length, size=(batch_size))\n",
"end_positions = np.random.randint(sequence_length, size=(batch_size))\n",
@@ -415,9 +402,7 @@
"\n",
"total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n",
"print(total_loss)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -452,9 +437,11 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "cXXCsffkCphk"
},
+"outputs": [],
"source": [
"network = modeling.networks.BertEncoder(\n",
" vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n",
@@ -463,9 +450,7 @@
"num_classes = 2\n",
"bert_classifier = modeling.models.BertClassifier(\n",
" network, num_classes=num_classes)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -478,20 +463,22 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "snlutm9ZJgEZ"
},
+"outputs": [],
"source": [
"tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "yyHPHsqBJkCz"
},
+"outputs": [],
"source": [
"# Create a set of 2-dimensional data tensors to feed into the model.\n",
"word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n",
@@ -501,9 +488,7 @@
"# Feed the data to the model.\n",
"logits = bert_classifier([word_id_data, mask_data, type_id_data])\n",
"print(logits)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -518,18 +503,18 @@
},
{
"cell_type": "code",
+"execution_count": null,
"metadata": {
"id": "9X0S1DoFn_5Q"
},
+"outputs": [],
"source": [
"labels = np.random.randint(num_classes, size=(batch_size))\n",
"\n",
-"loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n",
-" labels=labels, predictions=tf.nn.log_softmax(logits, axis=-1))\n",
+"loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
+" labels, logits, from_logits=True)\n",
"print(loss)"
-],
-"execution_count": null,
-"outputs": []
+]
},
{
"cell_type": "markdown",
@@ -540,5 +525,20 @@
"With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
]
}
-]
+],
+"metadata": {
+"colab": {
+"collapsed_sections": [],
+"name": "Introduction to the TensorFlow Models NLP library",
+"private_outputs": true,
+"provenance": [],
+"toc_visible": true
+},
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 0
}
@@ -170,7 +170,8 @@ def get_distribution_strategy(distribution_strategy="mirrored",
cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))
if distribution_strategy == "parameter_server":
-return tf.compat.v1.distribute.experimental.ParameterServerStrategy()
+cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
+return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver)
raise ValueError("Unrecognized Distribution Strategy: %r" %
distribution_strategy)
@@ -181,6 +182,7 @@ def configure_cluster(worker_hosts=None, task_index=-1):
Args:
worker_hosts: comma-separated list of worker ip:port pairs.
+task_index: index of the worker.
Returns:
Number of workers in the cluster.
......
@@ -61,13 +61,21 @@ def define_flags():
'--> params in params_override. See also the help message of '
'`--config_file`.')
+# The libraries rely on gin often make mistakes that include flags inside
+# the library files which causes conflicts.
+try:
flags.DEFINE_multi_string(
'gin_file', default=None, help='List of paths to the config files.')
+except flags.DuplicateFlagError:
+pass
+try:
flags.DEFINE_multi_string(
'gin_params',
default=None,
help='Newline separated list of Gin parameter bindings.')
+except flags.DuplicateFlagError:
+pass
flags.DEFINE_string(
'tpu', default=None,
......
@@ -80,7 +80,8 @@ class Task(tf.Module, metaclass=abc.ABCMeta):
optimizer = performance.configure_optimizer(
optimizer,
use_float16=runtime_config.mixed_precision_dtype == "float16",
-loss_scale=runtime_config.loss_scale)
+loss_scale=runtime_config.loss_scale,
+use_experimental_api=True)
return optimizer
@@ -291,6 +292,8 @@ class Task(tf.Module, metaclass=abc.ABCMeta):
"""Optional aggregation over logs returned from a validation step."""
pass
-def reduce_aggregated_logs(self, aggregated_logs):
+def reduce_aggregated_logs(self,
+aggregated_logs,
+global_step: Optional[tf.Tensor] = None):
"""Optional reduce of aggregated logs over validation steps."""
return {}
@@ -334,7 +334,8 @@ class Trainer(orbit.StandardTrainer, orbit.StandardEvaluator):
# loss was not returned from the task's `validation_step` method.
logging.info("The task did not report validation loss.")
if aggregated_logs:
-metrics = self.task.reduce_aggregated_logs(aggregated_logs)
+metrics = self.task.reduce_aggregated_logs(
+aggregated_logs, global_step=self.global_step)
logs.update(metrics)
if self._checkpoint_exporter:
......
@@ -63,7 +63,6 @@ class DataConfig(base_config.Config):
The default behavior is that the dataset creates anonymous, exclusively
owned jobs.
tfds_data_dir: A str specifying the directory to read/write TFDS data.
-tfds_download: A bool to indicate whether to download data using TFDS.
tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the
returned tf.data.Dataset will have a 2-tuple structure (input, label)
according to builder.info.supervised_keys; if False, the default, the
@@ -89,7 +88,6 @@ class DataConfig(base_config.Config):
tf_data_service_address: Optional[str] = None
tf_data_service_job_name: Optional[str] = None
tfds_data_dir: str = ""
-tfds_download: bool = False
tfds_as_supervised: bool = False
tfds_skip_decoding_feature: str = ""
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for model export."""
import abc
import functools
from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union
import tensorflow as tf
from tensorflow.python.saved_model.model_utils import export_utils
class ExportModule(tf.Module, metaclass=abc.ABCMeta):
"""Base Export Module."""
def __init__(self,
params,
model: Union[tf.Module, tf.keras.Model],
inference_step: Optional[Callable[..., Any]] = None):
"""Instantiates an ExportModel.
Args:
params: A dataclass for parameters to the module.
model: A model instance which contains weights and forward computation.
inference_step: An optional callable to define how the model is called.
"""
super().__init__(name=None)
self.model = model
self.params = params
if inference_step is not None:
self.inference_step = functools.partial(inference_step, model=self.model)
else:
self.inference_step = functools.partial(
self.model.__call__, training=False)
@abc.abstractmethod
def serve(self) -> Mapping[Text, tf.Tensor]:
"""The bare inference function which should run on all devices.
Expects tensors to be passed in through keyword arguments. Returns a
dictionary of tensors whose keys will be used inside the SignatureDef.
"""
@abc.abstractmethod
def get_inference_signatures(
self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]:
"""Get defined function signatures."""
def export(export_module: ExportModule,
function_keys: Union[List[Text], Dict[Text, Text]],
export_savedmodel_dir: Text,
checkpoint_path: Optional[Text] = None,
timestamped: bool = True,
save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text:
"""Exports to SavedModel format.
Args:
export_module: a ExportModule with the keras Model and serving tf.functions.
function_keys: a list of string keys to retrieve pre-defined serving
signatures. The signature keys will be set with defaults. If a dictionary
is provided, the values will be used as signature keys.
export_savedmodel_dir: Output saved model directory.
checkpoint_path: Object-based checkpoint path or directory.
timestamped: Whether to export the savedmodel to a timestamped directory.
save_options: `SaveOptions` for `tf.saved_model.save`.
Returns:
The savedmodel directory path.
"""
ckpt_dir_or_file = checkpoint_path
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if ckpt_dir_or_file:
checkpoint = tf.train.Checkpoint(model=export_module.model)
checkpoint.read(
ckpt_dir_or_file).assert_existing_objects_matched().expect_partial()
if isinstance(function_keys, list):
if len(function_keys) == 1:
function_keys = {
function_keys[0]: tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY
}
else:
raise ValueError(
"If the function_keys is a list, it must contain a single element. %s"
% function_keys)
signatures = export_module.get_inference_signatures(function_keys)
if timestamped:
export_dir = export_utils.get_timestamped_export_dir(
export_savedmodel_dir).decode("utf-8")
else:
export_dir = export_savedmodel_dir
tf.saved_model.save(
export_module, export_dir, signatures=signatures, options=save_options)
return export_dir
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.core.export_base."""
import os
from typing import Any, Dict, Mapping, Text
import tensorflow as tf
from official.core import export_base
class TestModule(export_base.ExportModule):
@tf.function
def serve(self, inputs: tf.Tensor) -> Mapping[Text, tf.Tensor]:
return {'outputs': self.inference_step(inputs)}
def get_inference_signatures(
self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]:
input_signature = tf.TensorSpec(shape=[None, None], dtype=tf.float32)
return {'foo': self.serve.get_concrete_function(input_signature)}
class ExportBaseTest(tf.test.TestCase):
def test_export_module(self):
tmp_dir = self.get_temp_dir()
model = tf.keras.layers.Dense(2)
inputs = tf.ones([2, 4], tf.float32)
expected_output = model(inputs, training=False)
module = TestModule(params=None, model=model)
ckpt_path = tf.train.Checkpoint(model=model).save(
os.path.join(tmp_dir, 'ckpt'))
export_dir = export_base.export(
module, ['foo'],
export_savedmodel_dir=tmp_dir,
checkpoint_path=ckpt_path,
timestamped=True)
self.assertTrue(os.path.exists(os.path.join(export_dir, 'saved_model.pb')))
self.assertTrue(
os.path.exists(
os.path.join(export_dir, 'variables', 'variables.index')))
self.assertTrue(
os.path.exists(
os.path.join(export_dir, 'variables',
'variables.data-00000-of-00001')))
imported = tf.saved_model.load(export_dir)
output = imported.signatures['foo'](inputs)
self.assertAllClose(output['outputs'].numpy(), expected_output.numpy())
def test_custom_inference_step(self):
tmp_dir = self.get_temp_dir()
model = tf.keras.layers.Dense(2)
inputs = tf.ones([2, 4], tf.float32)
def _inference_step(inputs, model):
return tf.nn.softmax(model(inputs, training=False))
module = TestModule(
params=None, model=model, inference_step=_inference_step)
expected_output = _inference_step(inputs, model)
ckpt_path = tf.train.Checkpoint(model=model).save(
os.path.join(tmp_dir, 'ckpt'))
export_dir = export_base.export(
module, ['foo'],
export_savedmodel_dir=tmp_dir,
checkpoint_path=ckpt_path,
timestamped=False)
imported = tf.saved_model.load(export_dir)
output = imported.signatures['foo'](inputs)
self.assertAllClose(output['outputs'].numpy(), expected_output.numpy())
if __name__ == '__main__':
tf.test.main()
@@ -101,7 +101,6 @@ class InputReader:
self._deterministic = params.deterministic
self._sharding = params.sharding
self._tfds_split = params.tfds_split
-self._tfds_download = params.tfds_download
self._tfds_as_supervised = params.tfds_as_supervised
self._tfds_skip_decoding_feature = params.tfds_skip_decoding_feature
@@ -232,7 +231,7 @@ class InputReader:
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Reads a dataset from tfds."""
-if self._tfds_download:
+# No op if exist.
self._tfds_builder.download_and_prepare()
read_config = tfds.ReadConfig(
......
@@ -14,10 +14,6 @@
"""Tests for registry."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
import tensorflow as tf
from official.core import registry
......
@@ -161,7 +161,8 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
for metric in task_metrics + [task_loss]:
logs[metric.name] = metric.result()
if outputs:
-metrics = task.reduce_aggregated_logs(outputs)
+metrics = task.reduce_aggregated_logs(
+outputs, global_step=self.global_step)
logs.update(metrics)
results[name] = logs
......
@@ -89,7 +89,9 @@ class MockTask(base_task.Task):
np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value]))
return state
-def reduce_aggregated_logs(self, aggregated_logs):
+def reduce_aggregated_logs(self,
+aggregated_logs,
+global_step=None):
for k, v in aggregated_logs.items():
aggregated_logs[k] = np.sum(np.stack(v, axis=0))
return aggregated_logs
......
@@ -166,6 +166,29 @@ class PowerAndLinearDecayLrConfig(base_config.Config):
linear_decay_fraction: float = 0.1
+@dataclasses.dataclass
+class PowerDecayWithOffsetLrConfig(base_config.Config):
+"""Configuration for power learning rate decay with step offset.
+Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
+Otherwise, learning rate equals to lr * (step - offset)^power.
+Attributes:
+name: The name of the learning rate schedule.
+Defaults to PowerDecayWithOffset.
+initial_learning_rate: A float. The initial learning rate. Defaults to None.
+power: A float. Defaults to -0.5, for sqrt decay.
+offset: An integer. Power decay happens after `offset` steps.
+pre_offset_learning_rate: A float. The constant learning rate before
+`offset` steps.
+"""
+name: str = 'PowerDecayWithOffset'
+initial_learning_rate: Optional[float] = None
+power: float = -0.5
+offset: int = 0
+pre_offset_learning_rate: float = 1.0e6
@dataclasses.dataclass
class LinearWarmupConfig(base_config.Config):
"""Configuration for linear warmup schedule config.
......
@@ -62,6 +62,7 @@ class LrConfig(oneof.OneOfConfig):
power: step^power learning rate config.
power_linear: learning rate config of step^power followed by
step^power*linear.
+power_with_offset: power decay with a step offset.
"""
type: Optional[str] = None
constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig()
@@ -72,6 +73,8 @@ class LrConfig(oneof.OneOfConfig):
power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig()
power_linear: lr_cfg.PowerAndLinearDecayLrConfig = (
lr_cfg.PowerAndLinearDecayLrConfig())
+power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = (
+lr_cfg.PowerDecayWithOffsetLrConfig())
@dataclasses.dataclass
......
@@ -70,7 +70,7 @@ class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
-super(ExponentialMovingAverage, self).__init__(name, **kwargs)
+super().__init__(name, **kwargs)
self._average_decay = average_decay
self._start_step = tf.constant(start_step, tf.float32)
self._dynamic_decay = dynamic_decay
......
@@ -40,12 +40,11 @@ class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
Args:
after_warmup_lr_sched: tf.keras.optimizers.schedules
.LearningRateSchedule or a constant.
-warmup_steps: int. number of the warmup steps.
-warmup_learning_rate: floating point number. Initial learning rate for the
-warmup.
+warmup_steps: Number of the warmup steps.
+warmup_learning_rate: Initial learning rate for the warmup.
name: Optional, name of warmup schedule.
"""
-super(LinearWarmup, self).__init__()
+super().__init__()
self._name = name
self._after_warmup_lr_sched = after_warmup_lr_sched
self._warmup_steps = warmup_steps
@@ -102,7 +101,7 @@ class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
warmup_steps: int,
power: float = 1.0,
name: str = "PolynomialWarmup"):
-super(PolynomialWarmUp, self).__init__()
+super().__init__()
if isinstance(after_warmup_lr_sched,
tf.keras.optimizers.schedules.LearningRateSchedule):
self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
@@ -121,7 +120,14 @@ class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)
-warmup_percent_done = global_step_float / warmup_steps_float
+if self._warmup_steps <= 0:
+warmup_percent_done = 1.0
+else:
+# A zero `step` may cause Inf. So make `step` positive.
+step_non_zero = tf.math.maximum(global_step_float, 1.0)
+warmup_percent_done = step_non_zero / warmup_steps_float
warmup_learning_rate = (
self._initial_learning_rate *
tf.math.pow(warmup_percent_done, self._power))
@@ -164,11 +170,11 @@ class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Initialize configuration of the learning rate schedule.
Args:
-initial_learning_rate: A float, the initial learning rate.
-power: A float, the number of steps required for linear warmup.
+initial_learning_rate: The initial learning rate.
+power: The order of the polynomial.
name: Optional, name of warmup schedule.
"""
-super(DirectPowerDecay, self).__init__()
+super().__init__()
self._initial_learning_rate = initial_learning_rate
self._power = power
self._name = name
@@ -177,7 +183,9 @@ class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
with tf.name_scope(self._name or "DirectPowerDecay"):
step = tf.cast(step, tf.float32)
learning_rate = self._initial_learning_rate
-learning_rate *= tf.math.pow(step, self._power)
+# A zero `step` may cause Inf. So make `step` positive.
+step_non_zero = tf.math.maximum(step, 1.0)
+learning_rate *= tf.math.pow(step_non_zero, self._power)
return learning_rate
def get_config(self):
@@ -207,14 +215,14 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Initialize configuration of the learning rate schedule.
Args:
-initial_learning_rate: A float, the initial learning rate.
+initial_learning_rate: The initial learning rate.
total_decay_steps: The total number of steps for power + linear decay.
-power: A float, the number of steps required for linear warmup.
-linear_decay_fraction: A float, in the last `linear_decay_fraction` steps,
+power: The order of the polynomial.
+linear_decay_fraction: In the last `linear_decay_fraction` steps,
the learning rate will be multiplied by a linear decay.
name: Optional, name of warmup schedule.
"""
-super(PowerAndLinearDecay, self).__init__()
+super().__init__()
self._initial_learning_rate = initial_learning_rate
self._total_decay_steps = total_decay_steps
self._power = power
@@ -225,8 +233,10 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
with tf.name_scope(self._name or "PowerAndLinearDecay"):
step = tf.cast(step, tf.float32)
learning_rate = self._initial_learning_rate
-learning_rate *= tf.math.pow(step, self._power)
-if self._linear_decay_fraction > 0:
+# A zero `step` may cause Inf. So make `step` positive.
+step_non_zero = tf.math.maximum(step, 1.0)
+learning_rate *= tf.math.pow(step_non_zero, self._power)
+if self._total_decay_steps * self._linear_decay_fraction > 0:
learning_rate *= tf.minimum(
1.0, (self._total_decay_steps - step) /
(self._total_decay_steps * self._linear_decay_fraction))
@@ -242,3 +252,55 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
"linear_decay_fraction": self._linear_decay_fraction,
"name": self._name,
}
+class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
+"""Power learning rate decay with offset.
+Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
+Otherwise, learning rate equals to lr * (step - offset)^power.
+"""
+def __init__(self,
+initial_learning_rate: float,
+power: float = 1.0,
+offset: int = 0,
+pre_offset_learning_rate: float = 1.0e6,
+name: str = "PowerDecayWithOffset"):
+"""Initialize configuration of the learning rate schedule.
+Args:
+initial_learning_rate: The initial learning rate.
+power: The order of the polynomial.
+offset: The offset when computing the power decay.
+pre_offset_learning_rate: The maximum learning rate we'll use.
+name: Optional, name of warmup schedule.
+"""
+super().__init__()
+self._initial_learning_rate = initial_learning_rate
+self._power = power
+self._offset = offset
+self._pre_offset_lr = pre_offset_learning_rate
+self._name = name
+def __call__(self, step):
+with tf.name_scope(self._name or "PowerDecayWithOffset"):
+step = tf.cast(step, tf.float32)
+lr_after_offset = tf.math.pow(
+tf.math.maximum(step - self._offset, 1.0), self._power) * (
+self._initial_learning_rate)
+sign = tf.cast(step > self._offset, tf.float32)
+lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset
+# Power may give infinitely large LR. So cap it with pre_offset_lr.
+return tf.math.minimum(lr_combined, self._pre_offset_lr)
+def get_config(self):
+"""Get the configuration of the learning rate schedule."""
+return {
+"initial_learning_rate": self._initial_learning_rate,
+"power": self._power,
+"offset": self._offset,
+"pre_offset_learning_rate": self._pre_offset_lr,
+"name": self._name,
+}
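For illustration, a minimal usage sketch (not part of the commit) of how the new schedule behaves, assuming `PowerDecayWithOffset` is imported from the modified lr_schedule module; the expected values follow from the `__call__` body above:

schedule = PowerDecayWithOffset(
    initial_learning_rate=1.0, power=-0.5, offset=100,
    pre_offset_learning_rate=3.0)
print(schedule(50).numpy())   # 3.0, the constant pre-offset rate
print(schedule(500).numpy())  # 1.0 * (500 - 100) ** -0.5 = 0.05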
@@ -40,6 +40,7 @@ LR_CLS = {
'cosine': tf.keras.experimental.CosineDecay,
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
+'power_with_offset': lr_schedule.PowerDecayWithOffset,
}
WARMUP_CLS = {
@@ -48,7 +49,7 @@ WARMUP_CLS = {
}
-class OptimizerFactory(object):
+class OptimizerFactory:
"""Optimizer factory class.
This class builds learning rate and optimizer based on an optimization config.
......
@@ -313,7 +313,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
lr = opt_factory.build_learning_rate()
for step, value in expected_lr_step_values:
-self.assertAlmostEqual(lr(step).numpy(), value)
+self.assertAlmostEqual(lr(step).numpy(), value, places=6)
def test_power_lr_schedule(self):
params = {
@@ -331,7 +331,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
-expected_lr_step_values = [[1, 1.0], [250, 1. / 250.]]
+expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]]
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
@@ -357,7 +357,34 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
-expected_lr_step_values = [[1, 1.0], [40, 1. / 40.], [60, 1. / 60. * 0.8]]
+expected_lr_step_values = [
+[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60. * 0.8]]
+opt_config = optimization_config.OptimizationConfig(params)
+opt_factory = optimizer_factory.OptimizerFactory(opt_config)
+lr = opt_factory.build_learning_rate()
+for step, value in expected_lr_step_values:
+self.assertAlmostEqual(lr(step).numpy(), value)
+def test_power_with_offset_lr_schedule(self):
+params = {
+'optimizer': {
+'type': 'sgd',
+'sgd': {
+'momentum': 0.9
+}
+},
+'learning_rate': {
+'type': 'power_with_offset',
+'power_with_offset': {
+'initial_learning_rate': 1.0,
+'power': -1.0,
+'offset': 10,
+'pre_offset_learning_rate': 3.0,
+}
+}
+}
+expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. / 10.]]
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
......