Commit 356c98bd authored by Kaushik Shivakumar's avatar Kaushik Shivakumar
Browse files

Merge remote-tracking branch 'upstream/master' into detr-push-3

parents d31aba8a b9785623
......@@ -98,17 +98,24 @@ pip3 install tf-nightly
#### Method 1: Install the TensorFlow Model Garden pip package
**tf-models-nightly** is the nightly Model Garden package
created daily automatically. pip will install all models
and dependencies automatically.
**tf-models-official** is the stable Model Garden package.
pip will install all models and dependencies automatically.
```shell
pip install tf-models-nightly
pip install tf-models-official
```
Please check out our [example](colab/fine_tuning_bert.ipynb)
to learn how to use a PIP package.
Note that **tf-models-official** may not include the latest changes in this
github repo. To include latest changes, you may install **tf-models-nightly**,
which is the nightly Model Garden package created daily automatically.
```shell
pip install tf-models-nightly
```
#### Method 2: Clone the source
1. Clone the GitHub repository:
......
......@@ -98,7 +98,8 @@
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* pip will install all models and dependencies automatically."
]
},
......@@ -112,8 +113,7 @@
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
"!pip install -q tf-models-official==2.3.0"
]
},
{
......
......@@ -100,7 +100,8 @@
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* `pip` will install all models and dependencies automatically."
]
},
......@@ -114,8 +115,7 @@
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
"!pip install -q tf-models-official==2.3.0"
]
},
{
......
......@@ -98,7 +98,8 @@
"source": [
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"* `tf-models-nightly` is the nightly Model Garden package created daily automatically.\n",
"* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n",
"which is the nightly Model Garden package created daily automatically.\n",
"* `pip` will install all models and dependencies automatically."
]
},
......@@ -112,8 +113,7 @@
},
"outputs": [],
"source": [
"!pip install -q tf-nightly\n",
"!pip install -q tf-models-nightly"
"!pip install -q tf-models-official==2.3.0"
]
},
{
......@@ -478,7 +478,7 @@
"source": [
"### Build a BertClassifier model wrapping TransformerEncoder\n",
"\n",
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a simple token classification model containing a single classification head using the `TokenClassification` network."
"[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head."
]
},
{
......
......@@ -18,11 +18,11 @@ import abc
import functools
from typing import Any, Callable, Optional
from absl import logging
import six
import tensorflow as tf
from official.modeling.hyperparams import config_definitions as cfg
from official.utils import registry
@six.add_metaclass(abc.ABCMeta)
......@@ -67,7 +67,19 @@ class Task(tf.Module):
Args:
model: The keras.Model built or used by this task.
"""
pass
ckpt_dir_or_file = self.task_config.init_checkpoint
logging.info("Trying to load pretrained checkpoint from %s",
ckpt_dir_or_file)
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info("Finished loading pretrained checkpoint from %s",
ckpt_dir_or_file)
@abc.abstractmethod
def build_model(self) -> tf.keras.Model:
......@@ -282,49 +294,3 @@ class Task(tf.Module):
"""Optional reduce of aggregated logs over validation steps."""
return {}
_REGISTERED_TASK_CLS = {}
# TODO(b/158268740): Move these outside the base class file.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def register_task_cls(task_config_cls):
  """Decorates a Task factory so it can be looked up by its TaskConfig type.

  Typical registration looks like:

  ```
  @dataclasses.dataclass
  class MyTaskConfig(TaskConfig):
    # Add fields here.
    pass

  @register_task_cls(MyTaskConfig)
  class MyTask(Task):
    # Inherits def __init__(self, task_config).
    pass

  my_task_config = MyTaskConfig()
  my_task = get_task(my_task_config)  # Returns MyTask(my_task_config).
  ```

  Besides a class itself, any other callable that builds a Task from a
  TaskConfig may be decorated, as long as each config class is registered
  at most once.

  Args:
    task_config_cls: a subclass of TaskConfig (*not* an instance of
      TaskConfig). Each task_config_cls can only be used for a single
      registration.

  Returns:
    A callable for use as class decorator that registers the decorated class
    for creation from an instance of task_config_cls.
  """
  return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
# The user-visible get_task() is defined after classes have been registered.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def get_task_cls(task_config_cls):
  """Returns the Task class registered for `task_config_cls`."""
  return registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.core.base_task."""
import functools
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.utils.testing import mock_task
def all_strategy_combinations():
  """Returns the strategy/mode grid the tests are parameterized over."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode='eager')
class TaskKerasTest(tf.test.TestCase, parameterized.TestCase):
  """Tests driving a mock Task through the Keras compile/fit workflow."""

  @combinations.generate(all_strategy_combinations())
  def test_task_with_step_override(self, distribution):
    """Fits a model whose train/validation steps are supplied by the task."""
    with distribution.scope():
      task = mock_task.MockTask()
      model = task.build_model()
      model = task.compile_model(
          model,
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          metrics=task.build_metrics(),
          train_step=task.train_step,
          validation_step=task.validation_step)
    dataset = task.build_inputs(params=None)
    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', logs.history)
    self.assertIn('acc', logs.history)

    # Without specifying metrics through compile.
    # Metrics are instead bound into the step functions via functools.partial.
    with distribution.scope():
      train_metrics = task.build_metrics(training=True)
      val_metrics = task.build_metrics(training=False)
      model = task.build_model()
      model = task.compile_model(
          model,
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          train_step=functools.partial(task.train_step, metrics=train_metrics),
          validation_step=functools.partial(
              task.validation_step, metrics=val_metrics))
    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', logs.history)
    self.assertIn('acc', logs.history)

  def test_task_with_fit(self):
    """Compiles with an explicit Keras loss and runs fit/evaluate."""
    task = mock_task.MockTask()
    model = task.build_model()
    model = task.compile_model(
        model,
        optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=task.build_metrics())
    dataset = task.build_inputs(params=None)
    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', logs.history)
    self.assertIn('acc', logs.history)
    # evaluate() should report exactly the loss plus the single metric.
    self.assertLen(model.evaluate(dataset, steps=1), 2)

  def test_task_invalid_compile(self):
    """Passing both a Keras loss and a custom train_step must raise."""
    task = mock_task.MockTask()
    model = task.build_model()
    with self.assertRaises(ValueError):
      _ = task.compile_model(
          model,
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          loss=tf.keras.losses.CategoricalCrossentropy(),
          metrics=task.build_metrics(),
          train_step=task.train_step)
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Standard Trainer implementation.
The base trainer implements the Orbit `StandardTrainable` and
`StandardEvaluable` interfaces. Trainers inside this project should be
interchangeable and independent of model architectures and tasks.
"""
import gin
import orbit
import tensorflow as tf
from official.core import base_task
from official.modeling import optimization
from official.modeling import performance
from official.modeling.hyperparams import config_definitions
ExperimentConfig = config_definitions.ExperimentConfig
@gin.configurable
class Trainer(orbit.StandardTrainer, orbit.StandardEvaluator):
  """Implements the common trainer shared for TensorFlow models."""

  def __init__(self,
               config: ExperimentConfig,
               task: base_task.Task,
               train: bool = True,
               evaluate: bool = True,
               model=None,
               optimizer=None):
    """Initialize common trainer for TensorFlow models.

    Args:
      config: An `ExperimentConfig` instance specifying experiment config.
      task: A base_task.Task instance.
      train: bool, whether or not this trainer will be used for training.
        default to True.
      evaluate: bool, whether or not this trainer will be used for evaluation.
        default to True.
      model: tf.keras.Model instance. If provided, it will be used instead
        of building model using task.build_model(). Default to None.
      optimizer: tf.keras.optimizers.Optimizer instance. If provided, it will
        be used instead of the optimizer from config. Default to None.
    """
    # Gets the current distribution strategy. If not inside any strategy scope,
    # it gets a single-replica no-op strategy.
    self._strategy = tf.distribute.get_strategy()
    self._config = config
    self._task = task
    self._model = model or task.build_model()
    # Build the optimizer (and its learning-rate schedule) from the trainer
    # config unless the caller injected one.
    if optimizer is None:
      opt_factory = optimization.OptimizerFactory(
          config.trainer.optimizer_config)
      self._optimizer = opt_factory.build_optimizer(
          opt_factory.build_learning_rate())
    else:
      self._optimizer = optimizer
    # Configuring optimizer when loss_scale is set in runtime config. This helps
    # avoiding overflow/underflow for float16 computations.
    if config.runtime.loss_scale:
      self._optimizer = performance.configure_optimizer(
          self._optimizer,
          use_float16=config.runtime.mixed_precision_dtype == 'float16',
          loss_scale=config.runtime.loss_scale)
    # global_step increases by 1 after each training iteration.
    # We should have global_step.numpy() == self.optimizer.iterations.numpy()
    # when there is only 1 optimizer.
    self._global_step = orbit.utils.create_global_step()
    # Models may expose extra sub-objects to checkpoint (e.g. an encoder)
    # through a `checkpoint_items` mapping.
    if hasattr(self.model, 'checkpoint_items'):
      checkpoint_items = self.model.checkpoint_items
    else:
      checkpoint_items = {}
    self._checkpoint = tf.train.Checkpoint(
        global_step=self.global_step, model=self.model,
        optimizer=self.optimizer, **checkpoint_items)
    self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    self._validation_loss = tf.keras.metrics.Mean(
        'validation_loss', dtype=tf.float32)
    # Task-defined metrics are combined with any metrics the Keras model
    # itself tracks.
    self._train_metrics = self.task.build_metrics(
        training=True) + self.model.metrics
    self._validation_metrics = self.task.build_metrics(
        training=False) + self.model.metrics
    # Initialize the Orbit trainer/evaluator bases with distributed datasets
    # built from the task's input function.
    if train:
      train_dataset = orbit.utils.make_distributed_dataset(
          self.strategy, self.task.build_inputs, self.config.task.train_data)
      orbit.StandardTrainer.__init__(
          self,
          train_dataset,
          options=orbit.StandardTrainerOptions(
              use_tf_while_loop=config.trainer.train_tf_while_loop,
              use_tf_function=config.trainer.train_tf_function,
              use_tpu_summary_optimization=config.trainer.allow_tpu_summary))
    if evaluate:
      eval_dataset = orbit.utils.make_distributed_dataset(
          self.strategy, self.task.build_inputs,
          self.config.task.validation_data)
      orbit.StandardEvaluator.__init__(
          self,
          eval_dataset,
          options=orbit.StandardEvaluatorOptions(
              use_tf_function=config.trainer.eval_tf_function))

  @property
  def strategy(self):
    """The `tf.distribute` strategy captured at construction time."""
    return self._strategy

  @property
  def config(self):
    """The `ExperimentConfig` driving this trainer."""
    return self._config

  @property
  def task(self):
    """The `base_task.Task` being trained/evaluated."""
    return self._task

  @property
  def model(self):
    """The `tf.keras.Model` built by or passed to this trainer."""
    return self._model

  @property
  def optimizer(self):
    """The optimizer instance (possibly wrapped for loss scaling)."""
    return self._optimizer

  @property
  def global_step(self):
    """The global step variable, incremented once per train step."""
    return self._global_step

  @property
  def train_loss(self):
    """Accesses the training loss metric object."""
    return self._train_loss

  @property
  def validation_loss(self):
    """Accesses the validation loss metric object."""
    return self._validation_loss

  @property
  def train_metrics(self):
    """Accesses all training metric objects."""
    return self._train_metrics

  @property
  def validation_metrics(self):
    """Accesses all validation metric objects."""
    return self._validation_metrics

  def initialize(self):
    """A callback function.

    This function will be called when no checkpoint found for the model.
    If there is a checkpoint, the checkpoint will be loaded and this function
    will not be called. Tasks may use this callback function to load a
    pretrained checkpoint, saved under a directory other than the model_dir.
    """
    self.task.initialize(self.model)

  @property
  def checkpoint(self):
    """Accesses the training checkpoint."""
    return self._checkpoint

  def train_loop_end(self):
    """See base class."""
    logs = {}
    # Snapshot then reset every training metric so the next loop starts fresh.
    for metric in self.train_metrics + [self.train_loss]:
      logs[metric.name] = metric.result()
      metric.reset_states()
    # learning_rate may be a schedule (callable on the step) or a plain value.
    if callable(self.optimizer.learning_rate):
      logs['learning_rate'] = self.optimizer.learning_rate(self.global_step)
    else:
      logs['learning_rate'] = self.optimizer.learning_rate
    return logs

  def train_step(self, iterator):
    """See base class."""

    def step_fn(inputs):
      # Delegates the per-replica step to the task; the task reports its loss
      # under the key `self.task.loss`.
      logs = self.task.train_step(
          inputs,
          model=self.model,
          optimizer=self.optimizer,
          metrics=self.train_metrics)
      self._train_loss.update_state(logs[self.task.loss])
      self.global_step.assign_add(1)

    self.strategy.run(step_fn, args=(next(iterator),))

  def eval_begin(self):
    """Sets up metrics."""
    for metric in self.validation_metrics + [self.validation_loss]:
      metric.reset_states()

  def eval_step(self, iterator):
    """See base class."""

    def step_fn(inputs):
      logs = self.task.validation_step(
          inputs, model=self.model, metrics=self.validation_metrics)
      self._validation_loss.update_state(logs[self.task.loss])
      return logs

    distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),))
    # Unwrap per-replica values so callers see plain (local) results.
    return tf.nest.map_structure(self.strategy.experimental_local_results,
                                 distributed_outputs)

  def eval_end(self, aggregated_logs=None):
    """Processes evaluation results."""
    logs = {}
    for metric in self.validation_metrics + [self.validation_loss]:
      logs[metric.name] = metric.result()
    # Let the task reduce any logs accumulated via eval_reduce().
    if aggregated_logs:
      metrics = self.task.reduce_aggregated_logs(aggregated_logs)
      logs.update(metrics)
    return logs

  def eval_reduce(self, state=None, step_outputs=None):
    return self.task.aggregate_logs(state, step_outputs)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.core.trainers.trainer."""
# pylint: disable=g-direct-tensorflow-import
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import base_trainer as trainer_lib
from official.modeling.hyperparams import config_definitions as cfg
from official.utils.testing import mock_task
def all_strategy_combinations():
  """Returns the strategy/mode grid the trainer tests run under."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode='eager')
class TrainerTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for the common Trainer using a mock task."""

  def setUp(self):
    super().setUp()
    # Minimal experiment config: plain SGD with a constant learning rate.
    self._config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            optimizer_config=cfg.OptimizationConfig(
                {'optimizer': {
                    'type': 'sgd'
                },
                 'learning_rate': {
                     'type': 'constant'
                 }})))

  def create_test_trainer(self):
    # Builds a Trainer wired to the mock task and the shared test config.
    task = mock_task.MockTask()
    trainer = trainer_lib.Trainer(self._config, task)
    return trainer

  @combinations.generate(all_strategy_combinations())
  def test_trainer_train(self, distribution):
    """Runs a few train steps and checks the returned logs."""
    with distribution.scope():
      trainer = self.create_test_trainer()
      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertIn('training_loss', logs)
      self.assertIn('learning_rate', logs)

  @combinations.generate(all_strategy_combinations())
  def test_trainer_validate(self, distribution):
    """Runs evaluation and checks the loss/metric logs."""
    with distribution.scope():
      trainer = self.create_test_trainer()
      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertIn('validation_loss', logs)
      # Each replica contributes to the accumulated metric.
      self.assertEqual(logs['acc'], 5. * distribution.num_replicas_in_sync)

  @combinations.generate(
      combinations.combine(
          mixed_precision_dtype=['float32', 'bfloat16', 'float16'],
          loss_scale=[None, 'dynamic', 128, 256],
      ))
  def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
    """Checks loss-scale wrapping only happens for float16 with a loss scale."""
    config = cfg.ExperimentConfig(
        runtime=cfg.RuntimeConfig(
            mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale),
        trainer=cfg.TrainerConfig(
            optimizer_config=cfg.OptimizationConfig(
                {'optimizer': {
                    'type': 'sgd'
                },
                 'learning_rate': {
                     'type': 'constant'
                 }})))
    task = mock_task.MockTask()
    trainer = trainer_lib.Trainer(config, task)
    if mixed_precision_dtype != 'float16':
      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
    elif mixed_precision_dtype == 'float16' and loss_scale is None:
      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
    else:
      # float16 + a configured loss scale wraps the optimizer.
      self.assertIsInstance(
          trainer.optimizer,
          tf.keras.mixed_precision.experimental.LossScaleOptimizer)
    metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', metrics)
if __name__ == '__main__':
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,24 +13,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Experiment factory methods."""
"""Activation and weight binarizer implementations."""
from official.modeling.hyperparams import config_definitions as cfg
from official.utils import registry
import math
import numpy as np
import tensorflow as tf
_REGISTERED_CONFIGS = {}
def ConvertSignCodeToZeroOneCode(x):
  """Maps sign codes {-1, +1} onto binary codes {0, 1}."""
  return (x + 1.0) * 0.5
def register_config_factory(name):
  """Registers an ExperimentConfig factory method under `name`."""
  return registry.register(_REGISTERED_CONFIGS, name)
def ConvertZeroOneCodeToSignCode(x):
  """Maps binary codes {0, 1} onto sign codes {-1, +1}."""
  return x * 2.0 - 1.0
def get_exp_config_creater(exp_name: str):
  """Looks up the ExperimentConfig factory method registered for `exp_name`."""
  return registry.lookup(_REGISTERED_CONFIGS, exp_name)
def CheckZeroOneCode(x):
  """Returns a scalar bool tensor: True iff every entry of `x` is 0 or 1."""
  # x * (x - 1) vanishes exactly when x == 0 or x == 1.
  residual = x * (x - 1.0)
  return tf.reduce_all(tf.equal(residual, 0))
def get_exp_config(exp_name: str) -> cfg.ExperimentConfig:
  """Instantiates the ExperimentConfig registered under `exp_name`."""
  config_factory = get_exp_config_creater(exp_name)
  return config_factory()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A global factory to register and access all registered tasks."""
from official.utils import registry
_REGISTERED_TASK_CLS = {}
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def register_task_cls(task_config_cls):
  """Decorates a Task factory so it can be looked up by its TaskConfig type.

  Typical registration looks like:

  ```
  @dataclasses.dataclass
  class MyTaskConfig(TaskConfig):
    # Add fields here.
    pass

  @register_task_cls(MyTaskConfig)
  class MyTask(Task):
    # Inherits def __init__(self, task_config).
    pass

  my_task_config = MyTaskConfig()
  my_task = get_task(my_task_config)  # Returns MyTask(my_task_config).
  ```

  Besides a class itself, any other callable that builds a Task from a
  TaskConfig may be decorated, as long as each config class is registered
  at most once.

  Args:
    task_config_cls: a subclass of TaskConfig (*not* an instance of
      TaskConfig). Each task_config_cls can only be used for a single
      registration.

  Returns:
    A callable for use as class decorator that registers the decorated class
    for creation from an instance of task_config_cls.
  """
  return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
def get_task(task_config, **kwargs):
  """Creates a Task (of the registered subclass type) from `task_config`."""
  task_cls = get_task_cls(task_config.__class__)
  return task_cls(task_config, **kwargs)
# The user-visible get_task() is defined after classes have been registered.
# TODO(b/158741360): Add type annotations once pytype checks across modules.
def get_task_cls(task_config_cls):
  """Returns the Task class registered for `task_config_cls`."""
  return registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
......@@ -14,12 +14,6 @@
# ==============================================================================
"""Gaussian error linear unit."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
......@@ -35,6 +29,4 @@ def gelu(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
return tf.keras.activations.gelu(x, approximate=True)
......@@ -21,7 +21,6 @@ import dataclasses
from official.modeling.hyperparams import base_config
from official.modeling.optimization.configs import optimization_config
from official.utils import registry
OptimizationConfig = optimization_config.OptimizationConfig
......@@ -179,6 +178,7 @@ class TrainerConfig(base_config.Config):
max_to_keep: max checkpoints to keep.
continuous_eval_timeout: maximum number of seconds to wait between
checkpoints, if set to None, continuous eval will wait indefinitely.
This is only used in the continuous_train_and_eval and continuous_eval modes.
train_steps: number of train steps.
validation_steps: number of eval steps. If `None`, the entire eval dataset
is used.
......@@ -205,6 +205,7 @@ class TrainerConfig(base_config.Config):
@dataclasses.dataclass
class TaskConfig(base_config.Config):
init_checkpoint: str = ""
model: base_config.Config = None
train_data: DataConfig = DataConfig()
validation_data: DataConfig = DataConfig()
......@@ -217,16 +218,3 @@ class ExperimentConfig(base_config.Config):
trainer: TrainerConfig = TrainerConfig()
runtime: RuntimeConfig = RuntimeConfig()
_REGISTERED_CONFIGS = {}
def register_config_factory(name):
"""Register ExperimentConfig factory method."""
return registry.register(_REGISTERED_CONFIGS, name)
def get_exp_config_creater(exp_name: str):
"""Looks up ExperimentConfig factory methods."""
exp_creater = registry.lookup(_REGISTERED_CONFIGS, exp_name)
return exp_creater
......@@ -106,6 +106,7 @@ class AdamWeightDecayConfig(base_config.Config):
weight_decay_rate: float = 0.0
include_in_weight_decay: Optional[List[str]] = None
exclude_from_weight_decay: Optional[List[str]] = None
gradient_clip_norm: float = 1.0
@dataclasses.dataclass
......
......@@ -63,8 +63,8 @@ def metrics_as_dict(metric):
"""Puts input metric(s) into a list.
Args:
metric: metric(s) to be put into the list. `metric` could be a object, a
list or a dict of tf.keras.metrics.Metric or has the `required_method`.
metric: metric(s) to be put into the list. `metric` could be an object, a
list, or a dict of tf.keras.metrics.Metric or has the `required_method`.
Returns:
A dictionary of valid metrics.
......@@ -351,7 +351,8 @@ class DistributedExecutor(object):
train_input_fn: (params: dict) -> tf.data.Dataset training data input
function.
eval_input_fn: (Optional) same type as train_input_fn. If not None, will
trigger evaluting metric on eval data. If None, will not run eval step.
trigger evaluating metric on eval data. If None, will not run the eval
step.
model_dir: the folder path for model checkpoints.
total_steps: total training steps.
iterations_per_loop: train steps per loop. After each loop, this job will
......@@ -672,7 +673,7 @@ class DistributedExecutor(object):
raise ValueError('if `eval_metric_fn` is specified, '
'eval_metric_fn must be a callable.')
old_phrase = tf.keras.backend.learning_phase()
old_phase = tf.keras.backend.learning_phase()
tf.keras.backend.set_learning_phase(0)
params = self._params
strategy = self._strategy
......@@ -698,7 +699,8 @@ class DistributedExecutor(object):
logging.info(
'Checkpoint file %s found and restoring from '
'checkpoint', checkpoint_path)
checkpoint.restore(checkpoint_path)
status = checkpoint.restore(checkpoint_path)
status.expect_partial().assert_existing_objects_matched()
self.global_train_step = model.optimizer.iterations
eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
......@@ -709,7 +711,7 @@ class DistributedExecutor(object):
summary_writer(metrics=eval_metric_result, step=current_step)
reset_states(eval_metric)
tf.keras.backend.set_learning_phase(old_phrase)
tf.keras.backend.set_learning_phase(old_phase)
return eval_metric_result, current_step
def predict(self):
......@@ -759,7 +761,7 @@ class ExecutorBuilder(object):
Args:
strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'.
If None. User is responsible to set the strategy before calling
If None, the user is responsible to set the strategy before calling
build_executor(...).
strategy_config: necessary config for constructing the proper Strategy.
Check strategy_flags_dict() for examples of the structure.
......
......@@ -86,7 +86,7 @@ def _create_albert_model(cfg):
activation=activations.gelu,
dropout_rate=cfg.hidden_dropout_prob,
attention_dropout_rate=cfg.attention_probs_dropout_prob,
sequence_length=cfg.max_position_embeddings,
max_sequence_length=cfg.max_position_embeddings,
type_vocab_size=cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=cfg.initializer_range))
......
......@@ -46,6 +46,8 @@ The new checkpoints are:**
12-layer, 768-hidden, 12-heads , 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPU/TPU.
......
......@@ -104,14 +104,14 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
@gin.configurable
def get_transformer_encoder(bert_config,
sequence_length,
sequence_length=None,
transformer_encoder_cls=None,
output_range=None):
"""Gets a 'TransformerEncoder' object.
Args:
bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
sequence_length: Maximum sequence length of the training data.
sequence_length: [Deprecated].
transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the
default BERT encoder implementation.
output_range: the sequence output range, [0, output_range). Default setting
......@@ -120,13 +120,13 @@ def get_transformer_encoder(bert_config,
Returns:
A networks.TransformerEncoder object.
"""
del sequence_length
if transformer_encoder_cls is not None:
# TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
embedding_cfg = dict(
vocab_size=bert_config.vocab_size,
type_vocab_size=bert_config.type_vocab_size,
hidden_size=bert_config.hidden_size,
seq_length=sequence_length,
max_seq_length=bert_config.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
......@@ -161,7 +161,6 @@ def get_transformer_encoder(bert_config,
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
sequence_length=sequence_length,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
embedding_width=bert_config.embedding_size,
......
......@@ -56,8 +56,6 @@ class BertModelsTest(tf.test.TestCase):
# Expect two output from encoder: sequence and classification output.
self.assertIsInstance(encoder.output, list)
self.assertLen(encoder.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(encoder.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(encoder.output[1].shape.as_list(), [None, 16])
......@@ -74,16 +72,12 @@ class BertModelsTest(tf.test.TestCase):
# Expect two output from model: start positions and end positions
self.assertIsInstance(model.output, list)
self.assertLen(model.output, 2)
# shape should be [batch size, seq_length]
self.assertEqual(model.output[0].shape.as_list(), [None, 5])
# shape should be [batch size, seq_length]
self.assertEqual(model.output[1].shape.as_list(), [None, 5])
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, None, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
......@@ -104,8 +98,8 @@ class BertModelsTest(tf.test.TestCase):
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, 1, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 1, 16])
# shape should be [batch size, None, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
......
......@@ -61,7 +61,7 @@ def _create_bert_model(cfg):
activation=activations.gelu,
dropout_rate=cfg.hidden_dropout_prob,
attention_dropout_rate=cfg.attention_probs_dropout_prob,
sequence_length=cfg.max_position_embeddings,
max_sequence_length=cfg.max_position_embeddings,
type_vocab_size=cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=cfg.initializer_range),
......@@ -73,6 +73,7 @@ def _create_bert_model(cfg):
def convert_checkpoint(bert_config, output_path, v1_checkpoint):
"""Converts a V1 checkpoint into an OO V2 checkpoint."""
output_dir, _ = os.path.split(output_path)
tf.io.gfile.makedirs(output_dir)
# Create a temporary V1 name-converted checkpoint in the output directory.
temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
......
......@@ -20,13 +20,9 @@ Includes configurations and instantiation methods.
from typing import List, Optional, Text
import dataclasses
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling.models import bert_pretrainer
@dataclasses.dataclass
......@@ -40,32 +36,9 @@ class ClsHeadConfig(base_config.Config):
@dataclasses.dataclass
class BertPretrainerConfig(base_config.Config):
"""BERT encoder configuration."""
encoder: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
class PretrainerConfig(base_config.Config):
"""Pretrainer configuration."""
encoder: encoders.EncoderConfig = encoders.EncoderConfig()
cls_heads: List[ClsHeadConfig] = dataclasses.field(default_factory=list)
def instantiate_classification_heads_from_cfgs(
cls_head_configs: List[ClsHeadConfig]) -> List[layers.ClassificationHead]:
return [
layers.ClassificationHead(**cfg.as_dict()) for cfg in cls_head_configs
] if cls_head_configs else []
def instantiate_pretrainer_from_cfg(
config: BertPretrainerConfig,
encoder_network: Optional[tf.keras.Model] = None
) -> bert_pretrainer.BertPretrainerV2:
"""Instantiates a BertPretrainer from the config."""
encoder_cfg = config.encoder
if encoder_network is None:
encoder_network = encoders.instantiate_encoder_from_cfg(encoder_cfg)
return bert_pretrainer.BertPretrainerV2(
mlm_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
mlm_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
encoder_network=encoder_network,
classification_heads=instantiate_classification_heads_from_cfgs(
config.cls_heads))
mlm_activation: str = "gelu"
mlm_initializer_range: float = 0.02
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment