ModelZoo / ResNet50_tensorflow / Commits

Commit 78c43ef1, authored Jul 26, 2021 by Gunho Park

    Merge branch 'master' of https://github.com/tensorflow/models

Parents: 67cfc95b, e3c7e300

Changes: 227. Showing 20 changed files with 707 additions and 411 deletions (+707, -411).
official/modeling/optimization/adafactor_optimizer.py               +20    -0
official/modeling/optimization/configs/learning_rate_config.py       +8    -0
official/modeling/optimization/configs/optimization_config.py        +1    -0
official/modeling/optimization/configs/optimizer_config.py          +19    -0
official/modeling/optimization/lr_schedule.py                       +69    -0
official/modeling/optimization/lr_schedule_test.py                  +35    -0
official/modeling/optimization/optimizer_factory.py                  +7    -5
official/modeling/performance.py                                    +11   -41
official/modeling/tf_utils.py                                        +1    -0
official/nlp/configs/encoders.py                                    +12    -1
official/nlp/continuous_finetune_lib.py                              +4    -2
official/nlp/data/classifier_data_lib.py                           +345  -327
official/nlp/data/classifier_data_lib_test.py                       +95    -0
official/nlp/data/create_finetuning_data.py                         +23    -3
official/nlp/data/sentence_prediction_dataloader.py                 +28   -19
official/nlp/data/sentence_prediction_dataloader_test.py            +18    -9
official/nlp/keras_nlp/encoders/bert_encoder.py                      +6    -0
official/nlp/keras_nlp/encoders/bert_encoder_test.py                 +2    -1
official/nlp/keras_nlp/layers/position_embedding_test.py             +2    -2
official/nlp/keras_nlp/layers/transformer_encoder_block.py           +1    -1
official/modeling/optimization/adafactor_optimizer.py  (new file, mode 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name, represents an unimplemented class definition.
Adafactor
=
"Unimplemented"
official/modeling/optimization/configs/learning_rate_config.py

...
@@ -56,10 +56,12 @@ class StepwiseLrConfig(base_config.Config):
      values[0] [boundaries[0], boundaries[1]] -> values[1]
      [boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
      end] -> values[n+1] Defaults to None.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PiecewiseConstantDecay'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None
  offset: int = 0


@dataclasses.dataclass
...
@@ -76,12 +78,14 @@ class ExponentialLrConfig(base_config.Config):
    decay_rate: A float. Defaults to None.
    staircase: A boolean, if true, learning rate is decreased at discrete
      intervals. Defaults to False.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'ExponentialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  decay_rate: Optional[float] = None
  staircase: Optional[bool] = None
  offset: int = 0


@dataclasses.dataclass
...
@@ -99,6 +103,7 @@ class PolynomialLrConfig(base_config.Config):
    power: A float. The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
      Defaults to False.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PolynomialDecay'
  initial_learning_rate: Optional[float] = None
...
@@ -106,6 +111,7 @@ class PolynomialLrConfig(base_config.Config):
  end_learning_rate: float = 0.0001
  power: float = 1.0
  cycle: bool = False
  offset: int = 0


@dataclasses.dataclass
...
@@ -122,11 +128,13 @@ class CosineLrConfig(base_config.Config):
      to None.
    alpha: A float. Minimum learning rate value as a fraction of
      initial_learning_rate.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'CosineDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  alpha: float = 0.0
  offset: int = 0


@dataclasses.dataclass
...
official/modeling/optimization/configs/optimization_config.py

...
@@ -52,6 +52,7 @@ class OptimizerConfig(oneof.OneOfConfig):
  lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
  adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
  slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
  adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()


@dataclasses.dataclass
...
official/modeling/optimization/configs/optimizer_config.py

...
@@ -247,3 +247,22 @@ class SLIDEConfig(BaseOptimizerConfig):
  do_gradient_rescaling: bool = True
  norm_type: str = "layer"
  ratio_clip_norm: float = 1e5


@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
  """Configuration for Adafactor optimizer.

  The attributes for this class match the arguments of the Adafactor
  implementation.
  """
  name: str = "Adafactor"
  factored: bool = True
  multiply_by_parameter_scale: bool = True
  beta1: Optional[float] = None
  decay_rate: float = 0.8
  step_offset: int = 0
  clipping_threshold: float = 1.0
  min_dim_size_to_factor: int = 128
  epsilon1: float = 1e-30
  epsilon2: float = 1e-3
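
For orientation only (not part of the diff), a minimal sketch of how the new 'adafactor' oneof entry and the new 'offset' learning-rate fields could be selected through the config system; the nested dict layout and field names other than those shown in the hunks above are assumptions:

from official.modeling.optimization.configs import optimization_config as opt_config

# Hypothetical experiment fragment: pick the new optimizer and an offset cosine schedule.
config = opt_config.OptimizationConfig({
    'optimizer': {
        'type': 'adafactor',  # new oneof entry added above
        'adafactor': {'decay_rate': 0.8, 'step_offset': 0},
    },
    'learning_rate': {
        'type': 'cosine',
        'cosine': {'initial_learning_rate': 0.1,
                   'decay_steps': 1000,
                   'offset': 100},  # new field from learning_rate_config.py
    },
})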
official/modeling/optimization/lr_schedule.py

...
@@ -19,6 +19,75 @@ from typing import Mapping, Any, Union, Optional
import tensorflow as tf


def _make_offset_wrapper(new_class_name: str, base_lr_class):
  """Generates an offset wrapper of a learning rate schedule.

  It returns a subclass of `base_lr_class` that takes an `offset` argument in
  the constructor. When the new class instance is called, the behavior is:
    new_class_object(step) = base_lr_class_object(step - offset)

  Example:
    CosineDecayWithOffset = _make_offset_wrapper(
        'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
    # Use the lr:
    lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
                               decay_steps=1000)
    lr(101)  # equals to tf.keras.experimental.CosineDecay(...)(101 - 100)

  Args:
    new_class_name: the name of the new class.
    base_lr_class: the base learning rate schedule class. Should be a subclass
      of tf.keras.optimizers.schedules.LearningRateSchedule.

  Returns:
    A new class (subclass of the base_lr_class) that can take an offset.
  """
  assert issubclass(base_lr_class,
                    tf.keras.optimizers.schedules.LearningRateSchedule), (
                        "base_lr_class should be subclass of keras "
                        f"LearningRateSchedule, got {base_lr_class}")

  # pylint: disable=protected-access,pointless-statement
  def offset_learning_rate_init(self, offset=0, **kwargs):
    """Constructs the learning rate schedule object.

    When this object is called, its behavior is
      self.__call__(step) == base_lr_class.__call__(step - offset)

    Args:
      self: this object.
      offset: The offset when computing the learning rate schedule.
      **kwargs: Pass through to the base learning rate class constructor.
    """
    base_lr_class.__init__(self, **kwargs)
    self._offset = offset

  def offset_learning_rate_call(self, step):
    step = tf.cast(step - self._offset, tf.float32)
    return base_lr_class.__call__(self, step)
  # pylint: enable=protected-access,pointless-statement

  return type(
      new_class_name, (base_lr_class,), {
          "base_lr_class": base_lr_class,
          "__init__": offset_learning_rate_init,
          "__call__": offset_learning_rate_call
      })


PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
    "PiecewiseConstantDecayWithOffset",
    tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
    "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
    "ExponentialDecayWithOffset",
    tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper(
    "CosineDecayWithOffset", tf.keras.experimental.CosineDecay)


class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""
...
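
As an aside (illustrative, not part of the diff), one practical use of the offset is to delay a decay schedule until after a warmup period, so the decay starts counting from the end of warmup rather than from step 0:

from official.modeling.optimization import lr_schedule

warmup_steps = 100
# Decay begins at step 100, i.e. immediately after a 100-step warmup.
decay = lr_schedule.CosineDecayWithOffset(
    offset=warmup_steps, initial_learning_rate=0.1, decay_steps=1000)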
official/modeling/optimization/lr_schedule_test.py

...
@@ -70,5 +70,40 @@ class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):
      self.assertAlmostEqual(lr(step).numpy(), value)


class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
      dict(class_name=lr_schedule.PolynomialDecayWithOffset),
      dict(class_name=lr_schedule.ExponentialDecayWithOffset),
      dict(class_name=lr_schedule.CosineDecayWithOffset),
  )
  def test_generated_docstring(self, class_name):
    self.assertNotEmpty(class_name.__init__.__doc__)

  @parameterized.parameters(
      dict(
          class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
          kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
      dict(
          class_name=lr_schedule.PolynomialDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
      dict(
          class_name=lr_schedule.ExponentialDecayWithOffset,
          kwarg=dict(
              initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
      dict(
          class_name=lr_schedule.CosineDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
  )
  def test_offset(self, class_name, kwarg):
    offset = 10
    offset_lr = class_name(offset=offset, **kwarg)
    base_lr = class_name.base_lr_class(**kwarg)
    self.assertIsInstance(offset_lr, class_name)
    for step in range(10, 101, 10):
      self.assertEqual(offset_lr(step), base_lr(step - offset))


if __name__ == '__main__':
  tf.test.main()
official/modeling/optimization/optimizer_factory.py

...
@@ -20,6 +20,7 @@ import tensorflow as tf
 import tensorflow_addons.optimizers as tfa_optimizers
 from official.modeling.optimization import slide_optimizer
+from official.modeling.optimization import adafactor_optimizer
 from official.modeling.optimization import ema_optimizer
 from official.modeling.optimization import lars_optimizer
 from official.modeling.optimization import lr_schedule
...
@@ -34,14 +35,15 @@ OPTIMIZERS_CLS = {
     'rmsprop': tf.keras.optimizers.RMSprop,
     'lars': lars_optimizer.LARS,
     'adagrad': tf.keras.optimizers.Adagrad,
-    'slide': slide_optimizer.SLIDE
+    'slide': slide_optimizer.SLIDE,
+    'adafactor': adafactor_optimizer.Adafactor,
 }

 LR_CLS = {
-    'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
-    'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
-    'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
-    'cosine': tf.keras.experimental.CosineDecay,
+    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
+    'polynomial': lr_schedule.PolynomialDecayWithOffset,
+    'exponential': lr_schedule.ExponentialDecayWithOffset,
+    'cosine': lr_schedule.CosineDecayWithOffset,
     'power': lr_schedule.DirectPowerDecay,
     'power_linear': lr_schedule.PowerAndLinearDecay,
     'power_with_offset': lr_schedule.PowerDecayWithOffset,
...
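
For context, a rough sketch of how these registries are consumed (the OptimizerFactory method names are assumptions based on the surrounding library, not part of this diff):

from official.modeling.optimization import optimizer_factory

# opt_config: an OptimizationConfig whose learning_rate oneof is e.g. 'cosine'
# with the new `offset` field (see learning_rate_config.py above).
factory = optimizer_factory.OptimizerFactory(opt_config)
lr = factory.build_learning_rate()       # resolves to lr_schedule.CosineDecayWithOffset
optimizer = factory.build_optimizer(lr)  # builds the configured optimizer around lr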
official/modeling/performance.py

...
@@ -14,29 +14,16 @@
 """Functions and classes related to training performance."""

-from absl import logging
 import tensorflow as tf


 def configure_optimizer(optimizer,
                         use_float16=False,
                         use_graph_rewrite=False,
-                        loss_scale='dynamic',
-                        use_experimental_api=False):
+                        loss_scale=None):
   """Configures optimizer object with performance options."""
-  if use_experimental_api:
-    logging.warning('Passing use_experimental_api=True is deprecated. The '
-                    'argument will be removed in the future.')
   if use_float16:
-    # TODO(b/171936854): Move all methods to non-experimental api.
-    if use_experimental_api:
-      # Wraps optimizer with a LossScaleOptimizer. This is done automatically
-      # in compile() with the "mixed_float16" policy, but since we do not call
-      # compile(), we must wrap the optimizer manually.
-      optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              optimizer, loss_scale=loss_scale))
-    elif loss_scale == 'dynamic':
+    if loss_scale in (None, 'dynamic'):
       optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
     else:
       # loss_scale is a number. We interpret that as a fixed loss scale.
...
@@ -52,34 +39,17 @@ def configure_optimizer(optimizer,
   return optimizer


-def set_mixed_precision_policy(dtype, loss_scale=None,
-                               use_experimental_api=False):
-  """Sets mix precision policy."""
-  if use_experimental_api:
-    logging.warning('Passing use_experimental_api=True is deprecated. The '
-                    'argument will be removed in the future.')
-  assert use_experimental_api or loss_scale is None, (
-      'loss_scale cannot be specified if use_experimental_api is False. If the '
-      'non-experimental API is used, specify the loss scaling configuration '
-      'when creating the LossScaleOptimizer instead.')
+def set_mixed_precision_policy(dtype, loss_scale=None):
+  """Sets the global `tf.keras.mixed_precision.Policy`."""
+  # TODO(b/191894773): Remove loss_scale argument
+  assert loss_scale is None, (
+      'The loss_scale argument must be None. The argument exists for '
+      'historical reasons and will be removed soon.')
   if dtype == tf.float16:
-    # TODO(b/171936854): Move all methods to non-experimental api.
-    if use_experimental_api:
-      policy = tf.keras.mixed_precision.experimental.Policy(
-          'mixed_float16', loss_scale=loss_scale)
-      tf.keras.mixed_precision.experimental.set_policy(policy)
-    else:
-      tf.keras.mixed_precision.set_global_policy('mixed_float16')
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
   elif dtype == tf.bfloat16:
-    if use_experimental_api:
-      tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
-    else:
-      tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
+    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
   elif dtype == tf.float32:
-    if use_experimental_api:
-      tf.keras.mixed_precision.experimental.set_policy('float32')
-    else:
-      tf.keras.mixed_precision.set_global_policy('float32')
+    tf.keras.mixed_precision.set_global_policy('float32')
   else:
     raise ValueError('Unexpected dtype: %s' % dtype)
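
A short usage sketch of the simplified API after this change (illustrative only; the SGD optimizer is a placeholder):

import tensorflow as tf
from official.modeling import performance

# Set the global mixed-precision policy, then wrap the optimizer for float16.
performance.set_mixed_precision_policy(tf.float16)
optimizer = performance.configure_optimizer(
    tf.keras.optimizers.SGD(learning_rate=0.1),
    use_float16=True)  # wraps with tf.keras.mixed_precision.LossScaleOptimizer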
official/modeling/tf_utils.py

...
@@ -108,6 +108,7 @@ def get_activation(identifier, use_keras_layer=False):
      "linear": "linear",
      "identity": "linear",
      "swish": "swish",
      "sigmoid": "sigmoid",
      "relu6": tf.nn.relu6,
  }
  if identifier in keras_layer_allowlist:
...
official/nlp/configs/encoders.py

...
@@ -46,6 +46,8 @@ class BertEncoderConfig(hyperparams.Config):
  embedding_size: Optional[int] = None
  output_range: Optional[int] = None
  return_all_encoder_outputs: bool = False
  # Pre/Post-LN Transformer
  norm_first: bool = False


@dataclasses.dataclass
...
@@ -132,6 +134,8 @@ class BigBirdEncoderConfig(hyperparams.Config):
  intermediate_size: int = 3072
  dropout_rate: float = 0.1
  attention_dropout_rate: float = 0.1
  # Pre/Post-LN Transformer
  norm_first: bool = False
  max_position_embeddings: int = 4096
  num_rand_blocks: int = 3
  block_size: int = 64
...
@@ -152,6 +156,8 @@ class KernelEncoderConfig(hyperparams.Config):
  intermediate_size: int = 3072
  dropout_rate: float = 0.1
  attention_dropout_rate: float = 0.1
  # Pre/Post-LN Transformer
  norm_first: bool = False
  max_position_embeddings: int = 512
  type_vocab_size: int = 2
  initializer_range: float = 0.02
...
@@ -161,6 +167,7 @@ class KernelEncoderConfig(hyperparams.Config):
  redraw: bool = False
  is_short_seq: bool = False
  begin_kernel: int = 0
  scale: Optional[float] = None


@dataclasses.dataclass
...
@@ -339,6 +346,7 @@ def build_encoder(config: EncoderConfig,
          encoder_cfg.hidden_activation),
      dropout_rate=encoder_cfg.dropout_rate,
      attention_dropout_rate=encoder_cfg.attention_dropout_rate,
      norm_first=encoder_cfg.norm_first,
      kernel_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg.initializer_range),
      attention_cls=layers.BigBirdAttention,
...
@@ -377,6 +385,7 @@ def build_encoder(config: EncoderConfig,
      redraw=encoder_cfg.redraw,
      is_short_seq=encoder_cfg.is_short_seq,
      begin_kernel=encoder_cfg.begin_kernel,
      scale=encoder_cfg.scale,
  )
  hidden_cfg = dict(
      num_attention_heads=encoder_cfg.num_attention_heads,
...
@@ -385,6 +394,7 @@ def build_encoder(config: EncoderConfig,
          encoder_cfg.hidden_activation),
      dropout_rate=encoder_cfg.dropout_rate,
      attention_dropout_rate=encoder_cfg.attention_dropout_rate,
      norm_first=encoder_cfg.norm_first,
      kernel_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg.initializer_range),
      attention_cls=layers.KernelAttention,
...
@@ -445,4 +455,5 @@ def build_encoder(config: EncoderConfig,
      embedding_width=encoder_cfg.embedding_size,
      embedding_layer=embedding_layer,
      return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
-      dict_outputs=True)
+      dict_outputs=True,
+      norm_first=encoder_cfg.norm_first)
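
A hedged sketch of switching on the new pre-LN behavior from the config side (the EncoderConfig oneof usage and num_layers value are assumed from the surrounding library; only norm_first itself comes from this diff):

from official.nlp.configs import encoders

# Pre-LN BERT encoder: normalize the inputs to attention and intermediate layers.
cfg = encoders.EncoderConfig(
    type='bert',
    bert=encoders.BertEncoderConfig(num_layers=2, norm_first=True))
encoder = encoders.build_encoder(cfg)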
official/nlp/continuous_finetune_lib.py

...
@@ -28,7 +28,6 @@ from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.modeling.multitask import configs
from official.modeling.multitask import multitask
from official.modeling.multitask import train_lib as multitask_train_lib
...
@@ -167,7 +166,10 @@ def run_continuous_finetune(
    with distribution_strategy.scope():
      if isinstance(params, configs.MultiEvalExperimentConfig):
        task = task_factory.get_task(params_replaced.task)
-        eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks)
+        eval_tasks = [
+            task_factory.get_task(config.task_config, name=config.task_name)
+            for config in params.eval_tasks
+        ]
        (_, eval_metrics) = multitask_train_lib.run_experiment_with_multitask_eval(
            distribution_strategy=distribution_strategy,
...
official/nlp/data/classifier_data_lib.py

This diff is collapsed.
official/nlp/data/classifier_data_lib_test.py  (new file, mode 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""

import os
import tempfile

from absl.testing import parameterized
import tensorflow as tf
import tensorflow_datasets as tfds

from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib


def decode_record(record, name_to_features):
  """Decodes a record to a TensorFlow example."""
  return tf.io.parse_single_example(record, name_to_features)


class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(BertClassifierLibTest, self).setUp()
    self.model_dir = self.get_temp_dir()
    self.processors = {
        "CB": classifier_data_lib.CBProcessor,
        "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
        "BOOLQ": classifier_data_lib.BoolQProcessor,
        "WIC": classifier_data_lib.WiCProcessor,
    }

    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
      vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                 ]).encode("utf-8"))
      vocab_file = vocab_writer.name
    self.tokenizer = tokenization.FullTokenizer(vocab_file)

  @parameterized.parameters(
      {"task_type": "CB"},
      {"task_type": "BOOLQ"},
      {"task_type": "SUPERGLUE-RTE"},
      {"task_type": "WIC"},
  )
  def test_generate_dataset_from_tfds_processor(self, task_type):
    with tfds.testing.mock_data(num_examples=5):
      output_path = os.path.join(self.model_dir, task_type)

      processor = self.processors[task_type]()

      classifier_data_lib.generate_tf_record_from_data_file(
          processor,
          None,
          self.tokenizer,
          train_data_output_path=output_path,
          eval_data_output_path=output_path,
          test_data_output_path=output_path)
      files = tf.io.gfile.glob(output_path)
      self.assertNotEmpty(files)

      train_dataset = tf.data.TFRecordDataset(output_path)
      seq_length = 128
      label_type = tf.int64
      name_to_features = {
          "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
          "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
          "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
          "label_ids": tf.io.FixedLenFeature([], label_type),
      }
      train_dataset = train_dataset.map(
          lambda record: decode_record(record, name_to_features))

      # If data is retrieved without error, then all requirements
      # including data type/shapes are met.
      _ = next(iter(train_dataset))


if __name__ == "__main__":
  tf.test.main()
official/nlp/data/create_finetuning_data.py

...
@@ -50,7 +50,7 @@ flags.DEFINE_enum(
    "classification_task_name", "MNLI", [
        "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
        "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
-       "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ"
+       "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
    ], "The name of the task to train BERT classifier. The "
    "difference between XTREME-XNLI and XNLI is: 1. the format "
    "of input tsv files; 2. the dev set for XTREME is english "
...
@@ -173,8 +173,26 @@ flags.DEFINE_string(
 def generate_classifier_dataset():
   """Generates classifier dataset and returns input meta data."""
-  assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
-          FLAGS.tfds_params)
+  if FLAGS.classification_task_name in [
+      "COLA",
+      "WNLI",
+      "SST-2",
+      "MRPC",
+      "QQP",
+      "STS-B",
+      "MNLI",
+      "QNLI",
+      "RTE",
+      "AX",
+      "SUPERGLUE-RTE",
+      "CB",
+      "BoolQ",
+      "WIC",
+  ]:
+    assert not FLAGS.input_data_dir or FLAGS.tfds_params
+  else:
+    assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
+            FLAGS.tfds_params)

   if FLAGS.tokenization == "WordPiece":
     tokenizer = tokenization.FullTokenizer(
...
@@ -248,6 +266,8 @@ def generate_classifier_dataset():
      "cb": classifier_data_lib.CBProcessor,
      "boolq": classifier_data_lib.BoolQProcessor,
      "wic": classifier_data_lib.WnliProcessor,
  }
  task_name = FLAGS.classification_task_name.lower()
  if task_name not in processors:
...
official/nlp/data/sentence_prediction_dataloader.py

...
@@ -60,8 +60,8 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
    else:
      self._label_name_mapping = dict()

-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
+  def name_to_features_spec(self):
+    """Defines features to decode. Subclass may override to append features."""
    label_type = LABEL_TYPES_MAP[self._params.label_type]
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
...
@@ -72,7 +72,11 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)

-    example = tf.io.parse_single_example(record, name_to_features)
+    return name_to_features
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    example = tf.io.parse_single_example(record, self.name_to_features_spec())
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
...
@@ -86,20 +90,23 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x = {
-        'input_word_ids': record['input_ids'],
-        'input_mask': record['input_mask'],
-        'input_type_ids': record['segment_ids']
+    key_mapping = {
+        'input_ids': 'input_word_ids',
+        'input_mask': 'input_mask',
+        'segment_ids': 'input_type_ids'
    }
-    if self._include_example_id:
-      x['example_id'] = record['example_id']
-
-    x[self._label_field] = record[self._label_field]
+    ret = {}
+    for record_key in record:
+      if record_key in key_mapping:
+        ret[key_mapping[record_key]] = record[record_key]
+      else:
+        ret[record_key] = record[record_key]

    if self._label_field in self._label_name_mapping:
-      x[self._label_name_mapping[self._label_field]] = record[self._label_field]
+      ret[self._label_name_mapping[self._label_field]] = record[self._label_field]

-    return x
+    return ret

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
...
@@ -215,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
    """Berts preprocess."""
    segments = [record[x] for x in self._text_fields]
    model_inputs = self._text_processor(segments)
-    if self._include_example_id:
-      model_inputs['example_id'] = record['example_id']
-    model_inputs[self._label_field] = record[self._label_field]
+    for key in record:
+      if key not in self._text_fields:
+        model_inputs[key] = record[key]
    return model_inputs

-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
+  def name_to_features_spec(self):
    name_to_features = {}
    for text_field in self._text_fields:
      name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
...
@@ -230,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
      name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
+    return name_to_features
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    example = tf.io.parse_single_example(record, self.name_to_features_spec())
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
...
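
The net effect of the _parse rewrite is a generic rename-and-pass-through of record keys instead of a fixed whitelist. A standalone sketch of that logic (names mirror the hunk above; the record values are made up, and the dict comprehension is a simplification of the loop):

key_mapping = {'input_ids': 'input_word_ids',
               'input_mask': 'input_mask',
               'segment_ids': 'input_type_ids'}
record = {'input_ids': [1, 2], 'input_mask': [1, 1],
          'segment_ids': [0, 0], 'example_id': 7, 'label_ids': 1}

# Every key is kept; known keys are renamed to the model's input names.
ret = {key_mapping.get(k, k): v for k, v in record.items()}
# -> {'input_word_ids': [1, 2], 'input_mask': [1, 1],
#     'input_type_ids': [0, 0], 'example_id': 7, 'label_ids': 1}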
official/nlp/data/sentence_prediction_dataloader_test.py

...
@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
      dataset = loader.SentencePredictionTextDataLoader(data_config).load()
      features = next(iter(dataset))
      label_field = data_config.label_field
-      self.assertCountEqual(
-          ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-          features.keys())
+      expected_keys = [
+          'input_word_ids', 'input_type_ids', 'input_mask', label_field
+      ]
+      if use_tfds:
+        expected_keys += ['idx']
+      self.assertCountEqual(expected_keys, features.keys())
      self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...
@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
      dataset = loader.SentencePredictionTextDataLoader(data_config).load()
      features = next(iter(dataset))
      label_field = data_config.label_field
-      self.assertCountEqual(
-          ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-          features.keys())
+      expected_keys = [
+          'input_word_ids', 'input_type_ids', 'input_mask', label_field
+      ]
+      if use_tfds:
+        expected_keys += ['idx']
+      self.assertCountEqual(expected_keys, features.keys())
      self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...
@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
      dataset = loader.SentencePredictionTextDataLoader(data_config).load()
      features = next(iter(dataset))
      label_field = data_config.label_field
-      self.assertCountEqual(
-          ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-          features.keys())
+      expected_keys = [
+          'input_word_ids', 'input_type_ids', 'input_mask', label_field
+      ]
+      if use_tfds:
+        expected_keys += ['idx']
+      self.assertCountEqual(expected_keys, features.keys())
      self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
      self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...
official/nlp/keras_nlp/encoders/bert_encoder.py

...
@@ -69,6 +69,9 @@ class BertEncoder(tf.keras.Model):
      smaller than 'hidden_size').
    embedding_layer: An optional Layer instance which will be called to
      generate embeddings for the input word IDs.
    norm_first: Whether to normalize inputs to attention and intermediate
      dense layers. If set False, output of attention and intermediate dense
      layers is normalized.
  """

  def __init__(
...
@@ -87,6 +90,7 @@ class BertEncoder(tf.keras.Model):
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      **kwargs):
    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)
...
@@ -162,6 +166,7 @@ class BertEncoder(tf.keras.Model):
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
...
@@ -211,6 +216,7 @@ class BertEncoder(tf.keras.Model):
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
    }
    # We are storing the config dict as a namedtuple here to ensure checkpoint
...
official/nlp/keras_nlp/encoders/bert_encoder_test.py

...
@@ -205,7 +205,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
        initializer="glorot_uniform",
        output_range=-1,
        embedding_width=16,
-        embedding_layer=None)
+        embedding_layer=None,
+        norm_first=False)
    network = bert_encoder.BertEncoder(**kwargs)

    expected_config = dict(kwargs)
    expected_config["inner_activation"] = tf.keras.activations.serialize(
...
official/nlp/keras_nlp/layers/position_embedding_test.py

...
@@ -48,12 +48,12 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
    test_layer = position_embedding.PositionEmbedding(
        max_length=sequence_length, seq_axis=2)
    width = 30
-    input_tensor = tf.keras.Input(shape=(sequence_length, width, width))
+    input_tensor = tf.keras.Input(shape=(width, sequence_length, width))
    output_tensor = test_layer(input_tensor)

    # When using static positional embedding shapes, the output is expected
    # to be the same as the input shape in all dimensions save batch.
-    expected_output_shape = [None, sequence_length, width, width]
+    expected_output_shape = [None, width, sequence_length, width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    # The default output dtype for this layer should be tf.float32.
    self.assertEqual(tf.float32, output_tensor.dtype)
...
official/nlp/keras_nlp/layers/transformer_encoder_block.py

...
@@ -249,7 +249,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
        attention.

    Returns:
-      An ouput tensor with the same dimensions as input/query tensor.
+      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
...