Commit 78c43ef1 authored by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

parents 67cfc95b e3c7e300
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name
# Placeholder for the Adafactor class until the implementation is open sourced.
Adafactor = "Unimplemented"
......@@ -56,10 +56,12 @@ class StepwiseLrConfig(base_config.Config):
values[0]
[boundaries[0], boundaries[1]] -> values[1]
...
[boundaries[n-1], boundaries[n]] -> values[n]
[boundaries[n], end] -> values[n+1]
Defaults to None.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PiecewiseConstantDecay'
boundaries: Optional[List[int]] = None
values: Optional[List[float]] = None
offset: int = 0
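For illustration, a minimal sketch of the new field on this config (assuming keyword construction of the dataclass config, and that the optimizer factory forwards `offset` to the *WithOffset schedules registered later in this change):

stepwise_lr = StepwiseLrConfig(
    boundaries=[1000, 2000],
    values=[0.1, 0.01, 0.001],
    offset=100)  # the schedule is evaluated at (step - 100) at runtime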
@dataclasses.dataclass
......@@ -76,12 +78,14 @@ class ExponentialLrConfig(base_config.Config):
decay_rate: A float. Defaults to None.
staircase: A boolean. If true, the learning rate is decreased at discrete
intervals. Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'ExponentialDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
decay_rate: Optional[float] = None
staircase: Optional[bool] = None
offset: int = 0
@dataclasses.dataclass
......@@ -99,6 +103,7 @@ class PolynomialLrConfig(base_config.Config):
power: A float. The power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PolynomialDecay'
initial_learning_rate: Optional[float] = None
......@@ -106,6 +111,7 @@ class PolynomialLrConfig(base_config.Config):
end_learning_rate: float = 0.0001
power: float = 1.0
cycle: bool = False
offset: int = 0
@dataclasses.dataclass
......@@ -122,11 +128,13 @@ class CosineLrConfig(base_config.Config):
to None.
alpha: A float. Minimum learning rate value as a fraction of
initial_learning_rate.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'CosineDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
alpha: float = 0.0
offset: int = 0
@dataclasses.dataclass
......
......@@ -52,6 +52,7 @@ class OptimizerConfig(oneof.OneOfConfig):
lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()
@dataclasses.dataclass
......
......@@ -247,3 +247,22 @@ class SLIDEConfig(BaseOptimizerConfig):
do_gradient_rescaling: bool = True
norm_type: str = "layer"
ratio_clip_norm: float = 1e5
@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
"""Configuration for Adafactor optimizer.
The attributes of this class match the arguments of the Adafactor
implementation.
"""
name: str = "Adafactor"
factored: bool = True
multiply_by_parameter_scale: bool = True
beta1: Optional[float] = None
decay_rate: float = 0.8
step_offset: int = 0
clipping_threshold: float = 1.0
min_dim_size_to_factor: int = 128
epsilon1: float = 1e-30
epsilon2: float = 1e-3
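A hedged sketch of selecting the new optimizer through the `OptimizerConfig` oneof shown earlier (assuming the usual `type` selector of `oneof.OneOfConfig`; exact wiring may differ):

optimizer_config = OptimizerConfig(
    type='adafactor',
    adafactor=AdafactorConfig(
        decay_rate=0.8,
        clipping_threshold=1.0))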
......@@ -19,6 +19,75 @@ from typing import Mapping, Any, Union, Optional
import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
"""Generates a offset wrapper of learning rate schedule.
It will returns a subclass of the the `base_lr_class`, the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
Example:
CosineDecayWithOffset = _make_offset_wrapper(
'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
# Use the lr:
lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
decay_steps=1000)
lr(101)  # equivalent to tf.keras.experimental.CosineDecay(...)(101 - 100)
Args:
new_class_name: the name of the new class.
base_lr_class: the base learning rate schedule class. Should be a subclass
of tf.keras.optimizers.schedules.LearningRateSchedule.
Returns:
A new class (subclass of the base_lr_class) that can take an offset.
"""
assert issubclass(base_lr_class,
tf.keras.optimizers.schedules.LearningRateSchedule), (
"base_lr_class should be subclass of keras "
f"LearningRateSchedule, got {base_lr_class}")
# pylint: disable=protected-access,pointless-statement
def offset_learning_rate_init(self, offset=0, **kwargs):
"""Construct learning rate schedule object.
When this object is called, its behavior is
self.__call__(step) == base_lr_class.__call__(step - offset)
Args:
self: this object.
offset: The offset when computing the learning rate schedule.
**kwargs: Pass through to base learning rate class constructor.
"""
base_lr_class.__init__(self, **kwargs)
self._offset = offset
def offset_learning_rate_call(self, step):
step = tf.cast(step - self._offset, tf.float32)
return base_lr_class.__call__(self, step)
# pylint: enable=protected-access,pointless-statement
return type(
new_class_name, (base_lr_class,), {
"base_lr_class": base_lr_class,
"__init__": offset_learning_rate_init,
"__call__": offset_learning_rate_call
})
PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
"PiecewiseConstantDecayWithOffset",
tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
"PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
"ExponentialDecayWithOffset",
tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
tf.keras.experimental.CosineDecay)
class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Linear warmup schedule."""
......
......@@ -70,5 +70,40 @@ class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):
self.assertAlmostEqual(lr(step).numpy(), value)
class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
dict(class_name=lr_schedule.PolynomialDecayWithOffset),
dict(class_name=lr_schedule.ExponentialDecayWithOffset),
dict(class_name=lr_schedule.CosineDecayWithOffset),
)
def test_generated_docstring(self, class_name):
self.assertNotEmpty(class_name.__init__.__doc__)
@parameterized.parameters(
dict(
class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
dict(
class_name=lr_schedule.PolynomialDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
dict(
class_name=lr_schedule.ExponentialDecayWithOffset,
kwarg=dict(
initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
dict(
class_name=lr_schedule.CosineDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
)
def test_offset(self, class_name, kwarg):
offset = 10
offset_lr = class_name(offset=offset, **kwarg)
base_lr = class_name.base_lr_class(**kwarg)
self.assertIsInstance(offset_lr, class_name)
for step in range(10, 101, 10):
self.assertEqual(offset_lr(step), base_lr(step - offset))
if __name__ == '__main__':
tf.test.main()
......@@ -20,6 +20,7 @@ import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import lr_schedule
......@@ -34,14 +35,15 @@ OPTIMIZERS_CLS = {
'rmsprop': tf.keras.optimizers.RMSprop,
'lars': lars_optimizer.LARS,
'adagrad': tf.keras.optimizers.Adagrad,
'slide': slide_optimizer.SLIDE
'slide': slide_optimizer.SLIDE,
'adafactor': adafactor_optimizer.Adafactor,
}
LR_CLS = {
'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
'cosine': tf.keras.experimental.CosineDecay,
'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
'polynomial': lr_schedule.PolynomialDecayWithOffset,
'exponential': lr_schedule.ExponentialDecayWithOffset,
'cosine': lr_schedule.CosineDecayWithOffset,
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
......
......@@ -14,29 +14,16 @@
"""Functions and classes related to training performance."""
from absl import logging
import tensorflow as tf
def configure_optimizer(optimizer,
use_float16=False,
use_graph_rewrite=False,
loss_scale='dynamic',
use_experimental_api=False):
loss_scale=None):
"""Configures optimizer object with performance options."""
if use_experimental_api:
logging.warning('Passing use_experimental_api=True is deprecated. The '
'argument will be removed in the future.')
if use_float16:
# TODO(b/171936854): Move all methods to non-experimental api.
if use_experimental_api:
# Wraps optimizer with a LossScaleOptimizer. This is done automatically
# in compile() with the "mixed_float16" policy, but since we do not call
# compile(), we must wrap the optimizer manually.
optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
optimizer, loss_scale=loss_scale))
elif loss_scale == 'dynamic':
if loss_scale in (None, 'dynamic'):
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
else:
# loss_scale is a number. We interpret that as a fixed loss scale.
......@@ -52,34 +39,17 @@ def configure_optimizer(optimizer,
return optimizer
def set_mixed_precision_policy(dtype, loss_scale=None,
use_experimental_api=False):
"""Sets mix precision policy."""
if use_experimental_api:
logging.warning('Passing use_experimental_api=True is deprecated. The '
'argument will be removed in the future.')
assert use_experimental_api or loss_scale is None, (
'loss_scale cannot be specified if use_experimental_api is False. If the '
'non-experimental API is used, specify the loss scaling configuration '
'when creating the LossScaleOptimizer instead.'
)
def set_mixed_precision_policy(dtype, loss_scale=None):
"""Sets the global `tf.keras.mixed_precision.Policy`."""
# TODO(b/191894773): Remove loss_scale argument
assert loss_scale is None, (
'The loss_scale argument must be None. The argument exists for '
'historical reasons and will be removed soon.')
if dtype == tf.float16:
# TODO(b/171936854): Move all methods to non-experimental api.
if use_experimental_api:
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
tf.keras.mixed_precision.experimental.set_policy(policy)
else:
tf.keras.mixed_precision.set_global_policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy('mixed_float16')
elif dtype == tf.bfloat16:
if use_experimental_api:
tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
else:
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
elif dtype == tf.float32:
if use_experimental_api:
tf.keras.mixed_precision.experimental.set_policy('float32')
else:
tf.keras.mixed_precision.set_global_policy('float32')
tf.keras.mixed_precision.set_global_policy('float32')
else:
raise ValueError('Unexpected dtype: %s' % dtype)
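A minimal usage sketch of the simplified, non-experimental API after this change (assuming these helpers remain in official.modeling.performance):

import tensorflow as tf
from official.modeling import performance

performance.set_mixed_precision_policy(tf.float16)  # sets the global 'mixed_float16' policy
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
# With loss_scale left as None (the new default), a dynamic LossScaleOptimizer is used.
optimizer = performance.configure_optimizer(optimizer, use_float16=True)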
......@@ -108,6 +108,7 @@ def get_activation(identifier, use_keras_layer=False):
"linear": "linear",
"identity": "linear",
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
}
if identifier in keras_layer_allowlist:
......
......@@ -46,6 +46,8 @@ class BertEncoderConfig(hyperparams.Config):
embedding_size: Optional[int] = None
output_range: Optional[int] = None
return_all_encoder_outputs: bool = False
# Pre/Post-LN Transformer
norm_first: bool = False
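A brief, hedged sketch of enabling the flag on the config (keyword construction as for other hyperparams.Config dataclasses):

encoder_cfg = BertEncoderConfig(norm_first=True)  # pre-LN transformer blocks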
@dataclasses.dataclass
......@@ -132,6 +134,8 @@ class BigBirdEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 4096
num_rand_blocks: int = 3
block_size: int = 64
......@@ -152,6 +156,8 @@ class KernelEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 512
type_vocab_size: int = 2
initializer_range: float = 0.02
......@@ -161,6 +167,7 @@ class KernelEncoderConfig(hyperparams.Config):
redraw: bool = False
is_short_seq: bool = False
begin_kernel: int = 0
scale: Optional[float] = None
@dataclasses.dataclass
......@@ -339,6 +346,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.BigBirdAttention,
......@@ -377,6 +385,7 @@ def build_encoder(config: EncoderConfig,
redraw=encoder_cfg.redraw,
is_short_seq=encoder_cfg.is_short_seq,
begin_kernel=encoder_cfg.begin_kernel,
scale=encoder_cfg.scale,
)
hidden_cfg = dict(
num_attention_heads=encoder_cfg.num_attention_heads,
......@@ -385,6 +394,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.KernelAttention,
......@@ -445,4 +455,5 @@ def build_encoder(config: EncoderConfig,
embedding_width=encoder_cfg.embedding_size,
embedding_layer=embedding_layer,
return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
dict_outputs=True)
dict_outputs=True,
norm_first=encoder_cfg.norm_first)
......@@ -28,7 +28,6 @@ from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.modeling.multitask import configs
from official.modeling.multitask import multitask
from official.modeling.multitask import train_lib as multitask_train_lib
......@@ -167,7 +166,10 @@ def run_continuous_finetune(
with distribution_strategy.scope():
if isinstance(params, configs.MultiEvalExperimentConfig):
task = task_factory.get_task(params_replaced.task)
eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks)
eval_tasks = [
task_factory.get_task(config.task_config, name=config.task_name)
for config in params.eval_tasks
]
(_,
eval_metrics) = multitask_train_lib.run_experiment_with_multitask_eval(
distribution_strategy=distribution_strategy,
......
......@@ -129,24 +129,52 @@ class DataProcessor(object):
lines.append(json.loads(json_str))
return lines
def featurize_example(self, *args, **kwargs):
"""Converts a single `InputExample` into a single `InputFeatures`."""
return convert_single_example(*args, **kwargs)
class DefaultGLUEDataProcessor(DataProcessor):
"""Processor for the SuperGLUE dataset."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("test")
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
raise NotImplementedError()
class AxProcessor(DataProcessor):
"""Processor for the AX dataset (GLUE diagnostics dataset)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
train_mnli_dataset = tfds.load(
"glue/mnli", split="train", try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(train_mnli_dataset, "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
val_mnli_dataset = tfds.load(
"glue/mnli", split="validation_matched",
try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(val_mnli_dataset, "validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
test_ax_dataset = tfds.load(
"glue/ax", split="test", try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(test_ax_dataset, "test")
def get_labels(self):
"""See base class."""
......@@ -157,46 +185,26 @@ class AxProcessor(DataProcessor):
"""See base class."""
return "AX"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, dataset, set_type):
"""Creates examples for the training/dev/test sets."""
text_a_index = 1 if set_type == "test" else 8
text_b_index = 2 if set_type == "test" else 9
examples = []
for i, line in enumerate(lines):
# Skip header.
if i == 0:
continue
guid = "%s-%s" % (set_type, self.process_text_fn(line[0]))
text_a = self.process_text_fn(line[text_a_index])
text_b = self.process_text_fn(line[text_b_index])
if set_type == "test":
label = "contradiction"
else:
label = self.process_text_fn(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "contradiction"
text_a = self.process_text_fn(example["hypothesis"])
text_b = self.process_text_fn(example["premise"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class ColaProcessor(DataProcessor):
class ColaProcessor(DefaultGLUEDataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def __init__(self, process_text_fn=tokenization.convert_to_unicode):
super(ColaProcessor, self).__init__(process_text_fn)
self.dataset = tfds.load("glue/cola", try_gcs=True)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -208,7 +216,8 @@ class ColaProcessor(DataProcessor):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = self.dataset[set_type].as_numpy_iterator()
dataset = tfds.load(
"glue/cola", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
......@@ -267,34 +276,28 @@ class MnliProcessor(DataProcessor):
mnli_type="matched",
process_text_fn=tokenization.convert_to_unicode):
super(MnliProcessor, self).__init__(process_text_fn)
self.dataset = tfds.load("glue/mnli", try_gcs=True)
if mnli_type not in ("matched", "mismatched"):
raise ValueError("Invalid `mnli_type`: %s" % mnli_type)
self.mnli_type = mnli_type
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
if self.mnli_type == "matched":
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
return self._create_examples_tfds("validation_matched")
else:
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
"dev_mismatched")
return self._create_examples_tfds("validation_mismatched")
def get_test_examples(self, data_dir):
"""See base class."""
if self.mnli_type == "matched":
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
return self._create_examples_tfds("test_matched")
else:
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test")
return self._create_examples_tfds("test_mismatched")
def get_labels(self):
"""See base class."""
......@@ -305,42 +308,28 @@ class MnliProcessor(DataProcessor):
"""See base class."""
return "MNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/mnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, self.process_text_fn(line[0]))
text_a = self.process_text_fn(line[8])
text_b = self.process_text_fn(line[9])
if set_type == "test":
label = "contradiction"
else:
label = self.process_text_fn(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "contradiction"
text_a = self.process_text_fn(example["hypothesis"])
text_b = self.process_text_fn(example["premise"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class MrpcProcessor(DataProcessor):
class MrpcProcessor(DefaultGLUEDataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -350,21 +339,22 @@ class MrpcProcessor(DataProcessor):
"""See base class."""
return "MRPC"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/mrpc", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line[3])
text_b = self.process_text_fn(line[4])
if set_type == "test":
label = "0"
else:
label = self.process_text_fn(line[0])
label = "0"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
......@@ -447,24 +437,9 @@ class PawsxProcessor(DataProcessor):
return "XTREME-PAWS-X"
class QnliProcessor(DataProcessor):
class QnliProcessor(DefaultGLUEDataProcessor):
"""Processor for the QNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]
......@@ -474,44 +449,28 @@ class QnliProcessor(DataProcessor):
"""See base class."""
return "QNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/qnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, 1)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
label = "entailment"
else:
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
label = tokenization.convert_to_unicode(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "entailment"
text_a = self.process_text_fn(example["question"])
text_b = self.process_text_fn(example["sentence"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class QqpProcessor(DataProcessor):
class QqpProcessor(DefaultGLUEDataProcessor):
"""Processor for the QQP data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -521,48 +480,28 @@ class QqpProcessor(DataProcessor):
"""See base class."""
return "QQP"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/qqp", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
if set_type == "test":
text_a = line[1]
text_b = line[2]
label = "0"
else:
# There appear to be some garbage lines in the train dataset.
try:
text_a = line[3]
text_b = line[4]
label = line[5]
except IndexError:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "0"
text_a = self.process_text_fn(example["question1"])
text_b = self.process_text_fn(example["question2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class RteProcessor(DataProcessor):
class RteProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
# All datasets are converted to 2-class split, where for 3-class datasets we
......@@ -574,42 +513,28 @@ class RteProcessor(DataProcessor):
"""See base class."""
return "RTE"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/rte", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "entailment"
else:
label = tokenization.convert_to_unicode(line[3])
label = "entailment"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class SstProcessor(DataProcessor):
class SstProcessor(DefaultGLUEDataProcessor):
"""Processor for the SST-2 data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -619,25 +544,24 @@ class SstProcessor(DataProcessor):
"""See base class."""
return "SST-2"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/sst2", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[0])
label = tokenization.convert_to_unicode(line[1])
label = "0"
text_a = self.process_text_fn(example["sentence"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label, weight=None))
return examples
class StsBProcessor(DataProcessor):
class StsBProcessor(DefaultGLUEDataProcessor):
"""Processor for the STS-B data set (GLUE version)."""
def __init__(self, process_text_fn=tokenization.convert_to_unicode):
......@@ -646,20 +570,23 @@ class StsBProcessor(DataProcessor):
self.label_type = float
self._labels = None
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/stsb", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = 0.0
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = self.label_type(example["label"])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
def get_labels(self):
"""See base class."""
......@@ -670,23 +597,6 @@ class StsBProcessor(DataProcessor):
"""See base class."""
return "STS-B"
def _create_examples(self, lines, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[7])
text_b = tokenization.convert_to_unicode(line[8])
if set_type == "test":
label = 0.0
else:
label = self.label_type(tokenization.convert_to_unicode(line[9]))
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class TfdsProcessor(DataProcessor):
"""Processor for generic text classification and regression TFDS data set.
......@@ -816,24 +726,9 @@ class TfdsProcessor(DataProcessor):
return examples
class WnliProcessor(DataProcessor):
class WnliProcessor(DefaultGLUEDataProcessor):
"""Processor for the WNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -843,21 +738,22 @@ class WnliProcessor(DataProcessor):
"""See base class."""
return "WNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/wnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[3])
label = "0"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
......@@ -1314,30 +1210,7 @@ class AXgProcessor(DataProcessor):
return examples
class SuperGLUEDataProcessor(DataProcessor):
"""Processor for the SuperGLUE dataset."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test")
def _create_examples(self, lines, set_type):
"""Creates examples for the training/dev/test sets."""
raise NotImplementedError()
class BoolQProcessor(SuperGLUEDataProcessor):
class BoolQProcessor(DefaultGLUEDataProcessor):
"""Processor for the BoolQ dataset (SuperGLUE diagnostics dataset)."""
def get_labels(self):
......@@ -1349,23 +1222,24 @@ class BoolQProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "BoolQ"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"super_glue/boolq", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for line in lines:
guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
text_a = self.process_text_fn(line["question"])
text_b = self.process_text_fn(line["passage"])
if set_type == "test":
label = "False"
else:
label = str(line["label"])
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["question"])
text_b = self.process_text_fn(example["passage"])
label = "False"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class CBProcessor(SuperGLUEDataProcessor):
class CBProcessor(DefaultGLUEDataProcessor):
"""Processor for the CB dataset (SuperGLUE diagnostics dataset)."""
def get_labels(self):
......@@ -1377,23 +1251,24 @@ class CBProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "CB"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"super_glue/cb", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for line in lines:
guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
if set_type == "test":
label = "entailment"
else:
label = self.process_text_fn(line["label"])
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["premise"])
text_b = self.process_text_fn(example["hypothesis"])
label = "entailment"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class SuperGLUERTEProcessor(SuperGLUEDataProcessor):
class SuperGLUERTEProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE dataset (SuperGLUE version)."""
def get_labels(self):
......@@ -1407,28 +1282,163 @@ class SuperGLUERTEProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "RTESuperGLUE"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
for i, line in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
if set_type == "test":
label = "entailment"
else:
label = self.process_text_fn(line["label"])
dataset = tfds.load(
"super_glue/rte", split=set_type, try_gcs=True).as_numpy_iterator()
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["premise"])
text_b = self.process_text_fn(example["hypothesis"])
label = "entailment"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class WiCInputExample(InputExample):
"""Processor for the WiC dataset (SuperGLUE version)."""
def __init__(self,
guid,
text_a,
text_b=None,
label=None,
word=None,
weight=None,
example_id=None):
"""A single training/test example for simple seq regression/classification."""
super(WiCInputExample, self).__init__(guid, text_a, text_b, label, weight,
example_id)
self.word = word
class WiCProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE dataset (SuperGLUE version)."""
def get_labels(self):
"""Not used."""
return []
@staticmethod
def get_processor_name():
"""See base class."""
return "RTESuperGLUE"
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
dataset = tfds.load(
"super_glue/wic", split=set_type, try_gcs=True).as_numpy_iterator()
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
word = self.process_text_fn(example["word"])
label = 0
if set_type != "test":
label = example["label"]
examples.append(
WiCInputExample(
guid=guid, text_a=text_a, text_b=text_b, word=word, label=label))
return examples
def featurize_example(self, ex_index, example, label_list, max_seq_length,
tokenizer):
"""Here we concate sentence1, sentence2, word together with [SEP] tokens."""
del label_list
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = tokenizer.tokenize(example.text_b)
tokens_word = tokenizer.tokenize(example.word)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP], [SEP] with "- 4"
# Here we only pop out the first two sentence tokens.
_truncate_seq_pair(tokens_a, tokens_b,
max_seq_length - 4 - len(tokens_word))
seg_id_a = 0
seg_id_b = 1
seg_id_c = 2
seg_id_cls = 0
seg_id_pad = 0
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(seg_id_cls)
for token in tokens_a:
tokens.append(token)
segment_ids.append(seg_id_a)
tokens.append("[SEP]")
segment_ids.append(seg_id_a)
for token in tokens_b:
tokens.append(token)
segment_ids.append(seg_id_b)
tokens.append("[SEP]")
segment_ids.append(seg_id_b)
for token in tokens_word:
tokens.append(token)
segment_ids.append(seg_id_c)
tokens.append("[SEP]")
segment_ids.append(seg_id_c)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(seg_id_pad)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = example.label
if ex_index < 5:
logging.info("*** Example ***")
logging.info("guid: %s", (example.guid))
logging.info("tokens: %s",
" ".join([tokenization.printable_text(x) for x in tokens]))
logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
logging.info("label: %s (id = %s)", example.label, str(label_id))
logging.info("weight: %s", example.weight)
logging.info("example_id: %s", example.example_id)
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id,
is_real_example=True,
weight=example.weight,
example_id=example.example_id)
return feature
def file_based_convert_examples_to_features(examples,
label_list,
max_seq_length,
tokenizer,
output_file,
label_type=None):
label_type=None,
featurize_fn=None):
"""Convert a set of `InputExample`s to a TFRecord file."""
tf.io.gfile.makedirs(os.path.dirname(output_file))
......@@ -1438,8 +1448,12 @@ def file_based_convert_examples_to_features(examples,
if ex_index % 10000 == 0:
logging.info("Writing example %d of %d", ex_index, len(examples))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
if featurize_fn:
feature = featurize_fn(ex_index, example, label_list, max_seq_length,
tokenizer)
else:
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
......@@ -1528,7 +1542,8 @@ def generate_tf_record_from_data_file(processor,
file_based_convert_examples_to_features(train_input_data_examples,
label_list, max_seq_length,
tokenizer, train_data_output_path,
label_type)
label_type,
processor.featurize_example)
num_training_data = len(train_input_data_examples)
if eval_data_output_path:
......@@ -1536,7 +1551,8 @@ def generate_tf_record_from_data_file(processor,
file_based_convert_examples_to_features(eval_input_data_examples,
label_list, max_seq_length,
tokenizer, eval_data_output_path,
label_type)
label_type,
processor.featurize_example)
meta_data = {
"processor_type": processor.get_processor_name(),
......@@ -1550,13 +1566,15 @@ def generate_tf_record_from_data_file(processor,
for language, examples in test_input_data_examples.items():
file_based_convert_examples_to_features(
examples, label_list, max_seq_length, tokenizer,
test_data_output_path.format(language), label_type)
test_data_output_path.format(language), label_type,
processor.featurize_example)
meta_data["test_{}_data_size".format(language)] = len(examples)
else:
file_based_convert_examples_to_features(test_input_data_examples,
label_list, max_seq_length,
tokenizer, test_data_output_path,
label_type)
label_type,
processor.featurize_example)
meta_data["test_data_size"] = len(test_input_data_examples)
if is_regression:
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""
import os
import tempfile
from absl.testing import parameterized
import tensorflow as tf
import tensorflow_datasets as tfds
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
def decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
return tf.io.parse_single_example(record, name_to_features)
class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(BertClassifierLibTest, self).setUp()
self.model_dir = self.get_temp_dir()
self.processors = {
"CB": classifier_data_lib.CBProcessor,
"SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
"BOOLQ": classifier_data_lib.BoolQProcessor,
"WIC": classifier_data_lib.WiCProcessor,
}
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens
]).encode("utf-8"))
vocab_file = vocab_writer.name
self.tokenizer = tokenization.FullTokenizer(vocab_file)
@parameterized.parameters(
{"task_type": "CB"},
{"task_type": "BOOLQ"},
{"task_type": "SUPERGLUE-RTE"},
{"task_type": "WIC"},
)
def test_generate_dataset_from_tfds_processor(self, task_type):
with tfds.testing.mock_data(num_examples=5):
output_path = os.path.join(self.model_dir, task_type)
processor = self.processors[task_type]()
classifier_data_lib.generate_tf_record_from_data_file(
processor,
None,
self.tokenizer,
train_data_output_path=output_path,
eval_data_output_path=output_path,
test_data_output_path=output_path)
files = tf.io.gfile.glob(output_path)
self.assertNotEmpty(files)
train_dataset = tf.data.TFRecordDataset(output_path)
seq_length = 128
label_type = tf.int64
name_to_features = {
"input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
"segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([], label_type),
}
train_dataset = train_dataset.map(
lambda record: decode_record(record, name_to_features))
# If data is retrieved without error, then all requirements
# including data type/shapes are met.
_ = next(iter(train_dataset))
if __name__ == "__main__":
tf.test.main()
......@@ -50,7 +50,7 @@ flags.DEFINE_enum(
"classification_task_name", "MNLI", [
"AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
"SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ"
"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
], "The name of the task to train BERT classifier. The "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"of input tsv files; 2. the dev set for XTREME is english "
......@@ -173,8 +173,26 @@ flags.DEFINE_string(
def generate_classifier_dataset():
"""Generates classifier dataset and returns input meta data."""
assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
FLAGS.tfds_params)
if FLAGS.classification_task_name in [
"COLA",
"WNLI",
"SST-2",
"MRPC",
"QQP",
"STS-B",
"MNLI",
"QNLI",
"RTE",
"AX",
"SUPERGLUE-RTE",
"CB",
"BoolQ",
"WIC",
]:
assert not FLAGS.input_data_dir or FLAGS.tfds_params
else:
assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
FLAGS.tfds_params)
if FLAGS.tokenization == "WordPiece":
tokenizer = tokenization.FullTokenizer(
......@@ -248,6 +266,8 @@ def generate_classifier_dataset():
classifier_data_lib.CBProcessor,
"boolq":
classifier_data_lib.BoolQProcessor,
"wic":
classifier_data_lib.WiCProcessor,
}
task_name = FLAGS.classification_task_name.lower()
if task_name not in processors:
......
......@@ -60,8 +60,8 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
else:
self._label_name_mapping = dict()
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
def name_to_features_spec(self):
"""Defines features to decode. Subclass may override to append features."""
label_type = LABEL_TYPES_MAP[self._params.label_type]
name_to_features = {
'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
......@@ -72,7 +72,11 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
return name_to_features
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
......@@ -86,20 +90,23 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
def _parse(self, record: Mapping[str, tf.Tensor]):
"""Parses raw tensors into a dict of tensors to be consumed by the model."""
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
key_mapping = {
'input_ids': 'input_word_ids',
'input_mask': 'input_mask',
'segment_ids': 'input_type_ids'
}
if self._include_example_id:
x['example_id'] = record['example_id']
x[self._label_field] = record[self._label_field]
ret = {}
for record_key in record:
if record_key in key_mapping:
ret[key_mapping[record_key]] = record[record_key]
else:
ret[record_key] = record[record_key]
if self._label_field in self._label_name_mapping:
x[self._label_name_mapping[self._label_field]] = record[self._label_field]
ret[self._label_name_mapping[self._label_field]] = record[
self._label_field]
return x
return ret
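Because the feature spec now lives in name_to_features_spec, a subclass can append features without re-implementing _decode or _parse. A hypothetical sketch (ExtraIdDataLoader is not part of this change):

class ExtraIdDataLoader(SentencePredictionDataLoader):

  def name_to_features_spec(self):
    name_to_features = super().name_to_features_spec()
    # An extra int64 scalar; _parse passes unknown record keys through unchanged.
    name_to_features['extra_id'] = tf.io.FixedLenFeature([], tf.int64)
    return name_to_features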
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
......@@ -215,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
"""Berts preprocess."""
segments = [record[x] for x in self._text_fields]
model_inputs = self._text_processor(segments)
if self._include_example_id:
model_inputs['example_id'] = record['example_id']
model_inputs[self._label_field] = record[self._label_field]
for key in record:
if key not in self._text_fields:
model_inputs[key] = record[key]
return model_inputs
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
def name_to_features_spec(self):
name_to_features = {}
for text_field in self._text_fields:
name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
......@@ -230,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
return name_to_features
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
......
......@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......
......@@ -69,6 +69,9 @@ class BertEncoder(tf.keras.Model):
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to
generate embeddings for the input word IDs.
norm_first: Whether to normalize the inputs to the attention and
intermediate dense layers (pre-layer-norm). If set to False, the outputs
of the attention and intermediate dense layers are normalized instead
(post-layer-norm).
"""
def __init__(
......@@ -87,6 +90,7 @@ class BertEncoder(tf.keras.Model):
output_range=None,
embedding_width=None,
embedding_layer=None,
norm_first=False,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
......@@ -162,6 +166,7 @@ class BertEncoder(tf.keras.Model):
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
......@@ -211,6 +216,7 @@ class BertEncoder(tf.keras.Model):
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
......
......@@ -205,7 +205,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
embedding_layer=None)
embedding_layer=None,
norm_first=False)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
......
......@@ -48,12 +48,12 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, seq_axis=2)
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width, width))
input_tensor = tf.keras.Input(shape=(width, sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width, width]
expected_output_shape = [None, width, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
......
......@@ -249,7 +249,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
attention.
Returns:
An ouput tensor with the same dimensions as input/query tensor.
An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
......