Commit 47bc1813 authored by syiming

Merge remote-tracking branch 'upstream/master' into add_multilevel_crop_and_resize

parents d8611151 b035a227
@@ -26,7 +26,7 @@ class BertModelsTest(tf.test.TestCase):
  def test_network_invocation(self):
    config = bert.BertPretrainerConfig(
        encoder=encoders.TransformerEncoderConfig(vocab_size=10, num_layers=1))
    _ = bert.instantiate_bertpretrainer_from_cfg(config)
    # Invokes with classification heads.
    config = bert.BertPretrainerConfig(
@@ -35,7 +35,7 @@ class BertModelsTest(tf.test.TestCase):
        bert.ClsHeadConfig(
            inner_dim=10, num_classes=2, name="next_sentence")
    ])
    _ = bert.instantiate_bertpretrainer_from_cfg(config)
    with self.assertRaises(ValueError):
      config = bert.BertPretrainerConfig(
@@ -47,7 +47,7 @@ class BertModelsTest(tf.test.TestCase):
        bert.ClsHeadConfig(
            inner_dim=10, num_classes=2, name="next_sentence")
    ])
    _ = bert.instantiate_bertpretrainer_from_cfg(config)

  def test_checkpoint_items(self):
    config = bert.BertPretrainerConfig(
@@ -56,7 +56,7 @@ class BertModelsTest(tf.test.TestCase):
        bert.ClsHeadConfig(
            inner_dim=10, num_classes=2, name="next_sentence")
    ])
    encoder = bert.instantiate_bertpretrainer_from_cfg(config)
    self.assertSameElements(encoder.checkpoint_items.keys(),
                            ["encoder", "next_sentence.pooler_dense"])
...
@@ -13,11 +13,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer Encoders.

Includes configurations and instantiation methods.
"""
import dataclasses

import tensorflow as tf

from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.modeling import networks


@dataclasses.dataclass
@@ -28,9 +34,29 @@ class TransformerEncoderConfig(base_config.Config):
  num_layers: int = 12
  num_attention_heads: int = 12
  hidden_activation: str = "gelu"
  intermediate_size: int = 3072
  dropout_rate: float = 0.1
  attention_dropout_rate: float = 0.1
  max_position_embeddings: int = 512
  type_vocab_size: int = 2
  initializer_range: float = 0.02
def instantiate_encoder_from_cfg(
config: TransformerEncoderConfig) -> networks.TransformerEncoder:
"""Instantiate a Transformer encoder network from TransformerEncoderConfig."""
encoder_network = networks.TransformerEncoder(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
num_layers=config.num_layers,
num_attention_heads=config.num_attention_heads,
intermediate_size=config.intermediate_size,
activation=tf_utils.get_activation(config.hidden_activation),
dropout_rate=config.dropout_rate,
attention_dropout_rate=config.attention_dropout_rate,
sequence_length=None,
max_sequence_length=config.max_position_embeddings,
type_vocab_size=config.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=config.initializer_range))
return encoder_network
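
# Illustrative usage sketch (not part of this change): build an encoder network
# from a config. The vocab size below is an arbitrary example value; the other
# fields fall back to the dataclass defaults shown above.
example_config = TransformerEncoderConfig(vocab_size=30522)
example_encoder = instantiate_encoder_from_cfg(example_config)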
@@ -33,7 +33,13 @@ from official.nlp.bert import tokenization
class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self,
               guid,
               text_a,
               text_b=None,
               label=None,
               weight=None,
               int_iden=None):
    """Constructs a InputExample.

    Args:
@@ -46,12 +52,15 @@ class InputExample(object):
        specified for train and dev examples, but not for test examples.
      weight: (Optional) float. The weight of the example to be used during
        training.
      int_iden: (Optional) int. The int identification number of example in the
        corpus.
    """
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.label = label
    self.weight = weight
    self.int_iden = int_iden


class InputFeatures(object):
@@ -63,13 +72,15 @@ class InputFeatures(object):
               segment_ids,
               label_id,
               is_real_example=True,
               weight=None,
               int_iden=None):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.is_real_example = is_real_example
    self.weight = weight
    self.int_iden = int_iden


class DataProcessor(object):
@@ -191,12 +202,68 @@ class XnliProcessor(DataProcessor):
    return "XNLI"


class XtremeXnliProcessor(DataProcessor):
  """Processor for the XTREME XNLI data set."""
  supported_languages = [
      "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr",
      "ur", "vi", "zh"
  ]
def get_train_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv"))
examples = []
for (i, line) in enumerate(lines):
guid = "train-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_dev_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv"))
examples = []
for (i, line) in enumerate(lines):
guid = "dev-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_test_examples(self, data_dir):
"""See base class."""
examples_by_lang = {k: [] for k in self.supported_languages}
for lang in self.supported_languages:
lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv"))
for (i, line) in enumerate(lines):
guid = f"test-{i}"
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = "contradiction"
examples_by_lang[lang].append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples_by_lang
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
@staticmethod
def get_processor_name():
"""See base class."""
return "XTREME-XNLI"
class PawsxProcessor(DataProcessor):
"""Processor for the PAWS-X data set."""
supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"]
  def __init__(self,
               language="en",
               process_text_fn=tokenization.convert_to_unicode):
@@ -219,8 +286,7 @@ class PawsxProcessor(DataProcessor):
        train_tsv = "translated_train.tsv"
      # Skips the header.
      lines.extend(
          self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:])

    examples = []
    for (i, line) in enumerate(lines):
@@ -235,10 +301,9 @@ class PawsxProcessor(DataProcessor):
  def get_dev_examples(self, data_dir):
    """See base class."""
    lines = []
    for lang in PawsxProcessor.supported_languages:
      # Skips the header.
      lines.extend(
          self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:])

    examples = []
    for (i, line) in enumerate(lines):
@@ -252,17 +317,15 @@ class PawsxProcessor(DataProcessor):
  def get_test_examples(self, data_dir):
    """See base class."""
    examples_by_lang = {k: [] for k in self.supported_languages}
    for lang in self.supported_languages:
      lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:]
      for (i, line) in enumerate(lines):
        guid = "test-%d" % i
        text_a = self.process_text_fn(line[1])
        text_b = self.process_text_fn(line[2])
        label = self.process_text_fn(line[3])
        examples_by_lang[lang].append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples_by_lang
@@ -273,7 +336,62 @@ class PawsxProcessor(DataProcessor):
  @staticmethod
  def get_processor_name():
    """See base class."""
    return "XTREME-PAWS-X"
class XtremePawsxProcessor(DataProcessor):
"""Processor for the XTREME PAWS-X data set."""
supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"]
def get_train_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv"))
examples = []
for (i, line) in enumerate(lines):
guid = "train-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_dev_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv"))
examples = []
for (i, line) in enumerate(lines):
guid = "dev-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_test_examples(self, data_dir):
"""See base class."""
examples_by_lang = {k: [] for k in self.supported_languages}
for lang in self.supported_languages:
lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv"))
for (i, line) in enumerate(lines):
guid = "test-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = "0"
examples_by_lang[lang].append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples_by_lang
def get_labels(self):
"""See base class."""
return ["0", "1"]
@staticmethod
def get_processor_name():
"""See base class."""
return "XTREME-PAWS-X"
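
# Illustrative usage sketch (not part of this change): the XTREME processors read
# English-only train/dev files plus one per-language test file; the data
# directory below is hypothetical.
example_processor = XtremeXnliProcessor(
    process_text_fn=tokenization.convert_to_unicode)
example_train = example_processor.get_train_examples("/data/xtreme/xnli")
example_test_by_lang = example_processor.get_test_examples("/data/xtreme/xnli")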
class MnliProcessor(DataProcessor):
@@ -407,8 +525,8 @@ class QqpProcessor(DataProcessor):
        label = line[5]
      except IndexError:
        continue
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
@@ -458,6 +576,53 @@ class ColaProcessor(DataProcessor):
    return examples
class RteProcessor(DataProcessor):
"""Processor for the RTE data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
# All datasets are converted to 2-class split, where for 3-class datasets we
# collapse neutral and contradiction into not_entailment.
return ["entailment", "not_entailment"]
@staticmethod
def get_processor_name():
"""See base class."""
return "RTE"
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "entailment"
else:
label = tokenization.convert_to_unicode(line[3])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class SstProcessor(DataProcessor):
  """Processor for the SST-2 data set (GLUE version)."""
@@ -583,15 +748,16 @@ class TfdsProcessor(DataProcessor):
    is_regression: Whether the task is a regression problem (defaults to False).
  """

  def __init__(self,
               tfds_params,
               process_text_fn=tokenization.convert_to_unicode):
    super(TfdsProcessor, self).__init__(process_text_fn)
    self._process_tfds_params_str(tfds_params)
    if self.module_import:
      importlib.import_module(self.module_import)

    self.dataset, info = tfds.load(
        self.dataset_name, data_dir=self.data_dir, with_info=True)
    if self.is_regression:
      self._labels = None
    else:
@@ -660,8 +826,57 @@ class TfdsProcessor(DataProcessor):
      if self.weight_key:
        weight = float(example[self.weight_key])
      examples.append(
          InputExample(
              guid=guid,
              text_a=text_a,
              text_b=text_b,
              label=label,
              weight=weight))
    return examples
class WnliProcessor(DataProcessor):
"""Processor for the WNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
@staticmethod
def get_processor_name():
"""See base class."""
return "WNLI"
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[3])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
@@ -748,8 +963,9 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
    logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
    logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
    logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
    logging.info("label: %s (id = %s)", example.label, str(label_id))
    logging.info("weight: %s", example.weight)
    logging.info("int_iden: %s", str(example.int_iden))

  feature = InputFeatures(
      input_ids=input_ids,
@@ -757,13 +973,18 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
      segment_ids=segment_ids,
      label_id=label_id,
      is_real_example=True,
      weight=example.weight,
      int_iden=example.int_iden)

  return feature


def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenizer,
                                            output_file,
                                            label_type=None):
  """Convert a set of `InputExample`s to a TFRecord file."""

  tf.io.gfile.makedirs(os.path.dirname(output_file))
@@ -779,6 +1000,7 @@ def file_based_convert_examples_to_features(examples, label_list,
    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    def create_float_feature(values):
      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
      return f
@@ -789,12 +1011,14 @@ def file_based_convert_examples_to_features(examples, label_list,
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    if label_type is not None and label_type == float:
      features["label_ids"] = create_float_feature([feature.label_id])
    elif feature.label_id is not None:
      features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])
    if feature.weight is not None:
      features["weight"] = create_float_feature([feature.weight])
    if feature.int_iden is not None:
      features["int_iden"] = create_int_feature([feature.int_iden])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
@@ -830,8 +1054,7 @@ def generate_tf_record_from_data_file(processor,
  Arguments:
    processor: Input processor object to be used for generating data. Subclass
      of `DataProcessor`.
    data_dir: Directory that contains train/eval/test data to process.
    tokenizer: The tokenizer to be applied on the data.
    train_data_output_path: Output to which processed tf record for training
      will be saved.
@@ -857,8 +1080,7 @@ def generate_tf_record_from_data_file(processor,
  train_input_data_examples = processor.get_train_examples(data_dir)
  file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                          max_seq_length, tokenizer,
                                          train_data_output_path, label_type)
  num_training_data = len(train_input_data_examples)

  if eval_data_output_path:
@@ -868,26 +1090,27 @@ def generate_tf_record_from_data_file(processor,
                                            tokenizer, eval_data_output_path,
                                            label_type)

  meta_data = {
      "processor_type": processor.get_processor_name(),
      "train_data_size": num_training_data,
      "max_seq_length": max_seq_length,
  }

  if test_data_output_path:
    test_input_data_examples = processor.get_test_examples(data_dir)
    if isinstance(test_input_data_examples, dict):
      for language, examples in test_input_data_examples.items():
        file_based_convert_examples_to_features(
            examples, label_list, max_seq_length, tokenizer,
            test_data_output_path.format(language), label_type)
        meta_data["test_{}_data_size".format(language)] = len(examples)
    else:
      file_based_convert_examples_to_features(test_input_data_examples,
                                              label_list, max_seq_length,
                                              tokenizer, test_data_output_path,
                                              label_type)
      meta_data["test_data_size"] = len(test_input_data_examples)

  if is_regression:
    meta_data["task_type"] = "bert_regression"
    meta_data["label_type"] = {int: "int", float: "float"}[label_type]
@@ -900,12 +1123,4 @@ def generate_tf_record_from_data_file(processor,
  if eval_data_output_path:
    meta_data["eval_data_size"] = len(eval_input_data_examples)

  return meta_data
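
# Illustrative sketch (not part of this change): with meta_data now built before
# the test split is processed, per-language sizes are recorded while the files
# are written. For an XTREME-style processor the returned dictionary has roughly
# this shape (all sizes below are placeholders, not real values):
example_meta_data_shape = {
    "processor_type": "XTREME-XNLI",
    "train_data_size": 10000,
    "max_seq_length": 128,
    "eval_data_size": 1000,
    "test_ar_data_size": 1000,  # one "test_<lang>_data_size" key per language
    "test_zh_data_size": 1000,
}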
@@ -27,18 +27,21 @@ from absl import flags
import tensorflow as tf

from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
from official.nlp.data import sentence_retrieval_lib
# word-piece tokenizer based squad_lib
from official.nlp.data import squad_lib as squad_lib_wp
# sentence-piece tokenizer based squad_lib
from official.nlp.data import squad_lib_sp
from official.nlp.data import tagging_data_lib

FLAGS = flags.FLAGS

# TODO(chendouble): consider moving each task to its own binary.
flags.DEFINE_enum(
    "fine_tuning_task_type", "classification",
    ["classification", "regression", "squad", "retrieval", "tagging"],
    "The name of the BERT fine tuning task for which data "
    "will be generated.")

# BERT classification specific flags.
flags.DEFINE_string(
@@ -48,8 +51,12 @@ flags.DEFINE_string(
flags.DEFINE_enum("classification_task_name", "MNLI",
                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI",
                   "PAWS-X", "XTREME-XNLI", "XTREME-PAWS-X"],
                  "The name of the task to train BERT classifier. The "
                  "difference between XTREME-XNLI and XNLI is: 1. the format "
                  "of input tsv files; 2. the dev set for XTREME is English "
                  "only and for XNLI is all languages combined. Same for "
                  "PAWS-X.")

# XNLI task specific flag.
flags.DEFINE_string(
@@ -63,6 +70,14 @@ flags.DEFINE_string(
    "Language of training data for PAWS-X task. If the value is 'all', the "
    "data of all languages will be used for training.")

# Retrieval task specific flags.
flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
                  "The name of the sentence retrieval task for scoring.")

# Tagging task specific flags.
flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
                  "The name of the BERT tagging (token classification) task.")

# BERT Squad task specific flags.
flags.DEFINE_string(
    "squad_data_file", None,
@@ -169,6 +184,7 @@ def generate_classifier_dataset():
      "qnli":
          classifier_data_lib.QnliProcessor,
      "qqp": classifier_data_lib.QqpProcessor,
      "rte": classifier_data_lib.RteProcessor,
      "sst-2":
          classifier_data_lib.SstProcessor,
      "xnli":
@@ -176,7 +192,12 @@ def generate_classifier_dataset():
              language=FLAGS.xnli_language),
      "paws-x":
          functools.partial(classifier_data_lib.PawsxProcessor,
                            language=FLAGS.pawsx_language),
      "wnli": classifier_data_lib.WnliProcessor,
      "xtreme-xnli":
          functools.partial(classifier_data_lib.XtremeXnliProcessor),
      "xtreme-paws-x":
          functools.partial(classifier_data_lib.XtremePawsxProcessor)
  }
  task_name = FLAGS.classification_task_name.lower()
  if task_name not in processors:
@@ -237,6 +258,67 @@ def generate_squad_dataset():
      FLAGS.max_query_length, FLAGS.doc_stride, FLAGS.version_2_with_negative)
def generate_retrieval_dataset():
  """Generates retrieval test and dev datasets and returns input meta data."""
assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
if FLAGS.tokenizer_impl == "word_piece":
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
processor_text_fn = tokenization.convert_to_unicode
else:
assert FLAGS.tokenizer_impl == "sentence_piece"
tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
processor_text_fn = functools.partial(
tokenization.preprocess_text, lower=FLAGS.do_lower_case)
processors = {
"bucc": sentence_retrieval_lib.BuccProcessor,
"tatoeba": sentence_retrieval_lib.TatoebaProcessor,
}
task_name = FLAGS.retrieval_task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % task_name)
processor = processors[task_name](process_text_fn=processor_text_fn)
return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
processor,
FLAGS.input_data_dir,
tokenizer,
FLAGS.eval_data_output_path,
FLAGS.test_data_output_path,
FLAGS.max_seq_length)
def generate_tagging_dataset():
"""Generates tagging dataset."""
processors = {
"panx": tagging_data_lib.PanxProcessor,
"udpos": tagging_data_lib.UdposProcessor,
}
task_name = FLAGS.tagging_task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % task_name)
if FLAGS.tokenizer_impl == "word_piece":
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
processor_text_fn = tokenization.convert_to_unicode
elif FLAGS.tokenizer_impl == "sentence_piece":
tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
processor_text_fn = functools.partial(
tokenization.preprocess_text, lower=FLAGS.do_lower_case)
else:
raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)
processor = processors[task_name]()
return tagging_data_lib.generate_tf_record_from_data_file(
processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
FLAGS.test_data_output_path, processor_text_fn)
def main(_):
  if FLAGS.tokenizer_impl == "word_piece":
    if not FLAGS.vocab_file:
@@ -248,12 +330,20 @@ def main(_):
      raise ValueError(
          "FLAG sp_model_file for sentence-piece tokenizer is not specified.")

  if FLAGS.fine_tuning_task_type != "retrieval":
    flags.mark_flag_as_required("train_data_output_path")

  if FLAGS.fine_tuning_task_type == "classification":
    input_meta_data = generate_classifier_dataset()
  elif FLAGS.fine_tuning_task_type == "regression":
    input_meta_data = generate_regression_dataset()
  elif FLAGS.fine_tuning_task_type == "retrieval":
    input_meta_data = generate_retrieval_dataset()
  elif FLAGS.fine_tuning_task_type == "squad":
    input_meta_data = generate_squad_dataset()
  else:
    assert FLAGS.fine_tuning_task_type == "tagging"
    input_meta_data = generate_tagging_dataset()

  tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
  with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
@@ -261,6 +351,5 @@ def main(_):

if __name__ == "__main__":
  flags.mark_flag_as_required("meta_data_file_path")
  app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT library to process data for cross lingual sentence retrieval task."""
import os
from absl import logging
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
class BuccProcessor(classifier_data_lib.DataProcessor):
  """Processor for the XTREME BUCC data set."""
supported_languages = ["de", "fr", "ru", "zh"]
def __init__(self,
process_text_fn=tokenization.convert_to_unicode):
super(BuccProcessor, self).__init__(process_text_fn)
self.languages = BuccProcessor.supported_languages
def get_dev_examples(self, data_dir, file_pattern):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, file_pattern.format("dev"))),
"sample")
def get_test_examples(self, data_dir, file_pattern):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, file_pattern.format("test"))),
"test")
@staticmethod
def get_processor_name():
"""See base class."""
return "BUCC"
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
int_iden = int(line[0].split("-")[1])
text_a = self.process_text_fn(line[1])
examples.append(
classifier_data_lib.InputExample(
guid=guid, text_a=text_a, int_iden=int_iden))
return examples
class TatoebaProcessor(classifier_data_lib.DataProcessor):
  """Processor for the XTREME Tatoeba data set."""
supported_languages = [
"af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
"he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
"nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
]
def __init__(self,
process_text_fn=tokenization.convert_to_unicode):
super(TatoebaProcessor, self).__init__(process_text_fn)
self.languages = TatoebaProcessor.supported_languages
def get_test_examples(self, data_dir, file_path):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, file_path)), "test")
@staticmethod
def get_processor_name():
"""See base class."""
return "TATOEBA"
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line[0])
examples.append(
classifier_data_lib.InputExample(
guid=guid, text_a=text_a, int_iden=i))
return examples
def generate_sentence_retrevial_tf_record(processor,
data_dir,
tokenizer,
eval_data_output_path=None,
test_data_output_path=None,
max_seq_length=128):
"""Generates the tf records for retrieval tasks.
Args:
processor: Input processor object to be used for generating data. Subclass
of `DataProcessor`.
data_dir: Directory that contains train/eval data to process. Data files
should be in the format expected by the processor.
tokenizer: The tokenizer to be applied on the data.
eval_data_output_path: Output to which processed tf record for evaluation
will be saved.
test_data_output_path: Output to which processed tf record for testing
will be saved. Must be a pattern template with {} if processor has
language specific test data.
max_seq_length: Maximum sequence length of the training/eval data to be
generated.
Returns:
A dictionary containing input meta data.
"""
assert eval_data_output_path or test_data_output_path
if processor.get_processor_name() == "BUCC":
path_pattern = "{}-en.{{}}.{}"
if processor.get_processor_name() == "TATOEBA":
path_pattern = "{}-en.{}"
meta_data = {
"processor_type": processor.get_processor_name(),
"max_seq_length": max_seq_length,
"number_eval_data": {},
"number_test_data": {},
}
logging.info("Start to process %s task data", processor.get_processor_name())
for lang_a in processor.languages:
for lang_b in [lang_a, "en"]:
if eval_data_output_path:
eval_input_data_examples = processor.get_dev_examples(
data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
num_eval_data = len(eval_input_data_examples)
logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
lang_a, lang_b)
output_file = os.path.join(
eval_data_output_path,
"{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
classifier_data_lib.file_based_convert_examples_to_features(
eval_input_data_examples, None, max_seq_length, tokenizer,
output_file, None)
meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data
if test_data_output_path:
test_input_data_examples = processor.get_test_examples(
data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
num_test_data = len(test_input_data_examples)
logging.info("Processing %d test examples of %s-en.%s", num_test_data,
lang_a, lang_b)
output_file = os.path.join(
test_data_output_path,
"{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
classifier_data_lib.file_based_convert_examples_to_features(
test_input_data_examples, None, max_seq_length, tokenizer,
output_file, None)
meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data
return meta_data
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os
from absl import logging
import tensorflow as tf
from official.nlp.data import classifier_data_lib
# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1
# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"
class InputExample(object):
"""A single training/test example for token classification."""
def __init__(self, sentence_id, words=None, label_ids=None):
"""Constructs an InputExample."""
self.sentence_id = sentence_id
self.words = words if words else []
self.label_ids = label_ids if label_ids else []
def add_word_and_label_id(self, word, label_id):
"""Adds word and label_id pair in the example."""
self.words.append(word)
self.label_ids.append(label_id)
def _read_one_file(file_name, label_list):
"""Reads one file and returns a list of `InputExample` instances."""
lines = tf.io.gfile.GFile(file_name, "r").readlines()
examples = []
label_id_map = {label: i for i, label in enumerate(label_list)}
sentence_id = 0
example = InputExample(sentence_id=0)
for line in lines:
line = line.strip("\n")
if line:
# The format is: <token>\t<label> for train/dev set and <token> for test.
items = line.split("\t")
assert len(items) == 2 or len(items) == 1
token = items[0].strip()
# Assign a dummy label_id for test set
label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
example.add_word_and_label_id(token, label_id)
else:
# Empty line indicates a new sentence.
if example.words:
examples.append(example)
sentence_id += 1
example = InputExample(sentence_id=sentence_id)
if example.words:
examples.append(example)
return examples
class PanxProcessor(classifier_data_lib.DataProcessor):
"""Processor for the Panx data set."""
supported_languages = [
"ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
"nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
"es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
"tr", "et", "fi", "hu"
]
def get_train_examples(self, data_dir):
return _read_one_file(
os.path.join(data_dir, "train-en.tsv"), self.get_labels())
def get_dev_examples(self, data_dir):
return _read_one_file(
os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
def get_test_examples(self, data_dir):
examples_dict = {}
for language in self.supported_languages:
examples_dict[language] = _read_one_file(
os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
return examples_dict
def get_labels(self):
return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
@staticmethod
def get_processor_name():
return "panx"
class UdposProcessor(classifier_data_lib.DataProcessor):
"""Processor for the Udpos data set."""
supported_languages = [
"af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
"he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
"ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
]
def get_train_examples(self, data_dir):
return _read_one_file(
os.path.join(data_dir, "train-en.tsv"), self.get_labels())
def get_dev_examples(self, data_dir):
return _read_one_file(
os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
def get_test_examples(self, data_dir):
examples_dict = {}
for language in self.supported_languages:
examples_dict[language] = _read_one_file(
os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
return examples_dict
def get_labels(self):
return [
"ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
"PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
]
@staticmethod
def get_processor_name():
return "udpos"
def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
"""Tokenizes words and breaks long example into short ones."""
# Needs additional [CLS] and [SEP] tokens.
max_length = max_length - 2
new_examples = []
new_example = InputExample(sentence_id=example.sentence_id)
for i, word in enumerate(example.words):
if any([x < 0 for x in example.label_ids]):
raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
if text_preprocessing:
word = text_preprocessing(word)
subwords = tokenizer.tokenize(word)
if (not subwords or len(subwords) > max_length) and word:
subwords = [_UNK_TOKEN]
if len(subwords) + len(new_example.words) > max_length:
# Start a new example.
new_examples.append(new_example)
new_example = InputExample(sentence_id=example.sentence_id)
for j, subword in enumerate(subwords):
# Use the real label for the first subword, and the padding label for the
# remaining subwords.
subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
new_example.add_word_and_label_id(subword, subword_label)
if new_example.words:
new_examples.append(new_example)
return new_examples
def _convert_single_example(example, max_seq_length, tokenizer):
"""Converts an `InputExample` instance to a `tf.train.Example` instance."""
tokens = ["[CLS]"]
tokens.extend(example.words)
tokens.append("[SEP]")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
label_ids = [_PADDING_LABEL_ID]
label_ids.extend(example.label_ids)
label_ids.append(_PADDING_LABEL_ID)
segment_ids = [0] * len(input_ids)
input_mask = [1] * len(input_ids)
# Pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
label_ids.append(_PADDING_LABEL_ID)
def create_int_feature(values):
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["label_ids"] = create_int_feature(label_ids)
features["sentence_id"] = create_int_feature([example.sentence_id])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
return tf_example
def write_example_to_file(examples,
tokenizer,
max_seq_length,
output_file,
text_preprocessing=None):
"""Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.
Note that the words inside each example will be tokenized and be applied by
`text_preprocessing` if available. Also, if the length of sentence (plus
special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
will be broken into multiple short examples. For example:
Example (text_preprocessing=lowercase, max_seq_length=5)
words: ["What", "a", "great", "weekend"]
labels: [ 7, 5, 9, 10]
sentence_id: 0
preprocessed: ["what", "a", "great", "weekend"]
tokenized: ["what", "a", "great", "week", "##end"]
will result in two tf.example protos:
tokens: ["[CLS]", "what", "a", "great", "[SEP]"]
label_ids: [-1, 7, 5, 9, -1]
input_mask: [ 1, 1, 1, 1, 1]
segment_ids: [ 0, 0, 0, 0, 0]
input_ids: [ tokenizer.convert_tokens_to_ids(tokens) ]
sentence_id: 0
tokens: ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
label_ids: [-1, 10, -1, -1, -1]
input_mask: [ 1, 1, 1, 0, 0]
segment_ids: [ 0, 0, 0, 0, 0]
input_ids: [ tokenizer.convert_tokens_to_ids(tokens) ]
sentence_id: 0
Note the use of -1 in `label_ids` to indicate that a token should not be
considered for classification (e.g., trailing ## wordpieces or special
token). Token classification models should accordingly ignore these when
calculating loss, metrics, etc...
Args:
examples: A list of `InputExample` instances.
tokenizer: The tokenizer to be applied on the data.
max_seq_length: Maximum length of generated sequences.
output_file: The name of the output tfrecord file.
text_preprocessing: optional preprocessing run on each word prior to
tokenization.
Returns:
The total number of tf.train.Example proto written to file.
"""
tf.io.gfile.makedirs(os.path.dirname(output_file))
writer = tf.io.TFRecordWriter(output_file)
num_tokenized_examples = 0
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logging.info("Writing example %d of %d to %s", ex_index, len(examples),
output_file)
tokenized_examples = _tokenize_example(example, max_seq_length,
tokenizer, text_preprocessing)
num_tokenized_examples += len(tokenized_examples)
for per_tokenized_example in tokenized_examples:
tf_example = _convert_single_example(
per_tokenized_example, max_seq_length, tokenizer)
writer.write(tf_example.SerializeToString())
writer.close()
return num_tokenized_examples
def token_classification_meta_data(train_data_size,
max_seq_length,
num_labels,
eval_data_size=None,
test_data_size=None,
label_list=None,
processor_type=None):
"""Creates metadata for tagging (token classification) datasets."""
meta_data = {
"train_data_size": train_data_size,
"max_seq_length": max_seq_length,
"num_labels": num_labels,
"task_type": "tagging",
"label_type": "int",
"label_shape": [max_seq_length],
}
if eval_data_size:
meta_data["eval_data_size"] = eval_data_size
if test_data_size:
meta_data["test_data_size"] = test_data_size
if label_list:
meta_data["label_list"] = label_list
if processor_type:
meta_data["processor_type"] = processor_type
return meta_data
def generate_tf_record_from_data_file(processor,
data_dir,
tokenizer,
max_seq_length,
train_data_output_path,
eval_data_output_path,
test_data_output_path,
text_preprocessing):
"""Generates tfrecord files from the raw data."""
common_kwargs = dict(tokenizer=tokenizer, max_seq_length=max_seq_length,
text_preprocessing=text_preprocessing)
train_examples = processor.get_train_examples(data_dir)
train_data_size = write_example_to_file(
train_examples, output_file=train_data_output_path, **common_kwargs)
eval_examples = processor.get_dev_examples(data_dir)
eval_data_size = write_example_to_file(
eval_examples, output_file=eval_data_output_path, **common_kwargs)
test_input_data_examples = processor.get_test_examples(data_dir)
test_data_size = {}
for language, examples in test_input_data_examples.items():
test_data_size[language] = write_example_to_file(
examples,
output_file=test_data_output_path.format(language),
**common_kwargs)
labels = processor.get_labels()
meta_data = token_classification_meta_data(
train_data_size,
max_seq_length,
len(labels),
eval_data_size,
test_data_size,
label_list=labels,
processor_type=processor.get_processor_name())
return meta_data
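
# Illustrative usage sketch (not part of this change); the import, paths and
# vocab file exist only for the example. test_data_output_path needs a "{}"
# placeholder because the processors return per-language test splits.
from official.nlp.bert import tokenization

example_tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)
example_meta_data = generate_tf_record_from_data_file(
    processor=PanxProcessor(),
    data_dir="/data/xtreme/panx",
    tokenizer=example_tokenizer,
    max_seq_length=128,
    train_data_output_path="/out/panx_train.tf_record",
    eval_data_output_path="/out/panx_eval.tf_record",
    test_data_output_path="/out/panx_test_{}.tf_record",
    text_preprocessing=tokenization.convert_to_unicode)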
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional
import tensorflow as tf
from official.core import input_reader
class TaggingDataLoader:
"""A class to load dataset for tagging (e.g., NER and POS) task."""
def __init__(self, params):
self._params = params
self._seq_length = params.seq_length
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
}
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def _parse(self, record: Mapping[str, tf.Tensor]):
"""Parses raw tensors into a dict of tensors to be consumed by the model."""
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
}
y = record['label_ids']
return (x, y)
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
reader = input_reader.InputReader(
params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
return reader.read(input_context)
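
# Illustrative sketch (not part of this change): each serialized tf.Example this
# loader consumes carries four int64 features of length seq_length, matching
# what tagging_data_lib writes; _parse() then yields
# ({'input_word_ids', 'input_mask', 'input_type_ids'}, label_ids) pairs.
_example_names = ('input_ids', 'input_mask', 'segment_ids', 'label_ids')
_example_seq_length = 8  # arbitrary example value
_example_record = tf.train.Example(
    features=tf.train.Features(
        feature={
            name: tf.train.Feature(
                int64_list=tf.train.Int64List(value=[0] * _example_seq_length))
            for name in _example_names
        })).SerializeToString()
_example_parsed = tf.io.parse_single_example(
    _example_record,
    {name: tf.io.FixedLenFeature([_example_seq_length], tf.int64)
     for name in _example_names})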
@@ -9,13 +9,17 @@ assemble new layers, networks, or models.
  initialization parameters.

* [MultiHeadAttention](attention.py) implements an optionally masked attention
  between query, key, value tensors as described in
  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If
  `from_tensor` and `to_tensor` are the same, then this is self-attention.

* [CachedAttention](attention.py) implements an attention layer with cache
  used for auto-regressive decoding.

* [MultiChannelAttention](multi_channel_attention.py) implements a variant of
  multi-head attention which can be used to merge multiple streams for
  cross-attentions.

* [TalkingHeadsAttention](talking_heads_attention.py) implements the talking
  heads attention, as described in
  ["Talking-Heads Attention"](https://arxiv.org/abs/2003.02436).
@@ -24,6 +28,10 @@ assemble new layers, networks, or models.
  described in
  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).

* [TransformerDecoderLayer](transformer.py) TransformerDecoderLayer is made up
  of self multi-head attention, cross multi-head attention, and a feedforward
  network.

* [ReZeroTransformer](rezero_transformer.py) implements Transformer with
  ReZero described in
  ["ReZero is All You Need: Fast Convergence at Large Depth"](https://arxiv.org/abs/2003.04887).
@@ -45,6 +53,9 @@ assemble new layers, networks, or models.
  should be masked), the output will have masked positions set to
  approximately zero.

* [`MaskedLM`](masked_lm.py) implements a masked language model. It assumes
  the embedding table variable is passed to it.

* [ClassificationHead](cls_head.py) A pooling head over a sequence of
  embeddings, commonly used by classification tasks.
...
@@ -18,11 +18,13 @@ from official.nlp.modeling.layers.attention import *
from official.nlp.modeling.layers.cls_head import *
from official.nlp.modeling.layers.dense_einsum import DenseEinsum
from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
from official.nlp.modeling.layers.masked_lm import MaskedLM
from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
from official.nlp.modeling.layers.transformer import *
from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
...@@ -25,91 +25,74 @@ from official.modeling import tf_utils ...@@ -25,91 +25,74 @@ from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text') @tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.Model): class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling. """Masked language model network head for BERT modeling.
This network implements a masked language model based on the provided network. This network implements a masked language model based on the provided network.
It assumes that the network being passed has a "get_embedding_table()" method. It assumes that the network being passed has a "get_embedding_table()" method.
Arguments: Arguments:
input_width: The innermost dimension of the input tensor to this network. embedding_table: The embedding table of the targets.
num_predictions: The number of predictions to make per sequence. activation: The activation, if any, for the dense layer.
source_network: The network with the embedding layer to use for the initializer: The initializer for the dense layer. Defaults to a Glorot
embedding layer. uniform initializer.
embedding_table: The embedding table of a source network. If None, the
`source_network.get_embedding_table()` method is used.
activation: The activation, if any, for the dense layer in this network.
initializer: The initializer for the dense layer in this network. Defaults to
a Glorot uniform initializer.
output: The output style for this network. Can be either 'logits' or output: The output style for this network. Can be either 'logits' or
'predictions'. 'predictions'.
""" """
def __init__(self, def __init__(self,
input_width, embedding_table,
num_predictions,
source_network,
embedding_table=None,
activation=None, activation=None,
initializer='glorot_uniform', initializer='glorot_uniform',
output='logits', output='logits',
name='cls/predictions',
**kwargs): **kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if embedding_table is None: if output not in ('predictions', 'logits'):
embedding_table = source_network.get_embedding_table()
vocab_size, hidden_size = embedding_table.shape
sequence_data = tf.keras.layers.Input(
shape=(None, input_width), name='sequence_data', dtype=tf.float32)
masked_lm_positions = tf.keras.layers.Input(
shape=(num_predictions,), name='masked_lm_positions', dtype=tf.int32)
masked_lm_input = tf.keras.layers.Lambda(
lambda x: self._gather_indexes(x[0], x[1]))(
[sequence_data, masked_lm_positions])
lm_data = (
tf.keras.layers.Dense(
hidden_size,
activation=activation,
kernel_initializer=initializer,
name='cls/predictions/transform/dense')(masked_lm_input))
lm_data = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='cls/predictions/transform/LayerNorm')(
lm_data)
lm_data = tf.keras.layers.Lambda(
lambda x: tf.matmul(x, embedding_table, transpose_b=True))(
lm_data)
logits = Bias(
initializer=tf.keras.initializers.Zeros(),
name='cls/predictions/output_bias')(
lm_data)
# We can't use the standard Keras reshape layer here, since it expects
# the input and output batch size to be the same.
reshape_layer = tf.keras.layers.Lambda(
lambda x: tf.reshape(x, [-1, num_predictions, vocab_size]))
self.logits = reshape_layer(logits)
predictions = tf.keras.layers.Activation(tf.nn.log_softmax)(self.logits)
if output == 'logits':
output_tensors = self.logits
elif output == 'predictions':
output_tensors = predictions
else:
raise ValueError( raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or ' ('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output) '"predictions"') % output)
self._output_type = output
super(MaskedLM, self).__init__( def build(self, input_shape):
inputs=[sequence_data, masked_lm_positions], self._vocab_size, hidden_size = self.embedding_table.shape
outputs=output_tensors, self.dense = tf.keras.layers.Dense(
**kwargs) hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_shape = tf_utils.get_shape_list(
masked_positions, name='masked_positions_tensor')
logits = tf.reshape(logits,
[-1, masked_positions_shape[1], self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self): def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized at this ' raise NotImplementedError('MaskedLM cannot be directly serialized because '
'time. Please use it only in Layers or ' 'it has variable sharing logic.')
'functionally subclassed Models/Networks.')
def _gather_indexes(self, sequence_tensor, positions): def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions. """Gathers the vectors at the specific positions.
...@@ -139,51 +122,3 @@ class MaskedLM(tf.keras.Model): ...@@ -139,51 +122,3 @@ class MaskedLM(tf.keras.Model):
output_tensor = tf.gather(flat_sequence_tensor, flat_positions) output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor return output_tensor
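The `_gather_indexes` helper's body is mostly elided in this hunk, but its final lines show a single `tf.gather` over flattened tensors. Below is a hedged, self-contained illustration of that flatten-and-gather technique; the offset computation is a reconstruction for illustration, not necessarily the exact elided code.

```python
import tensorflow as tf

# Toy inputs: [batch, seq, width] activations and [batch, num_predictions]
# positions to pull out.
batch_size, seq_length, width = 2, 5, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * width, dtype=tf.float32),
    [batch_size, seq_length, width])
positions = tf.constant([[0, 3], [1, 4]], dtype=tf.int32)

# Offset each example's positions into a flattened [batch*seq, width] view,
# then gather once.
flat_offsets = tf.reshape(
    tf.range(batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(
    sequence_tensor, [batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

print(output_tensor.shape)  # (4, 3): one row per selected position.
```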
@tf.keras.utils.register_keras_serializable(package='Text')
# Temporary until we can create a Dense layer that ties the embedding.
class Bias(tf.keras.layers.Layer):
"""Adds a bias term to an input."""
def __init__(self,
initializer='zeros',
regularizer=None,
constraint=None,
activation=None,
**kwargs):
super(Bias, self).__init__(**kwargs)
self._initializer = tf.keras.initializers.get(initializer)
self._regularizer = tf.keras.regularizers.get(regularizer)
self._constraint = tf.keras.constraints.get(constraint)
self._activation = tf.keras.activations.get(activation)
def build(self, input_shape):
input_shape = tf.TensorShape(input_shape)
self._bias = self.add_weight(
'bias',
shape=input_shape[1:],
initializer=self._initializer,
regularizer=self._regularizer,
constraint=self._constraint,
dtype=self._dtype,
trainable=True)
super(Bias, self).build(input_shape)
def get_config(self):
config = {
'activation': tf.keras.activations.serialize(self._activation),
'initializer': tf.keras.initializers.serialize(self._initializer),
'regularizer': tf.keras.regularizers.serialize(self._regularizer),
'constraint': tf.keras.constraints.serialize(self._constraint)
}
base_config = super(Bias, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
outputs = tf.nn.bias_add(inputs, self._bias)
if self._activation is not None:
return self._activation(outputs) # pylint: disable=not-callable
else:
return outputs
...@@ -23,7 +23,7 @@ import tensorflow as tf ...@@ -23,7 +23,7 @@ import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import masked_lm from official.nlp.modeling.layers import masked_lm
from official.nlp.modeling.networks import transformer_encoder from official.nlp.modeling.networks import transformer_encoder
...@@ -32,13 +32,12 @@ from official.nlp.modeling.networks import transformer_encoder ...@@ -32,13 +32,12 @@ from official.nlp.modeling.networks import transformer_encoder
@keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes
class MaskedLMTest(keras_parameterized.TestCase): class MaskedLMTest(keras_parameterized.TestCase):
def create_network(self, def create_layer(self,
vocab_size, vocab_size,
sequence_length, sequence_length,
hidden_size, hidden_size,
num_predictions, output='predictions',
output='predictions', xformer_stack=None):
xformer_stack=None):
# First, create a transformer stack that we can use to get the LM's # First, create a transformer stack that we can use to get the LM's
# vocabulary weight. # vocabulary weight.
if xformer_stack is None: if xformer_stack is None:
...@@ -49,82 +48,32 @@ class MaskedLMTest(keras_parameterized.TestCase): ...@@ -49,82 +48,32 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=4, num_attention_heads=4,
) )
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])
# Create a maskedLM from the transformer stack. # Create a maskedLM from the transformer stack.
test_network = masked_lm.MaskedLM( test_layer = masked_lm.MaskedLM(
num_predictions=num_predictions, embedding_table=xformer_stack.get_embedding_table(),
input_width=lm_outputs.shape[-1],
source_network=xformer_stack,
output=output) output=output)
return test_network return test_layer
def test_network_creation(self): def test_layer_creation(self):
vocab_size = 100 vocab_size = 100
sequence_length = 32 sequence_length = 32
hidden_size = 64 hidden_size = 64
num_predictions = 21 num_predictions = 21
test_network = self.create_network( test_layer = self.create_layer(
vocab_size=vocab_size, vocab_size=vocab_size,
sequence_length=sequence_length, sequence_length=sequence_length,
hidden_size=hidden_size, hidden_size=hidden_size)
num_predictions=num_predictions)
# Make sure that the output tensor of the masked LM is the right shape. # Make sure that the output tensor of the masked LM is the right shape.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input( masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
shape=(num_predictions,), dtype=tf.int32) output = test_layer(lm_input_tensor, masked_positions=masked_positions)
output = test_network([lm_input_tensor, masked_lm_positions])
expected_output_shape = [None, num_predictions, vocab_size] expected_output_shape = [None, num_predictions, vocab_size]
self.assertEqual(expected_output_shape, output.shape.as_list()) self.assertEqual(expected_output_shape, output.shape.as_list())
def test_network_invocation_with_internal_logits(self): def test_layer_invocation_with_external_logits(self):
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
test_network = self.create_network(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions])
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
logits_model = tf.keras.Model(test_network.inputs, test_network.logits)
# Invoke the masked LM on some fake data to make sure there are no runtime
# errors in the code.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
outputs = model.predict([lm_input_data, masked_position_data])
logits = logits_model.predict([lm_input_data, masked_position_data])
# Ensure that the tensor shapes are correct.
expected_output_shape = (batch_size, num_predictions, vocab_size)
self.assertEqual(expected_output_shape, outputs.shape)
self.assertEqual(expected_output_shape, logits.shape)
# Ensure that the logits, when softmaxed, create the outputs.
input_tensor = tf.keras.Input(expected_output_shape[1:])
output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
softmax_model = tf.keras.Model(input_tensor, output_tensor)
calculated_softmax = softmax_model.predict(logits)
self.assertAllClose(outputs, calculated_softmax)
def test_network_invocation_with_external_logits(self):
vocab_size = 100 vocab_size = 100
sequence_length = 32 sequence_length = 32
hidden_size = 64 hidden_size = 64
...@@ -136,31 +85,28 @@ class MaskedLMTest(keras_parameterized.TestCase): ...@@ -136,31 +85,28 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=4, num_attention_heads=4,
) )
test_network = self.create_network( test_layer = self.create_layer(
vocab_size=vocab_size, vocab_size=vocab_size,
sequence_length=sequence_length, sequence_length=sequence_length,
hidden_size=hidden_size, hidden_size=hidden_size,
num_predictions=num_predictions,
xformer_stack=xformer_stack, xformer_stack=xformer_stack,
output='predictions') output='predictions')
logit_network = self.create_network( logit_layer = self.create_layer(
vocab_size=vocab_size, vocab_size=vocab_size,
sequence_length=sequence_length, sequence_length=sequence_length,
hidden_size=hidden_size, hidden_size=hidden_size,
num_predictions=num_predictions,
xformer_stack=xformer_stack, xformer_stack=xformer_stack,
output='logits') output='logits')
logit_network.set_weights(test_network.get_weights())
# Create a model from the masked LM layer. # Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input( masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
shape=(num_predictions,), dtype=tf.int32) output = test_layer(lm_input_tensor, masked_positions)
output = test_network([lm_input_tensor, masked_lm_positions]) logit_output = logit_layer(lm_input_tensor, masked_positions)
logit_output = logit_network([lm_input_tensor, masked_lm_positions]) logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
logit_layer.set_weights(test_layer.get_weights())
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output) model = tf.keras.Model([lm_input_tensor, masked_positions], output)
logits_model = tf.keras.Model(([lm_input_tensor, masked_lm_positions]), logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
logit_output) logit_output)
# Invoke the masked LM on some fake data to make sure there are no runtime # Invoke the masked LM on some fake data to make sure there are no runtime
...@@ -169,40 +115,33 @@ class MaskedLMTest(keras_parameterized.TestCase): ...@@ -169,40 +115,33 @@ class MaskedLMTest(keras_parameterized.TestCase):
lm_input_data = 10 * np.random.random_sample( lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size)) (batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint( masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions)) sequence_length, size=(batch_size, num_predictions))
outputs = model.predict([lm_input_data, masked_position_data]) # ref_outputs = model.predict([lm_input_data, masked_position_data])
logits = logits_model.predict([lm_input_data, masked_position_data]) # outputs = logits_model.predict([lm_input_data, masked_position_data])
ref_outputs = model([lm_input_data, masked_position_data])
outputs = logits_model([lm_input_data, masked_position_data])
# Ensure that the tensor shapes are correct. # Ensure that the tensor shapes are correct.
expected_output_shape = (batch_size, num_predictions, vocab_size) expected_output_shape = (batch_size, num_predictions, vocab_size)
self.assertEqual(expected_output_shape, ref_outputs.shape)
self.assertEqual(expected_output_shape, outputs.shape) self.assertEqual(expected_output_shape, outputs.shape)
self.assertEqual(expected_output_shape, logits.shape) self.assertAllClose(ref_outputs, outputs)
# Ensure that the logits, when softmaxed, create the outputs. def test_layer_invocation(self):
input_tensor = tf.keras.Input(expected_output_shape[1:])
output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
softmax_model = tf.keras.Model(input_tensor, output_tensor)
calculated_softmax = softmax_model.predict(logits)
self.assertAllClose(outputs, calculated_softmax)
def test_network_invocation(self):
vocab_size = 100 vocab_size = 100
sequence_length = 32 sequence_length = 32
hidden_size = 64 hidden_size = 64
num_predictions = 21 num_predictions = 21
test_network = self.create_network( test_layer = self.create_layer(
vocab_size=vocab_size, vocab_size=vocab_size,
sequence_length=sequence_length, sequence_length=sequence_length,
hidden_size=hidden_size, hidden_size=hidden_size)
num_predictions=num_predictions)
# Create a model from the masked LM layer. # Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input( masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
shape=(num_predictions,), dtype=tf.int32) output = test_layer(lm_input_tensor, masked_positions)
output = test_network([lm_input_tensor, masked_lm_positions]) model = tf.keras.Model([lm_input_tensor, masked_positions], output)
model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
# Invoke the masked LM on some fake data to make sure there are no runtime # Invoke the masked LM on some fake data to make sure there are no runtime
# errors in the code. # errors in the code.
...@@ -215,12 +154,8 @@ class MaskedLMTest(keras_parameterized.TestCase): ...@@ -215,12 +154,8 @@ class MaskedLMTest(keras_parameterized.TestCase):
def test_unknown_output_type_fails(self): def test_unknown_output_type_fails(self):
with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'): with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
_ = self.create_network( _ = self.create_layer(
vocab_size=8, vocab_size=8, sequence_length=8, hidden_size=8, output='bad')
sequence_length=8,
hidden_size=8,
num_predictions=8,
output='bad')
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Multi-channel decoder.""" """Multi-channel Attention."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -24,11 +25,25 @@ import math ...@@ -24,11 +25,25 @@ import math
import tensorflow as tf import tensorflow as tf
from official.modeling import tf_utils from official.modeling import tf_utils
from official.nlp.modeling import layers from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import dense_einsum
from official.nlp.modeling.layers import masked_softmax
class DocAttention(tf.keras.layers.Layer):
"""Documents Attention layer."""
class VotingAttention(tf.keras.layers.Layer):
"""Voting Attention layer.
Arguments:
num_heads: the number of attention heads.
head_size: per-head hidden size.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
"""
def __init__(self, def __init__(self,
num_heads, num_heads,
...@@ -41,7 +56,7 @@ class DocAttention(tf.keras.layers.Layer): ...@@ -41,7 +56,7 @@ class DocAttention(tf.keras.layers.Layer):
kernel_constraint=None, kernel_constraint=None,
bias_constraint=None, bias_constraint=None,
**kwargs): **kwargs):
super(DocAttention, self).__init__(**kwargs) super(VotingAttention, self).__init__(**kwargs)
self._num_heads = num_heads self._num_heads = num_heads
self._head_size = head_size self._head_size = head_size
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
...@@ -52,7 +67,7 @@ class DocAttention(tf.keras.layers.Layer): ...@@ -52,7 +67,7 @@ class DocAttention(tf.keras.layers.Layer):
self._bias_constraint = tf.keras.constraints.get(bias_constraint) self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def build(self, unused_input_shapes): def build(self, unused_input_shapes):
self._query_dense = layers.DenseEinsum( self._query_dense = dense_einsum.DenseEinsum(
output_shape=(self._num_heads, self._head_size), output_shape=(self._num_heads, self._head_size),
kernel_initializer=self._kernel_initializer, kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer, bias_initializer=self._bias_initializer,
...@@ -63,7 +78,7 @@ class DocAttention(tf.keras.layers.Layer): ...@@ -63,7 +78,7 @@ class DocAttention(tf.keras.layers.Layer):
bias_constraint=self._bias_constraint, bias_constraint=self._bias_constraint,
dtype=self.dtype, dtype=self.dtype,
name="encdocatt_query") name="encdocatt_query")
self._key_dense = layers.DenseEinsum( self._key_dense = dense_einsum.DenseEinsum(
output_shape=(self._num_heads, self._head_size), output_shape=(self._num_heads, self._head_size),
kernel_initializer=self._kernel_initializer, kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer, bias_initializer=self._bias_initializer,
...@@ -74,7 +89,7 @@ class DocAttention(tf.keras.layers.Layer): ...@@ -74,7 +89,7 @@ class DocAttention(tf.keras.layers.Layer):
bias_constraint=self._bias_constraint, bias_constraint=self._bias_constraint,
dtype=self.dtype, dtype=self.dtype,
name="encdocatt_key") name="encdocatt_key")
super(DocAttention, self).build(unused_input_shapes) super(VotingAttention, self).build(unused_input_shapes)
def call(self, encoder_outputs, doc_attention_mask): def call(self, encoder_outputs, doc_attention_mask):
num_docs = tf_utils.get_shape_list(encoder_outputs, expected_rank=[4])[1] num_docs = tf_utils.get_shape_list(encoder_outputs, expected_rank=[4])[1]
...@@ -95,12 +110,16 @@ class DocAttention(tf.keras.layers.Layer): ...@@ -95,12 +110,16 @@ class DocAttention(tf.keras.layers.Layer):
return tf.nn.softmax(doc_attention_probs + infadder) return tf.nn.softmax(doc_attention_probs + infadder)
class MultiChannelAttention(layers.MultiHeadAttention): class MultiChannelAttention(attention.MultiHeadAttention):
"""Multi-channel Attention layer.""" """Multi-channel Attention layer.
Introduced in: https://arxiv.org/abs/2001.09386. Expects multiple
cross-attention target sequences.
"""
def build(self, input_shape): def _build_attention(self, qkv_rank):
super(MultiChannelAttention, self).build(input_shape) super(MultiChannelAttention, self)._build_attention(qkv_rank)
self._masked_softmax = layers.MaskedSoftmax(mask_expansion_axes=[2]) self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])
def call(self, inputs, attention_mask=None): def call(self, inputs, attention_mask=None):
from_tensor = inputs[0] from_tensor = inputs[0]
......
...@@ -22,14 +22,15 @@ from __future__ import print_function ...@@ -22,14 +22,15 @@ from __future__ import print_function
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from official.nlp.nhnet import multi_channel_attention from official.nlp.modeling.layers import multi_channel_attention
class MultiChannelAttentionTest(tf.test.TestCase): class MultiChannelAttentionTest(tf.test.TestCase):
def test_doc_attention(self): def test_doc_attention(self):
num_heads = 2 num_heads = 2
doc_attention = multi_channel_attention.DocAttention(num_heads, head_size=8) doc_attention = multi_channel_attention.VotingAttention(
num_heads, head_size=8)
num_docs = 3 num_docs = 3
inputs = np.zeros((2, num_docs, 10, 16), dtype=np.float32) inputs = np.zeros((2, num_docs, 10, 16), dtype=np.float32)
doc_mask = np.zeros((2, num_docs), dtype=np.float32) doc_mask = np.zeros((2, num_docs), dtype=np.float32)
......
...@@ -24,6 +24,7 @@ import tensorflow as tf ...@@ -24,6 +24,7 @@ import tensorflow as tf
from official.nlp.modeling.layers import attention from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import dense_einsum from official.nlp.modeling.layers import dense_einsum
from official.nlp.modeling.layers import multi_channel_attention
from official.nlp.modeling.layers.util import tf_function_if_eager from official.nlp.modeling.layers.util import tf_function_if_eager
...@@ -78,6 +79,7 @@ class Transformer(tf.keras.layers.Layer): ...@@ -78,6 +79,7 @@ class Transformer(tf.keras.layers.Layer):
self._bias_initializer = tf.keras.initializers.get(bias_initializer) self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint) self._bias_constraint = tf.keras.constraints.get(bias_constraint)
...@@ -236,3 +238,200 @@ class CompiledTransformer(Transformer): ...@@ -236,3 +238,200 @@ class CompiledTransformer(Transformer):
@tf_function_if_eager(experimental_compile=True) @tf_function_if_eager(experimental_compile=True)
def call(self, inputs): def call(self, inputs):
return super(CompiledTransformer, self).call(inputs) return super(CompiledTransformer, self).call(inputs)
@tf.keras.utils.register_keras_serializable(package="Text")
class TransformerDecoderLayer(tf.keras.layers.Layer):
"""Single transformer layer for decoder.
It has three sub-layers:
(1) a multi-head self-attention mechanism.
(2) an encoder-decoder attention.
(3) a positionwise fully connected feed-forward network.
Arguments:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout_rate: Dropout probability for the post-attention and output dropout.
attention_dropout_rate: Dropout probability within the attention layer.
multi_channel_cross_attention: Whether to use `MultiChannelAttention` for
cross-attention between target sequences and source sequences.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
"""
def __init__(self,
num_attention_heads,
intermediate_size,
intermediate_activation,
dropout_rate=0.0,
attention_dropout_rate=0.0,
multi_channel_cross_attention=False,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(TransformerDecoderLayer, self).__init__(**kwargs)
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.intermediate_activation = tf.keras.activations.get(
intermediate_activation)
self.dropout_rate = dropout_rate
self.attention_dropout_rate = attention_dropout_rate
self.multi_channel_cross_attention = multi_channel_cross_attention
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
if self.multi_channel_cross_attention:
self._cross_attention_cls = multi_channel_attention.MultiChannelAttention
else:
self._cross_attention_cls = attention.MultiHeadAttention
def build(self, input_shape):
target_tensor_shape = tf.TensorShape(input_shape[0])
if len(target_tensor_shape) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].")
hidden_size = target_tensor_shape[2]
if hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, self.num_attention_heads))
self.attention_head_size = int(hidden_size / self.num_attention_heads)
# Self attention.
self.self_attention = attention.CachedAttention(
num_heads=self.num_attention_heads,
key_size=self.attention_head_size,
dropout=self.attention_dropout_rate,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint,
name="self_attention")
self.self_attention_output_dense = dense_einsum.DenseEinsum(
output_shape=hidden_size,
num_summed_dimensions=2,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint,
name="self_attention_output")
self.self_attention_dropout = tf.keras.layers.Dropout(
rate=self.dropout_rate)
self.self_attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
# Encoder-decoder attention.
self.encdec_attention = self._cross_attention_cls(
num_heads=self.num_attention_heads,
key_size=self.attention_head_size,
dropout=self.attention_dropout_rate,
output_shape=hidden_size,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint,
name="attention/encdec")
self.encdec_attention_dropout = tf.keras.layers.Dropout(
rate=self.dropout_rate)
self.encdec_attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))
# Feed-forward projection.
self.intermediate_dense = dense_einsum.DenseEinsum(
output_shape=self.intermediate_size,
activation=None,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint,
name="intermediate")
self.intermediate_activation_layer = tf.keras.layers.Activation(
self.intermediate_activation)
self.output_dense = dense_einsum.DenseEinsum(
output_shape=hidden_size,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint,
name="output")
self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm", axis=-1, epsilon=1e-12)
super(TransformerDecoderLayer, self).build(input_shape)
def common_layers_with_encoder(self):
"""Gets layer objects that can make a Transformer encoder block."""
return [
self.self_attention, self.self_attention_layer_norm,
self.intermediate_dense, self.output_dense, self.output_layer_norm
]
def call(self, inputs, cache=None, decode_loop_step=None):
if self.multi_channel_cross_attention:
if len(inputs) != 5:
raise ValueError(
"TransformerDecoderLayer must have 5 inputs, when it uses "
"multi_channel_cross_attention. But it got: %d" % len(inputs))
elif len(inputs) != 4:
raise ValueError(
"TransformerDecoderLayer must have 4 inputs, but it got: %d" %
len(inputs))
input_tensor, memory, attention_mask, self_attention_mask = inputs[:4]
self_attention_inputs = [input_tensor, input_tensor]
self_attention_output, cache = self.self_attention(
self_attention_inputs,
attention_mask=self_attention_mask,
cache=cache,
decode_loop_step=decode_loop_step)
self_attention_output = self.self_attention_dropout(self_attention_output)
self_attention_output = self.self_attention_layer_norm(
input_tensor + self_attention_output)
cross_attn_inputs = [self_attention_output, memory]
if self.multi_channel_cross_attention:
# Accesses the 5-th input tensor for the doc-attention probabilities.
cross_attn_inputs.append(inputs[-1])
attention_output = self.encdec_attention(cross_attn_inputs, attention_mask)
attention_output = self.encdec_attention_dropout(attention_output)
attention_output = self.encdec_attention_layer_norm(self_attention_output +
attention_output)
intermediate_output = self.intermediate_dense(attention_output)
intermediate_output = self.intermediate_activation_layer(
intermediate_output)
layer_output = self.output_dense(intermediate_output)
layer_output = self.output_dropout(layer_output)
layer_output = self.output_layer_norm(layer_output + attention_output)
return layer_output, cache
...@@ -215,5 +215,39 @@ class TransformerLayerTest(keras_parameterized.TestCase): ...@@ -215,5 +215,39 @@ class TransformerLayerTest(keras_parameterized.TestCase):
self.assertAllEqual([1, input_length, width], output_data.shape) self.assertAllEqual([1, input_length, width], output_data.shape)
def _create_cache(batch_size, init_decode_length, num_heads, head_size):
return {
'key':
tf.zeros([batch_size, init_decode_length, num_heads, head_size],
dtype=tf.float32),
'value':
tf.zeros([batch_size, init_decode_length, num_heads, head_size],
dtype=tf.float32)
}
@keras_parameterized.run_all_keras_modes
class TransformerDecoderLayerTest(keras_parameterized.TestCase):
def test_decoder_block_with_cache(self):
num_attention_heads = 2
hidden_size = 16
decoder_block = transformer.TransformerDecoderLayer(
num_attention_heads=num_attention_heads,
intermediate_size=32,
intermediate_activation='relu',
dropout_rate=0.1,
attention_dropout_rate=0.1)
# Forward path.
dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
inputs = [dummy_tensor, dummy_tensor, dummy_mask, dummy_mask]
cache = _create_cache(2, 0, num_attention_heads,
hidden_size // num_attention_heads)
output, cache = decoder_block(inputs, cache)
self.assertEqual(output.shape, (2, 4, hidden_size))
self.assertEqual(cache['value'].shape, (2, 4, 2, 8))
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -4,6 +4,3 @@ Losses contains common loss computation used in NLP tasks. ...@@ -4,6 +4,3 @@ Losses contains common loss computation used in NLP tasks.
* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse * `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse
categorical crossentropy loss. categorical crossentropy loss.
* `weighted_sparse_categorical_crossentropy_per_example_loss` computes
per-example sparse categorical crossentropy loss.
...@@ -14,4 +14,3 @@ ...@@ -14,4 +14,3 @@
# ============================================================================== # ==============================================================================
"""Activations package definition. Subject to change.""" """Activations package definition. Subject to change."""
from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss
from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import per_example_loss as weighted_sparse_categorical_crossentropy_per_example_loss
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
"""Sparse categorical cross-entropy losses.""" """Weighted sparse categorical cross-entropy losses."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
...@@ -43,37 +43,7 @@ def _validate_rank(labels, predictions, weights): ...@@ -43,37 +43,7 @@ def _validate_rank(labels, predictions, weights):
"predictions.shape was %s.") % (labels.shape, predictions.shape)) "predictions.shape was %s.") % (labels.shape, predictions.shape))
def per_example_loss(labels, predictions, weights=None): def loss(labels, predictions, weights=None, from_logits=False):
"""Calculate a per-example sparse categorical crossentropy loss.
This loss function assumes that the predictions are post-softmax.
Args:
labels: The labels to evaluate against. Should be a set of integer indices
ranging from 0 to (vocab_size-1).
predictions: The network predictions. Should have softmax already applied.
weights: An optional weight array of the same shape as the 'labels' array.
If None, all examples will be used.
Returns:
A tensor of shape predictions.shape[:-1] containing the per-example
loss.
"""
# When using these functions with the Keras core API, we will need to squeeze
# the labels tensor - Keras adds a spurious inner dimension.
labels, predictions = _adjust_labels(labels, predictions)
_validate_rank(labels, predictions, weights)
labels_one_hot = tf.one_hot(labels, predictions.shape[-1])
labels_one_hot = tf.cast(labels_one_hot, predictions.dtype)
per_example_loss_data = -tf.reduce_sum(
predictions * labels_one_hot, axis=[-1])
if weights is not None:
weights = tf.cast(weights, per_example_loss_data.dtype)
per_example_loss_data = weights * per_example_loss_data
return per_example_loss_data
def loss(labels, predictions, weights=None):
"""Calculate a per-batch sparse categorical crossentropy loss. """Calculate a per-batch sparse categorical crossentropy loss.
This loss function assumes that the predictions are post-softmax. This loss function assumes that the predictions are post-softmax.
...@@ -83,6 +53,7 @@ def loss(labels, predictions, weights=None): ...@@ -83,6 +53,7 @@ def loss(labels, predictions, weights=None):
predictions: The network predictions. Should have softmax already applied. predictions: The network predictions. Should have softmax already applied.
weights: An optional weight array of the same shape as the 'labels' array. weights: An optional weight array of the same shape as the 'labels' array.
If None, all examples will be used. If None, all examples will be used.
from_logits: Whether the input predictions are logits.
Returns: Returns:
A loss scalar. A loss scalar.
...@@ -95,12 +66,11 @@ def loss(labels, predictions, weights=None): ...@@ -95,12 +66,11 @@ def loss(labels, predictions, weights=None):
labels, predictions = _adjust_labels(labels, predictions) labels, predictions = _adjust_labels(labels, predictions)
_validate_rank(labels, predictions, weights) _validate_rank(labels, predictions, weights)
per_example_loss_data = per_example_loss(labels, predictions, weights) example_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels, predictions, from_logits=from_logits)
if weights is None: if weights is None:
return tf.reduce_mean(per_example_loss_data) return tf.reduce_mean(example_losses)
else: weights = tf.cast(weights, predictions.dtype)
numerator = tf.reduce_sum(per_example_loss_data) return tf.math.divide_no_nan(
weights = tf.cast(weights, predictions.dtype) tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights))
denominator = tf.reduce_sum(weights) + 1e-5
return numerator / denominator
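To make the weighting concrete, here is a small hedged sketch that reproduces the rewritten `loss` by hand with arbitrary numbers: a fully masked example contributes nothing, and the denominator is the exact weight sum (`tf.math.divide_no_nan` handles the all-zero case).

```python
import numpy as np
import tensorflow as tf

# Post-softmax predictions for a [batch=2, num_classes=3] problem.
predictions = tf.constant([[0.7, 0.2, 0.1],
                           [0.1, 0.1, 0.8]])
labels = tf.constant([0, 2])
weights = tf.constant([1.0, 0.0])  # Second example is fully masked out.

example_losses = tf.keras.losses.sparse_categorical_crossentropy(
    labels, predictions, from_logits=False)
weighted = tf.math.divide_no_nan(
    tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights))

# Only the first example contributes: -log(0.7) ~= 0.357.
print(float(weighted))
np.testing.assert_allclose(float(weighted), -np.log(0.7), rtol=1e-5)
```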
...@@ -23,6 +23,7 @@ import numpy as np ...@@ -23,6 +23,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling import layers
from official.nlp.modeling import networks from official.nlp.modeling import networks
from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy
...@@ -48,139 +49,19 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -48,139 +49,19 @@ class ClassificationLossTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_outputs, _ = xformer_stack([word_ids, mask, type_ids]) _ = xformer_stack([word_ids, mask, type_ids])
# Create a maskedLM from the transformer stack. # Create a maskedLM from the transformer stack.
test_network = networks.MaskedLM( test_layer = layers.MaskedLM(
num_predictions=num_predictions, embedding_table=xformer_stack.get_embedding_table(), output=output)
input_width=lm_outputs.shape[-1],
source_network=xformer_stack,
output=output)
# Create a model from the masked LM layer. # Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
masked_lm_positions = tf.keras.Input( masked_lm_positions = tf.keras.Input(
shape=(num_predictions,), dtype=tf.int32) shape=(num_predictions,), dtype=tf.int32)
output = test_network([lm_input_tensor, masked_lm_positions]) output = test_layer(lm_input_tensor, masked_positions=masked_lm_positions)
return tf.keras.Model([lm_input_tensor, masked_lm_positions], output) return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
def create_classification_model(self, input_width, num_classes):
test_object = networks.Classification(
input_width=input_width, num_classes=num_classes)
# Create a 2-dimensional input (the first dimension is implicit).
pooled_data = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
output = test_object(pooled_data)
return tf.keras.Model(pooled_data, output)
def test_per_example_loss_3d_input(self):
"""Test per-example loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
model = self.create_lm_model(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Get the output of the masked LM.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
output_data = model.predict([lm_input_data, masked_position_data])
# Calculate per-example loss.
labels = np.random.randint(vocab_size, size=(batch_size, num_predictions))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
# Per-example loss data should have one value per prediction, and those
# values shouldn't be zero in this case (as we're using random data).
expected_shape = [batch_size, num_predictions]
self.assertEqual(expected_shape, per_example_loss_data.shape.as_list())
self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_per_example_loss_2d_input(self):
"""Test per-example loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per example loss.
labels = np.random.randint(num_classes, size=(batch_size))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
# Per-example loss data should have one value per batch item, and those
# values shouldn't be zero in this case (as we're using random data).
self.assertEqual([batch_size], per_example_loss_data.shape.as_list())
self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_per_example_loss_weights_3d_input(self):
"""Test weighted per-example loss with a 3-d input, from a masked LM."""
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
model = self.create_lm_model(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Get the output of the masked LM.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
output_data = model.predict([lm_input_data, masked_position_data])
# Calculate per-example loss with weights.
labels = np.random.randint(vocab_size, size=(batch_size, num_predictions))
weights = np.random.randint(2, size=(batch_size, num_predictions))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss = per_example_loss_data * weights
self.assertAllClose(expected_weighted_loss, per_example_loss_data)
def test_per_example_loss_weights_2d_input(self):
"""Test weighted per-example loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per-example loss with weights.
labels = np.random.randint(num_classes, size=(batch_size))
weights = np.random.randint(2, size=(batch_size))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss = per_example_loss_data * weights
self.assertAllClose(expected_weighted_loss, per_example_loss_data)
def test_loss_3d_input(self): def test_loss_3d_input(self):
"""Test overall loss with a 3-dimensional input, from a masked LM.""" """Test overall loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100 vocab_size = 100
...@@ -214,26 +95,6 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -214,26 +95,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
self.assertNotAllClose( self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data) tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_loss_2d_input(self):
"""Test overall loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per example loss.
labels = np.random.randint(num_classes, size=(batch_size))
loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels)
# Loss data should have one value only, and that value shouldn't be zero in
# this case (as we're using random data).
self.assertNotAllClose(0, loss_data)
def test_loss_weights_3d_input(self): def test_loss_weights_3d_input(self):
"""Test masked loss with a 3-dimensional input, from a masked LM.""" """Test masked loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100 vocab_size = 100
...@@ -263,26 +124,6 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -263,26 +124,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# Because the tensor is fully masked, the loss should be 0. # Because the tensor is fully masked, the loss should be 0.
self.assertAllClose(0, weighted_loss_data) self.assertAllClose(0, weighted_loss_data)
def test_loss_weights_2d_input(self):
"""Test masked loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate a fully masked weight tensor. This should give a loss of zero.
labels = np.random.randint(num_classes, size=(batch_size))
null_weights = np.zeros((batch_size))
weighted_loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=null_weights)
# Because the tensor is fully masked, the loss should be 0.
self.assertAllClose(0, weighted_loss_data)
def test_mismatched_predictions_and_labels_ranks_squeezes(self): def test_mismatched_predictions_and_labels_ranks_squeezes(self):
"""Test that the loss asserts when rank(predictions)-1 != rank(labels).""" """Test that the loss asserts when rank(predictions)-1 != rank(labels)."""
batch_size = 3 batch_size = 3
...@@ -290,7 +131,7 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -290,7 +131,7 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels = np.random.randint(10, size=(batch_size, 1)) labels = np.random.randint(10, size=(batch_size, 1))
# All that this test tests is that the squeeze is successful. # All that this test tests is that the squeeze is successful.
_ = weighted_sparse_categorical_crossentropy.per_example_loss( _ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels) predictions=output_data, labels=labels)
def test_mismatched_weights_and_labels_ranks_fail(self): def test_mismatched_weights_and_labels_ranks_fail(self):
...@@ -300,9 +141,6 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -300,9 +141,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels = np.random.randint(10, size=(batch_size, 10)) labels = np.random.randint(10, size=(batch_size, 10))
weights = np.random.randint(2, size=(batch_size)) weights = np.random.randint(2, size=(batch_size))
with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"):
_ = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"): with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"):
_ = weighted_sparse_categorical_crossentropy.loss( _ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights) predictions=output_data, labels=labels, weights=weights)
...@@ -318,8 +156,6 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -318,8 +156,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# We're not trying to validate numerical correctness, just ensure that # We're not trying to validate numerical correctness, just ensure that
# we can in fact pass tensors to these functions without causing runtime # we can in fact pass tensors to these functions without causing runtime
# errors from the shape checking code. # errors from the shape checking code.
_ = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
_ = weighted_sparse_categorical_crossentropy.loss( _ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights) predictions=output_data, labels=labels, weights=weights)
...@@ -339,20 +175,15 @@ class ClassificationLossTest(keras_parameterized.TestCase): ...@@ -339,20 +175,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
[-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509]]]) [-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509]]])
labels = np.array([[4, 0], [2, 2], [2, 1]]) labels = np.array([[4, 0], [2, 2], [2, 1]])
# Validate that per_example loss calculations are the same.
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
expected_per_example_loss_data = [[1.2923571, 2.7117882],
[2.287932, 2.287932],
[3.0924666, 1.8219438]]
self.assertAllClose(expected_per_example_loss_data, per_example_loss_data)
# Validate that overall loss calculations are the same. # Validate that overall loss calculations are the same.
weights = np.array([[1, 0], [0, 0], [0, 0]]) weights = np.array([[1, 0], [0, 0], [0, 0]])
loss_data = weighted_sparse_categorical_crossentropy.loss( loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights) predictions=output_data,
labels=labels,
weights=weights,
from_logits=True)
expected_loss_data = 1.2923441 expected_loss_data = 1.2923441
self.assertAllClose(expected_loss_data, loss_data) self.assertAllClose(expected_loss_data, loss_data, rtol=1e-3)
def test_legacy_classification_loss_compatibility(self): def test_legacy_classification_loss_compatibility(self):
"""Test to validate computational correctness during refactors.""" """Test to validate computational correctness during refactors."""
@@ -363,19 +194,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
         [-1.6975292e-03, -6.4009643e+00, -1.0226612e+01]])
     labels = np.array([2, 1])

-    # Validate that per_example loss calculations are the same.
-    per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
-        predictions=output_data, labels=labels)
-    expected_per_example_loss_data = [6.4434357, 6.4009643]
-    self.assertAllClose(expected_per_example_loss_data, per_example_loss_data)
-
     # Validate that overall loss calculations are the same.
     weights = None
     loss_data = weighted_sparse_categorical_crossentropy.loss(
-        predictions=output_data, labels=labels, weights=weights)
+        predictions=output_data,
+        labels=labels,
+        weights=weights,
+        from_logits=True)
     expected_loss_data = 6.4222
-    self.assertAllClose(expected_loss_data, loss_data)
+    self.assertAllClose(expected_loss_data, loss_data, rtol=1e-3)


 if __name__ == "__main__":
   tf.test.main()
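For reference, here is a minimal sketch (not part of this commit) of how the updated loss signature is exercised. The import path, array shapes, and variable names below are assumptions based on the tests above.

# Hedged sketch: assumes the loss lives in official.nlp.modeling.losses and
# takes the keyword arguments shown in the updated tests above.
import numpy as np
from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy

batch_size, seq_len, vocab_size = 3, 2, 5
logits = np.random.randn(batch_size, seq_len, vocab_size).astype(np.float32)
labels = np.random.randint(vocab_size, size=(batch_size, seq_len))
weights = np.ones((batch_size, seq_len), dtype=np.float32)  # same rank as labels

# With from_logits=True the function normalizes the raw scores itself instead
# of expecting pre-computed log-probabilities.
loss_value = weighted_sparse_categorical_crossentropy.loss(
    predictions=logits, labels=labels, weights=weights, from_logits=True)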
@@ -25,6 +25,7 @@ from typing import List, Optional
 import gin
 import tensorflow as tf

+from official.nlp.modeling import layers
 from official.nlp.modeling import networks
@@ -47,8 +48,8 @@ class BertPretrainer(tf.keras.Model):
     num_token_predictions: Number of tokens to predict from the masked LM.
     embedding_table: Embedding table of a network. If None, the
       "network.get_embedding_table()" is used.
-    activation: The activation (if any) to use in the masked LM network.
-      If None, no activation will be used.
+    activation: The activation (if any) to use in the masked LM network. If
+      None, no activation will be used.
     initializer: The initializer (if any) to use in the masked LM and
       classification networks. Defaults to a Glorot uniform initializer.
     output: The output style for this network. Can be either 'logits' or
@@ -106,16 +107,16 @@ class BertPretrainer(tf.keras.Model):
         dtype=tf.int32)
     inputs.append(masked_lm_positions)

-    self.masked_lm = networks.MaskedLM(
-        num_predictions=num_token_predictions,
-        input_width=sequence_output.shape[-1],
-        source_network=network,
+    if embedding_table is None:
+      embedding_table = self.encoder.get_embedding_table()
+    self.masked_lm = layers.MaskedLM(
         embedding_table=embedding_table,
         activation=activation,
         initializer=initializer,
         output=output,
-        name='masked_lm')
-    lm_outputs = self.masked_lm([sequence_output, masked_lm_positions])
+        name='cls/predictions')
+    lm_outputs = self.masked_lm(
+        sequence_output, masked_positions=masked_lm_positions)

     self.classification = networks.Classification(
         input_width=cls_output.shape[-1],
@@ -126,7 +127,9 @@ class BertPretrainer(tf.keras.Model):
     sentence_outputs = self.classification(cls_output)

     super(BertPretrainer, self).__init__(
-        inputs=inputs, outputs=[lm_outputs, sentence_outputs], **kwargs)
+        inputs=inputs,
+        outputs=dict(masked_lm=lm_outputs, classification=sentence_outputs),
+        **kwargs)

   def get_config(self):
     return self._config
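To illustrate the new call pattern above (a hedged sketch, not code from this commit; the standalone embedding variable and tensor shapes are assumptions), layers.MaskedLM takes the embedding table at construction time and the masked positions as a call-time keyword:

import tensorflow as tf
from official.nlp.modeling import layers

hidden_size, vocab_size = 16, 100
# Stand-in for network.get_embedding_table(): any [vocab_size, hidden_size] table.
embedding_table = tf.Variable(tf.random.normal([vocab_size, hidden_size]))

masked_lm = layers.MaskedLM(
    embedding_table=embedding_table,
    activation=None,  # per the docstring: if None, no activation is used
    initializer='glorot_uniform',
    output='logits',
    name='cls/predictions')

sequence_output = tf.random.normal([2, 8, hidden_size])  # [batch, seq, hidden]
masked_positions = tf.constant([[1, 3], [0, 2]], dtype=tf.int32)  # [batch, num_predictions]
lm_logits = masked_lm(sequence_output, masked_positions=masked_positions)
# lm_logits holds vocabulary scores for each masked position.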
@@ -151,8 +154,8 @@ class BertPretrainerV2(tf.keras.Model):
     num_masked_tokens: Number of tokens to predict from the masked LM.
     encoder_network: A transformer network. This network should output a
       sequence output and a classification output.
-    mlm_activation: The activation (if any) to use in the masked LM network.
-      If None, no activation will be used.
+    mlm_activation: The activation (if any) to use in the masked LM network. If
+      None, no activation will be used.
     mlm_initializer: The initializer (if any) to use in the masked LM. Default
       to a Glorot uniform initializer.
     classification_heads: A list of optional head layers to transform on encoder
@@ -193,17 +196,18 @@ class BertPretrainerV2(tf.keras.Model):
     outputs = dict()
     if num_masked_tokens > 0:
-      self.masked_lm = networks.MaskedLM(
-          num_predictions=num_masked_tokens,
-          input_width=sequence_output.shape[-1],
-          source_network=self.encoder_network,
+      self.masked_lm = layers.MaskedLM(
+          embedding_table=self.encoder_network.get_embedding_table(),
           activation=mlm_activation,
           initializer=mlm_initializer,
-          name='masked_lm')
-      masked_lm_positions = copy.copy(self.masked_lm.inputs[-1])
+          name='cls/predictions')
+      masked_lm_positions = tf.keras.layers.Input(
+          shape=(num_masked_tokens,),
+          name='masked_lm_positions',
+          dtype=tf.int32)
       inputs.append(masked_lm_positions)
       outputs['lm_output'] = self.masked_lm(
-          [sequence_output, masked_lm_positions])
+          sequence_output, masked_positions=masked_lm_positions)

     for cls_head in self.classification_heads:
       outputs[cls_head.name] = cls_head(sequence_output)
...
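To make the V2 wiring concrete, here is a toy, self-contained sketch (assumptions throughout: the embedding layer stands in for the encoder network, and all shapes are arbitrary) of the explicit masked_lm_positions Input and the dict-style outputs:

import tensorflow as tf
from official.nlp.modeling import layers

seq_len, num_masked_tokens, vocab_size, hidden_size = 8, 2, 50, 16

word_ids = tf.keras.layers.Input(
    shape=(seq_len,), name='input_word_ids', dtype=tf.int32)
masked_lm_positions = tf.keras.layers.Input(
    shape=(num_masked_tokens,), name='masked_lm_positions', dtype=tf.int32)

# Toy stand-in for the encoder: an embedding lookup producing a sequence output.
embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
sequence_output = embedding(word_ids)

masked_lm = layers.MaskedLM(
    embedding_table=embedding.embeddings, name='cls/predictions')
outputs = {
    'lm_output': masked_lm(
        sequence_output, masked_positions=masked_lm_positions),
}

# Outputs are keyed by name, mirroring the dict-based outputs used above.
model = tf.keras.Model(inputs=[word_ids, masked_lm_positions], outputs=outputs)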