Commit 20897493 authored by Tianqi Liu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 314451720
parent 2db2501b
...
@@ -187,6 +187,91 @@ class XnliProcessor(DataProcessor):
     return "XNLI"
+
+
+class PawsxProcessor(DataProcessor):
+  """Processor for the PAWS-X data set."""
+  supported_languages = [
+      "de", "en", "es", "fr", "ja", "ko", "zh"
+  ]
+
+  def __init__(self,
+               language="en",
+               process_text_fn=tokenization.convert_to_unicode):
+    super(PawsxProcessor, self).__init__(process_text_fn)
+    if language == "all":
+      self.languages = PawsxProcessor.supported_languages
+    elif language not in PawsxProcessor.supported_languages:
+      raise ValueError("language %s is not supported for PAWS-X task." %
+                       language)
+    else:
+      self.languages = [language]
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = []
+    for language in self.languages:
+      if language == "en":
+        train_tsv = "train.tsv"
+      else:
+        train_tsv = "translated_train.tsv"
+      # Skips the header.
+      lines.extend(
+          self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:])
+
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "train-%d" % i
+      text_a = self.process_text_fn(line[1])
+      text_b = self.process_text_fn(line[2])
+      label = self.process_text_fn(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = []
+    for language in PawsxProcessor.supported_languages:
+      # Skips the header.
+      lines.extend(
+          self._read_tsv(os.path.join(data_dir, language, "dev_2k.tsv"))[1:])
+
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "dev-%d" % i
+      text_a = self.process_text_fn(line[1])
+      text_b = self.process_text_fn(line[2])
+      label = self.process_text_fn(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    examples_by_lang = {k: [] for k in PawsxProcessor.supported_languages}
+    for language in PawsxProcessor.supported_languages:
+      lines = self._read_tsv(os.path.join(data_dir, language, "test_2k.tsv"))
+      for (i, line) in enumerate(lines):
+        if i == 0:
+          continue
+        guid = "test-%d" % i
+        text_a = self.process_text_fn(line[1])
+        text_b = self.process_text_fn(line[2])
+        label = self.process_text_fn(line[3])
+        examples_by_lang[language].append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples_by_lang
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "PAWS-X"
 class MnliProcessor(DataProcessor):
   """Processor for the MultiNLI data set (GLUE version)."""
...
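A quick usage sketch of the new processor (not part of the commit). It assumes the
PAWS-X data is laid out as <data_dir>/<language>/{train.tsv, translated_train.tsv,
dev_2k.tsv, test_2k.tsv}, which is what the reads above imply; the path below is a
placeholder.

# Hypothetical usage; "all" pools all seven supported languages for training.
processor = PawsxProcessor(language="all")
train_examples = processor.get_train_examples("/path/to/pawsx")   # pooled languages
dev_examples = processor.get_dev_examples("/path/to/pawsx")       # always all languages
test_by_language = processor.get_test_examples("/path/to/pawsx")  # dict: language -> examples
print(processor.get_labels())  # ["0", "1"] (in PAWS, 1 marks a paraphrase pair)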
...
@@ -47,14 +47,21 @@ flags.DEFINE_string(
     "for the task.")

 flags.DEFINE_enum("classification_task_name", "MNLI",
-                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI"],
+                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI",
+                   "PAWS-X"],
                   "The name of the task to train BERT classifier.")

 # XNLI task specific flag.
 flags.DEFINE_string(
     "xnli_language", "en",
-    "Language of training and evaluation data for XNLI task. If the value is "
-    "'all', the data of all languages will be used for training.")
+    "Language of training data for XNLI task. If the value is 'all', the data "
+    "of all languages will be used for training.")
+
+# PAWS-X task specific flag.
+flags.DEFINE_string(
+    "pawsx_language", "en",
+    "Language of training data for PAWS-X task. If the value is 'all', the "
+    "data of all languages will be used for training.")

 # BERT Squad task specific flags.
 flags.DEFINE_string(
...
@@ -166,6 +173,9 @@ def generate_classifier_dataset():
       "xnli":
           functools.partial(classifier_data_lib.XnliProcessor,
                             language=FLAGS.xnli_language),
+      "paws-x":
+          functools.partial(classifier_data_lib.PawsxProcessor,
+                            language=FLAGS.pawsx_language)
   }
   task_name = FLAGS.classification_task_name.lower()
   if task_name not in processors:
...
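For context (not part of the commit): generate_classifier_dataset() resolves the
lowercased --classification_task_name against this map and instantiates the bound
processor. A minimal self-contained sketch of that lookup, with a stub standing in
for classifier_data_lib.PawsxProcessor and a hard-coded value standing in for
FLAGS.pawsx_language:

import functools

class PawsxProcessor(object):  # stub for classifier_data_lib.PawsxProcessor
  def __init__(self, language="en"):
    self.language = language

processors = {
    "paws-x": functools.partial(PawsxProcessor, language="all"),
}

task_name = "PAWS-X".lower()  # mirrors FLAGS.classification_task_name.lower()
if task_name not in processors:
  raise ValueError("Task not found: %s" % task_name)
processor = processors[task_name]()
print(processor.language)  # all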
...
@@ -59,7 +59,6 @@ class MaskedSoftmax(tf.keras.layers.Layer):
       # Since we are adding it to the raw scores before the softmax, this is
       # effectively the same as removing these entirely.
       scores += adder
-
       if len(self._normalization_axes) == 1:
...
...
@@ -19,8 +19,6 @@ from __future__ import division
 # from __future__ import google_type_annotations
 from __future__ import print_function

-import math
-
 import tensorflow as tf

 from official.modeling import tf_utils
...
@@ -120,81 +118,3 @@ class PositionEmbedding(tf.keras.layers.Layer):
       position_embeddings = self._position_embeddings
     return tf.broadcast_to(position_embeddings, input_shape)
-
-
-@tf.keras.utils.register_keras_serializable(package="Text")
-class RelativePositionEmbedding(tf.keras.layers.Layer):
-  """Creates a positional embedding.
-
-  This layer calculates the position encoding as a mix of sine and cosine
-  functions with geometrically increasing wavelengths, as defined and
-  formalized in "Attention is All You Need", section 3.5
-  (https://arxiv.org/abs/1706.03762).
-
-  Arguments:
-    hidden_size: Size of the hidden layer.
-    min_timescale: Minimum scale that will be applied at each position.
-    max_timescale: Maximum scale that will be applied at each position.
-    length: Number of positions. Should be specified if `inputs` is None at
-      `call(self, inputs)`.
-  """
-
-  def __init__(self,
-               hidden_size,
-               min_timescale=1.0,
-               max_timescale=1.0e4,
-               length=None,
-               **kwargs):
-    # We need to have a default dtype of float32, since the inputs (which Keras
-    # usually uses to infer the dtype) will always be int32.
-    # We compute the positional encoding in float32 even if the model uses
-    # float16, as many of the ops used, like log and exp, are numerically
-    # unstable in float16.
-    if "dtype" not in kwargs:
-      kwargs["dtype"] = "float32"
-
-    super(RelativePositionEmbedding, self).__init__(**kwargs)
-    self._hidden_size = hidden_size
-    self._min_timescale = min_timescale
-    self._max_timescale = max_timescale
-    self._length = length
-
-  def get_config(self):
-    config = {
-        "hidden_size": self._hidden_size,
-        "min_timescale": self._min_timescale,
-        "max_timescale": self._max_timescale,
-        "length": self._length,
-    }
-    base_config = super(RelativePositionEmbedding, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def build(self, input_shape):
-    """Implements build() for the layer."""
-    super(RelativePositionEmbedding, self).build(input_shape)
-
-  def call(self, inputs):
-    """Implements call() for the layer."""
-    length = self._length
-    if inputs is None and length is None:
-      raise ValueError("If inputs is None, `length` must be set in "
-                       "RelativePositionEmbedding().")
-    if inputs is not None:
-      input_shape = tf_utils.get_shape_list(inputs)
-      if length is not None and length != input_shape[1]:
-        raise ValueError(
-            "If inputs is not None, `length` must equal input_shape[1].")
-      length = input_shape[1]
-    position = tf.cast(tf.range(length), tf.float32)
-    num_timescales = self._hidden_size // 2
-    min_timescale, max_timescale = self._min_timescale, self._max_timescale
-    log_timescale_increment = (
-        math.log(float(max_timescale) / float(min_timescale)) /
-        (tf.cast(num_timescales, tf.float32) - 1))
-    inv_timescales = min_timescale * tf.exp(
-        tf.cast(tf.range(num_timescales), tf.float32) *
-        -log_timescale_increment)
-    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
-        inv_timescales, 0)
-    position_embeddings = tf.concat(
-        [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
-    return position_embeddings
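For reference (not part of the commit), the signal the removed call() computes is
the standard sinusoidal encoding from section 3.5 of the paper. With d =
hidden_size, T_min = min_timescale, and T_max = max_timescale, position p maps to:

    \lambda_i = T_{\min} \exp\!\left(-\, i \, \frac{\ln(T_{\max}/T_{\min})}{d/2 - 1}\right),
        \qquad i = 0, \dots, d/2 - 1,

    \mathrm{PE}(p)_i = \sin(p \, \lambda_i), \qquad
    \mathrm{PE}(p)_{i + d/2} = \cos(p \, \lambda_i).

At p = 0 every sine entry is 0 and every cosine entry is 1, which is what the two
tests removed further down assert for hidden_size = 8.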
...
@@ -36,7 +36,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     sequence_length = 21
     width = 30
     input_tensor = tf.keras.Input(shape=(sequence_length, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using static positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions save batch.
...
@@ -51,7 +51,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     sequence_length = 21
     width = 30
     input_tensor = tf.keras.Input(shape=(sequence_length, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using static positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions save batch.
...
@@ -67,7 +67,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     # Create a 3-dimensional input (the first dimension is implicit).
     width = 30
     input_tensor = tf.keras.Input(shape=(None, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using dynamic positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions - but may be None if
...
@@ -82,7 +82,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     # Create a 3-dimensional input (the first dimension is implicit).
     width = 30
     input_tensor = tf.keras.Input(shape=(None, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     model = tf.keras.Model(input_tensor, output_tensor)
...
@@ -98,34 +98,6 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     self.assertAllEqual([1, input_length, width], output_data.shape)
-
-  def test_relative_tensor_input(self):
-    hidden_size = 8
-    test_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=hidden_size)
-    # Create a 3-dimensional input for test_layer to infer length as 1.
-    input_tensor = tf.constant([[[0] * hidden_size]])
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
-    # Expected output is the theoretical result of the input based on the
-    # sine-cosine relative position embedding formula.
-    expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
-    self.assertAllEqual(output_tensor, expected_output_tensor)
-
-  def test_relative_length_input(self):
-    hidden_size = 8
-    # When we do not have a tensor as input, we explicitly specify the length
-    # value when initializing test_layer.
-    test_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=hidden_size, length=1)
-    input_tensor = None
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
-    # Expected output is the theoretical result of the input based on the
-    # sine-cosine relative position embedding formula.
-    expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
-    self.assertAllEqual(output_tensor, expected_output_tensor)

 if __name__ == "__main__":
   tf.test.main()
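As a sanity check (not part of the commit), the expected tensor in the removed
tests can be re-derived with NumPy from the formula above; all names below are
local to this sketch:

import numpy as np

# Re-derive the expected embedding for hidden_size=8 at position 0.
hidden_size = 8
min_timescale, max_timescale = 1.0, 1.0e4
num_timescales = hidden_size // 2
log_increment = np.log(max_timescale / min_timescale) / (num_timescales - 1)
inv_timescales = min_timescale * np.exp(
    -np.arange(num_timescales) * log_increment)
position = np.array([0.0])  # length 1 -> the single position 0
scaled_time = position[:, None] * inv_timescales[None, :]
embedding = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
print(embedding)  # [[0. 0. 0. 0. 1. 1. 1. 1.]]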
...
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function

 import tensorflow as tf

-from official.nlp.modeling.layers import position_embedding
 from official.nlp.transformer import attention_layer
 from official.nlp.transformer import beam_search
 from official.nlp.transformer import embedding_layer
...
@@ -171,9 +170,9 @@ class Transformer(tf.keras.Model):
       attention_bias = tf.cast(attention_bias, self.params["dtype"])

       with tf.name_scope("add_pos_encoding"):
-        pos_layer = position_embedding.RelativePositionEmbedding(
-            hidden_size=self.params["hidden_size"])
-        pos_encoding = pos_layer(embedded_inputs)
+        length = tf.shape(embedded_inputs)[1]
+        pos_encoding = model_utils.get_position_encoding(
+            length, self.params["hidden_size"])
         pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
         encoder_inputs = embedded_inputs + pos_encoding
...
@@ -210,9 +209,8 @@ class Transformer(tf.keras.Model):
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
       with tf.name_scope("add_pos_encoding"):
         length = tf.shape(decoder_inputs)[1]
-        pos_layer = position_embedding.RelativePositionEmbedding(
-            hidden_size=self.params["hidden_size"])
-        pos_encoding = pos_layer(decoder_inputs)
+        pos_encoding = model_utils.get_position_encoding(
+            length, self.params["hidden_size"])
         pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
         decoder_inputs += pos_encoding

       if training:
...
@@ -235,10 +233,8 @@ class Transformer(tf.keras.Model):
   def _get_symbols_to_logits_fn(self, max_decode_length, training):
     """Returns a decoding function that calculates logits of the next tokens."""
-    pos_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=self.params["hidden_size"],
-        length=max_decode_length + 1)
-    timing_signal = pos_layer(None)
+    timing_signal = model_utils.get_position_encoding(
+        max_decode_length + 1, self.params["hidden_size"])
     timing_signal = tf.cast(timing_signal, self.params["dtype"])
     decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
         max_decode_length, dtype=self.params["dtype"])
...
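For context (not part of the commit): transformer.py already references
model_utils (see get_decoder_self_attention_bias above), and get_position_encoding
is called here with (length, hidden_size). A minimal sketch of the new
encoder-side code path with made-up shapes; that the helper reproduces the removed
layer's signal is assumed from the fact that this change substitutes one for the
other at all three call sites:

import tensorflow as tf

from official.nlp.transformer import model_utils

hidden_size = 64
embedded_inputs = tf.zeros([2, 10, hidden_size])  # (batch, length, hidden)

# Derive the sequence length from the tensor, fetch the sinusoidal signal,
# and cast it to the model dtype before adding, as the diff does.
length = tf.shape(embedded_inputs)[1]
pos_encoding = model_utils.get_position_encoding(length, hidden_size)
pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype)
encoder_inputs = embedded_inputs + pos_encoding  # shape (2, 10, 64)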