Commit 20897493 authored by Tianqi Liu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 314451720
parent 2db2501b
...
@@ -187,6 +187,91 @@ class XnliProcessor(DataProcessor):
     return "XNLI"
+
+
+class PawsxProcessor(DataProcessor):
+  """Processor for the PAWS-X data set."""
+  supported_languages = [
+      "de", "en", "es", "fr", "ja", "ko", "zh"
+  ]
+
+  def __init__(self,
+               language="en",
+               process_text_fn=tokenization.convert_to_unicode):
+    super(PawsxProcessor, self).__init__(process_text_fn)
+    if language == "all":
+      self.languages = PawsxProcessor.supported_languages
+    elif language not in PawsxProcessor.supported_languages:
+      raise ValueError("language %s is not supported for PAWS-X task." %
+                       language)
+    else:
+      self.languages = [language]
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = []
+    for language in self.languages:
+      if language == "en":
+        train_tsv = "train.tsv"
+      else:
+        train_tsv = "translated_train.tsv"
+      # Skips the header.
+      lines.extend(
+          self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:])
+
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "train-%d" % i
+      text_a = self.process_text_fn(line[1])
+      text_b = self.process_text_fn(line[2])
+      label = self.process_text_fn(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = []
+    for language in PawsxProcessor.supported_languages:
+      # Skips the header.
+      lines.extend(
+          self._read_tsv(os.path.join(data_dir, language, "dev_2k.tsv"))[1:])
+
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "dev-%d" % i
+      text_a = self.process_text_fn(line[1])
+      text_b = self.process_text_fn(line[2])
+      label = self.process_text_fn(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    examples_by_lang = {k: [] for k in PawsxProcessor.supported_languages}
+    for language in PawsxProcessor.supported_languages:
+      lines = self._read_tsv(os.path.join(data_dir, language, "test_2k.tsv"))
+      for (i, line) in enumerate(lines):
+        if i == 0:
+          continue
+        guid = "test-%d" % i
+        text_a = self.process_text_fn(line[1])
+        text_b = self.process_text_fn(line[2])
+        label = self.process_text_fn(line[3])
+        examples_by_lang[language].append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples_by_lang
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "PAWS-X"
 class MnliProcessor(DataProcessor):
   """Processor for the MultiNLI data set (GLUE version)."""
...
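A quick usage sketch of the new processor (not part of the commit). It assumes the
PAWS-X data is laid out as <data_dir>/<language>/{train.tsv, translated_train.tsv,
dev_2k.tsv, test_2k.tsv}, which is what the reads above imply; the path below is a
placeholder.

# Hypothetical usage; "all" pools all seven supported languages for training.
processor = PawsxProcessor(language="all")
train_examples = processor.get_train_examples("/path/to/pawsx")   # pooled languages
dev_examples = processor.get_dev_examples("/path/to/pawsx")       # always all languages
test_by_language = processor.get_test_examples("/path/to/pawsx")  # dict: language -> examples
print(processor.get_labels())  # ["0", "1"] (in PAWS, 1 marks a paraphrase pair)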
...
@@ -47,14 +47,21 @@ flags.DEFINE_string(
     "for the task.")

 flags.DEFINE_enum("classification_task_name", "MNLI",
-                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI"],
+                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI",
+                   "PAWS-X"],
                   "The name of the task to train BERT classifier.")

 # XNLI task specific flag.
 flags.DEFINE_string(
     "xnli_language", "en",
-    "Language of training and evaluation data for XNLI task. If the value is "
-    "'all', the data of all languages will be used for training.")
+    "Language of training data for XNLI task. If the value is 'all', the data "
+    "of all languages will be used for training.")
+
+# PAWS-X task specific flag.
+flags.DEFINE_string(
+    "pawsx_language", "en",
+    "Language of training data for PAWS-X task. If the value is 'all', the "
+    "data of all languages will be used for training.")

 # BERT Squad task specific flags.
 flags.DEFINE_string(
...
@@ -166,6 +173,9 @@ def generate_classifier_dataset():
       "xnli":
           functools.partial(classifier_data_lib.XnliProcessor,
                             language=FLAGS.xnli_language),
+      "paws-x":
+          functools.partial(classifier_data_lib.PawsxProcessor,
+                            language=FLAGS.pawsx_language)
   }
   task_name = FLAGS.classification_task_name.lower()
   if task_name not in processors:
...
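For context (not part of the commit): generate_classifier_dataset() resolves the
lowercased --classification_task_name against this map and instantiates the bound
processor. A minimal self-contained sketch of that lookup, with a stub standing in
for classifier_data_lib.PawsxProcessor and a hard-coded value standing in for
FLAGS.pawsx_language:

import functools

class PawsxProcessor(object):  # stub for classifier_data_lib.PawsxProcessor
  def __init__(self, language="en"):
    self.language = language

processors = {
    "paws-x": functools.partial(PawsxProcessor, language="all"),
}

task_name = "PAWS-X".lower()  # mirrors FLAGS.classification_task_name.lower()
if task_name not in processors:
  raise ValueError("Task not found: %s" % task_name)
processor = processors[task_name]()
print(processor.language)  # all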
...
@@ -59,7 +59,6 @@ class MaskedSoftmax(tf.keras.layers.Layer):
       # Since we are adding it to the raw scores before the softmax, this is
       # effectively the same as removing these entirely.
       scores += adder
-
       if len(self._normalization_axes) == 1:
...
...
@@ -19,8 +19,6 @@ from __future__ import division
 # from __future__ import google_type_annotations
 from __future__ import print_function

-import math
-
 import tensorflow as tf

 from official.modeling import tf_utils
...
@@ -120,81 +118,3 @@ class PositionEmbedding(tf.keras.layers.Layer):
       position_embeddings = self._position_embeddings
     return tf.broadcast_to(position_embeddings, input_shape)
-
-
-@tf.keras.utils.register_keras_serializable(package="Text")
-class RelativePositionEmbedding(tf.keras.layers.Layer):
-  """Creates a positional embedding.
-
-  This layer calculates the position encoding as a mix of sine and cosine
-  functions with geometrically increasing wavelengths, as defined and
-  formalized in "Attention is All You Need", section 3.5
-  (https://arxiv.org/abs/1706.03762).
-
-  Arguments:
-    hidden_size: Size of the hidden layer.
-    min_timescale: Minimum scale that will be applied at each position.
-    max_timescale: Maximum scale that will be applied at each position.
-    length: Number of positions. Should be specified if `inputs` is None at
-      `call(self, inputs)`.
-  """
-
-  def __init__(self,
-               hidden_size,
-               min_timescale=1.0,
-               max_timescale=1.0e4,
-               length=None,
-               **kwargs):
-    # We need to have a default dtype of float32, since the inputs (which Keras
-    # usually uses to infer the dtype) will always be int32.
-    # We compute the positional encoding in float32 even if the model uses
-    # float16, as many of the ops used, like log and exp, are numerically
-    # unstable in float16.
-    if "dtype" not in kwargs:
-      kwargs["dtype"] = "float32"
-
-    super(RelativePositionEmbedding, self).__init__(**kwargs)
-    self._hidden_size = hidden_size
-    self._min_timescale = min_timescale
-    self._max_timescale = max_timescale
-    self._length = length
-
-  def get_config(self):
-    config = {
-        "hidden_size": self._hidden_size,
-        "min_timescale": self._min_timescale,
-        "max_timescale": self._max_timescale,
-        "length": self._length,
-    }
-    base_config = super(RelativePositionEmbedding, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def build(self, input_shape):
-    """Implements build() for the layer."""
-    super(RelativePositionEmbedding, self).build(input_shape)
-
-  def call(self, inputs):
-    """Implements call() for the layer."""
-    length = self._length
-    if inputs is None and length is None:
-      raise ValueError("If inputs is None, `length` must be set in "
-                       "RelativePositionEmbedding().")
-    if inputs is not None:
-      input_shape = tf_utils.get_shape_list(inputs)
-      if length is not None and length != input_shape[1]:
-        raise ValueError(
-            "If inputs is not None, `length` must equal input_shape[1].")
-      length = input_shape[1]
-    position = tf.cast(tf.range(length), tf.float32)
-    num_timescales = self._hidden_size // 2
-    min_timescale, max_timescale = self._min_timescale, self._max_timescale
-    log_timescale_increment = (
-        math.log(float(max_timescale) / float(min_timescale)) /
-        (tf.cast(num_timescales, tf.float32) - 1))
-    inv_timescales = min_timescale * tf.exp(
-        tf.cast(tf.range(num_timescales), tf.float32) *
-        -log_timescale_increment)
-    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
-        inv_timescales, 0)
-    position_embeddings = tf.concat(
-        [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
-    return position_embeddings
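For reference (not part of the commit), the signal the removed call() computes is
the standard sinusoidal encoding from section 3.5 of the paper. With d =
hidden_size, T_min = min_timescale, and T_max = max_timescale, position p maps to:

    \lambda_i = T_{\min} \exp\!\left(-\, i \, \frac{\ln(T_{\max}/T_{\min})}{d/2 - 1}\right),
        \qquad i = 0, \dots, d/2 - 1,

    \mathrm{PE}(p)_i = \sin(p \, \lambda_i), \qquad
    \mathrm{PE}(p)_{i + d/2} = \cos(p \, \lambda_i).

At p = 0 every sine entry is 0 and every cosine entry is 1, which is what the two
tests removed further down assert for hidden_size = 8.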
...
@@ -36,7 +36,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     sequence_length = 21
     width = 30
     input_tensor = tf.keras.Input(shape=(sequence_length, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using static positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions save batch.
...
@@ -51,7 +51,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     sequence_length = 21
     width = 30
     input_tensor = tf.keras.Input(shape=(sequence_length, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using static positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions save batch.
...
@@ -67,7 +67,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     # Create a 3-dimensional input (the first dimension is implicit).
     width = 30
     input_tensor = tf.keras.Input(shape=(None, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     # When using dynamic positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions - but may be None if
...
@@ -82,7 +82,7 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     # Create a 3-dimensional input (the first dimension is implicit).
     width = 30
     input_tensor = tf.keras.Input(shape=(None, width))
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
+    output_tensor = test_layer(input_tensor)

     model = tf.keras.Model(input_tensor, output_tensor)
...
@@ -98,34 +98,6 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     self.assertAllEqual([1, input_length, width], output_data.shape)
-
-  def test_relative_tensor_input(self):
-    hidden_size = 8
-    test_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=hidden_size)
-    # Create a 3-dimensional input for test_layer to infer length as 1.
-    input_tensor = tf.constant([[[0] * hidden_size]])
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
-    # Expected output is the theoretical result of the input based on the
-    # sine-cosine relative position embedding formula.
-    expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
-    self.assertAllEqual(output_tensor, expected_output_tensor)
-
-  def test_relative_length_input(self):
-    hidden_size = 8
-    # When we do not have a tensor as input, we explicitly specify the length
-    # value when initializing test_layer.
-    test_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=hidden_size, length=1)
-    input_tensor = None
-    output_tensor = test_layer(input_tensor)  # pylint: disable=not-callable
-    # Expected output is the theoretical result of the input based on the
-    # sine-cosine relative position embedding formula.
-    expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
-    self.assertAllEqual(output_tensor, expected_output_tensor)

 if __name__ == "__main__":
   tf.test.main()
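As a sanity check (not part of the commit), the expected tensor in the removed
tests can be re-derived with NumPy from the formula above; all names below are
local to this sketch:

import numpy as np

# Re-derive the expected embedding for hidden_size=8 at position 0.
hidden_size = 8
min_timescale, max_timescale = 1.0, 1.0e4
num_timescales = hidden_size // 2
log_increment = np.log(max_timescale / min_timescale) / (num_timescales - 1)
inv_timescales = min_timescale * np.exp(
    -np.arange(num_timescales) * log_increment)
position = np.array([0.0])  # length 1 -> the single position 0
scaled_time = position[:, None] * inv_timescales[None, :]
embedding = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
print(embedding)  # [[0. 0. 0. 0. 1. 1. 1. 1.]]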
...
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function

 import tensorflow as tf

-from official.nlp.modeling.layers import position_embedding
 from official.nlp.transformer import attention_layer
 from official.nlp.transformer import beam_search
 from official.nlp.transformer import embedding_layer
...
@@ -171,9 +170,9 @@ class Transformer(tf.keras.Model):
       attention_bias = tf.cast(attention_bias, self.params["dtype"])

       with tf.name_scope("add_pos_encoding"):
-        pos_layer = position_embedding.RelativePositionEmbedding(
-            hidden_size=self.params["hidden_size"])
-        pos_encoding = pos_layer(embedded_inputs)
+        length = tf.shape(embedded_inputs)[1]
+        pos_encoding = model_utils.get_position_encoding(
+            length, self.params["hidden_size"])
         pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
         encoder_inputs = embedded_inputs + pos_encoding
...
@@ -210,9 +209,8 @@ class Transformer(tf.keras.Model):
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
       with tf.name_scope("add_pos_encoding"):
         length = tf.shape(decoder_inputs)[1]
-        pos_layer = position_embedding.RelativePositionEmbedding(
-            hidden_size=self.params["hidden_size"])
-        pos_encoding = pos_layer(decoder_inputs)
+        pos_encoding = model_utils.get_position_encoding(
+            length, self.params["hidden_size"])
         pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
         decoder_inputs += pos_encoding

       if training:
...
@@ -235,10 +233,8 @@ class Transformer(tf.keras.Model):
   def _get_symbols_to_logits_fn(self, max_decode_length, training):
     """Returns a decoding function that calculates logits of the next tokens."""
-    pos_layer = position_embedding.RelativePositionEmbedding(
-        hidden_size=self.params["hidden_size"],
-        length=max_decode_length + 1)
-    timing_signal = pos_layer(None)
+    timing_signal = model_utils.get_position_encoding(
+        max_decode_length + 1, self.params["hidden_size"])
     timing_signal = tf.cast(timing_signal, self.params["dtype"])
     decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
         max_decode_length, dtype=self.params["dtype"])
...
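For context (not part of the commit): transformer.py already references
model_utils (see get_decoder_self_attention_bias above), and get_position_encoding
is called here with (length, hidden_size). A minimal sketch of the new
encoder-side code path with made-up shapes; that the helper reproduces the removed
layer's signal is assumed from the fact that this change substitutes one for the
other at all three call sites:

import tensorflow as tf

from official.nlp.transformer import model_utils

hidden_size = 64
embedded_inputs = tf.zeros([2, 10, hidden_size])  # (batch, length, hidden)

# Derive the sequence length from the tensor, fetch the sinusoidal signal,
# and cast it to the model dtype before adding, as the diff does.
length = tf.shape(embedded_inputs)[1]
pos_encoding = model_utils.get_position_encoding(length, hidden_size)
pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype)
encoder_inputs = embedded_inputs + pos_encoding  # shape (2, 10, 64)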