"vscode:/vscode.git/clone" did not exist on "08395ba617bf085a1de7a20ed49d6c510c4032c3"
Commit 88253ce5 authored by Hongkun Yu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 326286926
parent 52371ffe
@@ -25,10 +25,7 @@ _VOCAB_WORDS = ["vocab_1", "vocab_2"]
 
 class CreatePretrainingDataTest(tf.test.TestCase):
 
-  def assertTokens(self,
-                   input_tokens,
-                   output_tokens,
-                   masked_positions,
-                   masked_labels):
+  def assertTokens(self, input_tokens, output_tokens, masked_positions,
+                   masked_labels):
     # Ensure the masked positions are unique.
     self.assertCountEqual(masked_positions, set(masked_positions))
@@ -42,24 +39,18 @@ class CreatePretrainingDataTest(tf.test.TestCase):
     # Ensure each label is valid.
     for pos, label in zip(masked_positions, masked_labels):
       output_token = output_tokens[pos]
-      if (output_token == "[MASK]" or
-          output_token in _VOCAB_WORDS or
+      if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
           output_token == input_tokens[pos]):
         continue
       self.fail("invalid mask value: {}".format(output_token))
 
   def test_wordpieces_to_grams(self):
     tests = [
-        (["That", "cone"],
-         [(0, 1), (1, 2)]),
-        (["That", "cone", "##s"],
-         [(0, 1), (1, 3)]),
-        (["Swit", "##zer", "##land"],
-         [(0, 3)]),
-        (["[CLS]", "Up", "##dog"],
-         [(1, 3)]),
-        (["[CLS]", "Up", "##dog", "[SEP]", "Down"],
-         [(1, 3), (4, 5)]),
+        (["That", "cone"], [(0, 1), (1, 2)]),
+        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
+        (["Swit", "##zer", "##land"], [(0, 3)]),
+        (["[CLS]", "Up", "##dog"], [(1, 3)]),
+        (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
     ]
     for inp, expected in tests:
      output = cpd._wordpieces_to_grams(inp)
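The expected spans in the test cases above group wordpieces into whole-word (start, end) ranges: "##"-prefixed continuation pieces attach to the preceding piece, and special tokens are skipped. A minimal illustrative sketch of that grouping, consistent with these test cases but not the actual cpd._wordpieces_to_grams implementation (the special-token set is an assumption):

_SPECIAL_TOKENS = {"[CLS]", "[SEP]"}  # assumed set of skipped special tokens


def wordpieces_to_grams(tokens):
  """Groups wordpieces into whole-word (start, end) spans (illustrative only)."""
  grams = []
  start = None
  for i, token in enumerate(tokens):
    if token in _SPECIAL_TOKENS:
      if start is not None:
        grams.append((start, i))
      start = None
    elif token.startswith("##"):
      continue  # a continuation piece extends the current gram
    else:
      if start is not None:
        grams.append((start, i))
      start = i
  if start is not None:
    grams.append((start, len(tokens)))
  return grams

# e.g. wordpieces_to_grams(["[CLS]", "Up", "##dog", "[SEP]", "Down"]) == [(1, 3), (4, 5)]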
@@ -93,8 +84,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             max_ngram_size=None))
     self.assertEqual(len(masked_positions), 3)
     self.assertEqual(len(masked_labels), 3)
-    self.assertTokens(tokens, output_tokens,
-                      masked_positions, masked_labels)
+    self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
 
   def test_create_masked_lm_predictions_whole_word(self):
     tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
@@ -113,8 +103,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
     # only take two.
     self.assertEqual(len(masked_positions), 2)
     self.assertEqual(len(masked_labels), 2)
-    self.assertTokens(tokens, output_tokens,
-                      masked_positions, masked_labels)
+    self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
     # ensure that we took an entire word.
     self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
@@ -133,8 +122,7 @@ class CreatePretrainingDataTest(tf.test.TestCase):
             max_ngram_size=3))
     self.assertEqual(len(masked_positions), 76)
     self.assertEqual(len(masked_labels), 76)
-    self.assertTokens(tokens, output_tokens,
-                      masked_positions, masked_labels)
+    self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
 
 if __name__ == "__main__":
...
@@ -37,8 +37,8 @@ class DataLoader(metaclass=abc.ABCMeta):
     Args:
       input_context: This is a context class that is passed to the user's input
         function and contains information about the compute replicas and input
-        pipelines. This object is used for multi-host inputs and passed by
-        the distribution strategy.
+        pipelines. This object is used for multi-host inputs and passed by the
+        distribution strategy.
 
     Returns:
       A per-host tf.data dataset. Note that, we usually create the distributed
...
@@ -14,6 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for official.nlp.data.data_loader_factory."""
 import dataclasses
 import tensorflow as tf
...
@@ -15,6 +15,7 @@
 # ==============================================================================
 """Loads dataset for the question answering (e.g, SQuAD) task."""
 from typing import Mapping, Optional
 import dataclasses
 import tensorflow as tf
...
@@ -15,6 +15,7 @@
 # ==============================================================================
 """Tests for official.nlp.data.question_answering_dataloader."""
 import os
 import numpy as np
 import tensorflow as tf
...
@@ -15,6 +15,7 @@
 # ==============================================================================
 """Loads dataset for the sentence prediction (classification) task."""
 from typing import Mapping, Optional
 import dataclasses
 import tensorflow as tf
@@ -23,7 +24,6 @@ from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.data import data_loader
 from official.nlp.data import data_loader_factory
 LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}
...
@@ -25,8 +25,7 @@ class BuccProcessor(classifier_data_lib.DataProcessor):
   """Procssor for Xtreme BUCC data set."""
 
   supported_languages = ["de", "fr", "ru", "zh"]
 
-  def __init__(self,
-               process_text_fn=tokenization.convert_to_unicode):
+  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
     super(BuccProcessor, self).__init__(process_text_fn)
     self.languages = BuccProcessor.supported_languages
@@ -66,8 +65,7 @@ class TatoebaProcessor(classifier_data_lib.DataProcessor):
       "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
   ]
 
-  def __init__(self,
-               process_text_fn=tokenization.convert_to_unicode):
+  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
     super(TatoebaProcessor, self).__init__(process_text_fn)
     self.languages = TatoebaProcessor.supported_languages
...
@@ -24,6 +24,7 @@ import copy
 import json
 import math
 import os
 import six
 from absl import logging
@@ -40,8 +41,8 @@ class SquadExample(object):
   Attributes:
     qas_id: ID of the question-answer pair.
     question_text: Original text for the question.
-    doc_tokens: The list of tokens in the context obtained by splitting
-      on whitespace only.
+    doc_tokens: The list of tokens in the context obtained by splitting on
+      whitespace only.
     orig_answer_text: Original text for the answer.
     start_position: Starting index of the answer in `doc_tokens`.
     end_position: Ending index of the answer in `doc_tokens`.
@@ -209,8 +210,8 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
           #
           # Note that this means for training mode, every example is NOT
           # guaranteed to be preserved.
-          actual_text = " ".join(
-              doc_tokens[start_position:(end_position + 1)])
+          actual_text = " ".join(doc_tokens[start_position:(end_position +
+                                                            1)])
           cleaned_answer_text = " ".join(
               tokenization.whitespace_tokenize(orig_answer_text))
           if actual_text.find(cleaned_answer_text) == -1:
@@ -520,15 +521,16 @@ def write_predictions(all_examples,
   logging.info("Writing nbest to: %s", (output_nbest_file))
 
   all_predictions, all_nbest_json, scores_diff_json = (
-      postprocess_output(all_examples=all_examples,
-                         all_features=all_features,
-                         all_results=all_results,
-                         n_best_size=n_best_size,
-                         max_answer_length=max_answer_length,
-                         do_lower_case=do_lower_case,
-                         version_2_with_negative=version_2_with_negative,
-                         null_score_diff_threshold=null_score_diff_threshold,
-                         verbose=verbose))
+      postprocess_output(
+          all_examples=all_examples,
+          all_features=all_features,
+          all_results=all_results,
+          n_best_size=n_best_size,
+          max_answer_length=max_answer_length,
+          do_lower_case=do_lower_case,
+          version_2_with_negative=version_2_with_negative,
+          null_score_diff_threshold=null_score_diff_threshold,
+          verbose=verbose))
 
   write_to_json_files(all_predictions, output_prediction_file)
   write_to_json_files(all_nbest_json, output_nbest_file)
...
@@ -27,6 +27,7 @@ import copy
 import json
 import math
 import os
 from absl import logging
 import numpy as np
 import tensorflow as tf
@@ -246,6 +247,7 @@ def convert_examples_to_features(examples,
     f = np.zeros((max_n, max_m), dtype=np.float32)
     g = {}
     # pylint: disable=cell-var-from-loop
     def _lcs_match(max_dist, n=n, m=m):
       """Longest-common-substring algorithm."""
@@ -277,6 +279,7 @@ def convert_examples_to_features(examples,
               remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
             g[(i, j)] = 2
             f[i, j] = f_prev + 1
     # pylint: enable=cell-var-from-loop
     max_dist = abs(n - m) + 5
@@ -580,15 +583,16 @@ def write_predictions(all_examples,
   logging.info("Writing nbest to: %s", (output_nbest_file))
 
   all_predictions, all_nbest_json, scores_diff_json = (
-      postprocess_output(all_examples=all_examples,
-                         all_features=all_features,
-                         all_results=all_results,
-                         n_best_size=n_best_size,
-                         max_answer_length=max_answer_length,
-                         do_lower_case=do_lower_case,
-                         version_2_with_negative=version_2_with_negative,
-                         null_score_diff_threshold=null_score_diff_threshold,
-                         verbose=verbose))
+      postprocess_output(
+          all_examples=all_examples,
+          all_features=all_features,
+          all_results=all_results,
+          n_best_size=n_best_size,
+          max_answer_length=max_answer_length,
+          do_lower_case=do_lower_case,
+          version_2_with_negative=version_2_with_negative,
+          null_score_diff_threshold=null_score_diff_threshold,
+          verbose=verbose))
 
   write_to_json_files(all_predictions, output_prediction_file)
   write_to_json_files(all_nbest_json, output_nbest_file)
...
@@ -267,12 +267,12 @@ def write_example_to_file(examples,
       logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                    output_file)
 
-    tokenized_examples = _tokenize_example(example, max_seq_length,
-                                           tokenizer, text_preprocessing)
+    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
+                                           text_preprocessing)
     num_tokenized_examples += len(tokenized_examples)
     for per_tokenized_example in tokenized_examples:
-      tf_example = _convert_single_example(
-          per_tokenized_example, max_seq_length, tokenizer)
+      tf_example = _convert_single_example(per_tokenized_example,
+                                           max_seq_length, tokenizer)
       writer.write(tf_example.SerializeToString())
 
   writer.close()
@@ -307,17 +307,16 @@ def token_classification_meta_data(train_data_size,
   return meta_data
 
 
-def generate_tf_record_from_data_file(processor,
-                                      data_dir,
-                                      tokenizer,
-                                      max_seq_length,
-                                      train_data_output_path,
+def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
+                                      max_seq_length, train_data_output_path,
                                       eval_data_output_path,
                                       test_data_output_path,
                                       text_preprocessing):
   """Generates tfrecord files from the raw data."""
-  common_kwargs = dict(tokenizer=tokenizer, max_seq_length=max_seq_length,
-                       text_preprocessing=text_preprocessing)
+  common_kwargs = dict(
+      tokenizer=tokenizer,
+      max_seq_length=max_seq_length,
+      text_preprocessing=text_preprocessing)
   train_examples = processor.get_train_examples(data_dir)
   train_data_size = write_example_to_file(
       train_examples, output_file=train_data_output_path, **common_kwargs)
...
@@ -15,6 +15,7 @@
 # ==============================================================================
 """Loads dataset for the tagging (e.g., NER/POS) task."""
 from typing import Mapping, Optional
 import dataclasses
 import tensorflow as tf
...
@@ -59,9 +59,8 @@ class DenseEinsum(tf.keras.layers.Layer):
     `(batch_size, units)`.
   """
 
-  @deprecation.deprecated(
-      None, "DenseEinsum is deprecated. Please use "
-      "tf.keras.experimental.EinsumDense layer instead.")
+  @deprecation.deprecated(None, "DenseEinsum is deprecated. Please use "
+                          "tf.keras.experimental.EinsumDense layer instead.")
   def __init__(self,
                output_shape,
                num_summed_dimensions=1,
...
@@ -36,19 +36,19 @@ class GatedFeedforward(tf.keras.layers.Layer):
     intermediate_size: Size of the intermediate layer.
     intermediate_activation: Activation for the intermediate layer.
     dropout: Dropout probability for the output dropout.
-    use_gate: Whether to use gated linear units. If True, assuming `GELU` as
-      the activation and omitting bias, will apply
-      `GEGLU(x, W, V, W_2) = (GEGLU(xW) * xV)W2`; if False, will follow
-      "Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper
-      and apply `FFN(x, W, W_2) = GELU(xW_1)W_2.`
-    num_blocks: The number of feedforward blocks to stack. Each block contains
-      a (gated) linear layer and a fully connected layer followed by dropout,
+    use_gate: Whether to use gated linear units. If True, assuming `GELU` as the
+      activation and omitting bias, will apply `GEGLU(x, W, V, W_2) = (GEGLU(xW)
+      * xV)W2`; if False, will follow
+      "Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper and
+      apply `FFN(x, W, W_2) = GELU(xW_1)W_2.`
+    num_blocks: The number of feedforward blocks to stack. Each block contains a
+      (gated) linear layer and a fully connected layer followed by dropout,
       layer norm and residual.
     dropout_position: Where to apply the dropout, the value can be either
       `before_residual` or `after_residual`. If `before_residual`, will apply
-      `layer_output = layer_norm(dropout(layer_output) + layer_input)`;
-      if `after residual`, will apply
-      `layer_output = dropout(layer_norm(layer_output + layer_input))`.
+      `layer_output = layer_norm(dropout(layer_output) + layer_input)`; if
+      `after residual`, will apply `layer_output =
+      dropout(layer_norm(layer_output + layer_input))`.
     kernel_initializer: Initializer for dense layer kernels.
     bias_initializer: Initializer for dense layer biases.
     kernel_regularizer: Regularizer for dense layer kernels.
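The use_gate entry above describes two feedforward variants. A minimal sketch of those formulas in the docstring's own notation, reading the inner GEGLU(xW) as GELU(xW) as in the GLU-variants formulation; this is not the layer's actual implementation, bias terms are omitted, and tf.nn.gelu is assumed to be available:

import tensorflow as tf


def gated_ffn(x, w, v, w2):
  # use_gate=True: GEGLU(x, W, V, W_2) = (GELU(xW) * xV) W_2
  return tf.matmul(tf.nn.gelu(tf.matmul(x, w)) * tf.matmul(x, v), w2)


def plain_ffn(x, w1, w2):
  # use_gate=False: FFN(x, W_1, W_2) = GELU(xW_1) W_2
  return tf.matmul(tf.nn.gelu(tf.matmul(x, w1)), w2)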
@@ -124,8 +124,9 @@ class GatedFeedforward(tf.keras.layers.Layer):
               bias_axes="d",
               name="intermediate_%d" % i,
               **common_kwargs))
-      self._intermediate_activation_layers.append(tf.keras.layers.Activation(
-          self._intermediate_activation, dtype=activation_policy))
+      self._intermediate_activation_layers.append(
+          tf.keras.layers.Activation(
+              self._intermediate_activation, dtype=activation_policy))
       if self._use_gate:
         self._gate_dense.append(
             tf.keras.layers.experimental.EinsumDense(
@@ -141,8 +142,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
               bias_axes="d",
               name="output_%d" % i,
               **common_kwargs))
-      self._output_dropout.append(
-          tf.keras.layers.Dropout(rate=self._dropout))
+      self._output_dropout.append(tf.keras.layers.Dropout(rate=self._dropout))
       # Use float32 in layernorm for numeric stability.
       self._output_layer_norm.append(
           tf.keras.layers.LayerNormalization(
...
@@ -123,5 +123,6 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
     # If the serialization was successful, the new config should match the old.
     self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
 
 if __name__ == "__main__":
   tf.test.main()
@@ -49,8 +49,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
     # Create a maskedLM from the transformer stack.
     test_layer = masked_lm.MaskedLM(
-        embedding_table=xformer_stack.get_embedding_table(),
-        output=output)
+        embedding_table=xformer_stack.get_embedding_table(), output=output)
     return test_layer
 
   def test_layer_creation(self):
@@ -59,8 +58,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
     hidden_size = 64
     num_predictions = 21
     test_layer = self.create_layer(
-        vocab_size=vocab_size,
-        hidden_size=hidden_size)
+        vocab_size=vocab_size, hidden_size=hidden_size)
 
     # Make sure that the output tensor of the masked LM is the right shape.
     lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
@@ -127,8 +125,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
     hidden_size = 64
     num_predictions = 21
     test_layer = self.create_layer(
-        vocab_size=vocab_size,
-        hidden_size=hidden_size)
+        vocab_size=vocab_size, hidden_size=hidden_size)
 
     # Create a model from the masked LM layer.
     lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
@@ -147,8 +144,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
   def test_unknown_output_type_fails(self):
     with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
-      _ = self.create_layer(
-          vocab_size=8, hidden_size=8, output='bad')
+      _ = self.create_layer(vocab_size=8, hidden_size=8, output='bad')
 
 if __name__ == '__main__':
...
@@ -92,5 +92,5 @@ class OnDeviceEmbedding(tf.keras.layers.Layer):
         tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
     embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
     if self._use_scale:
-      embeddings *= self._embedding_width ** 0.5
+      embeddings *= self._embedding_width**0.5
     return embeddings
@@ -89,8 +89,7 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
     embedding_width = 27
     policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
-        vocab_size=vocab_size, embedding_width=embedding_width,
-        dtype=policy)
+        vocab_size=vocab_size, embedding_width=embedding_width, dtype=policy)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -214,5 +213,6 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
     output = model.predict(input_data)
     self.assertEqual(tf.float32, output.dtype)
 
 if __name__ == "__main__":
   tf.test.main()
@@ -171,22 +171,20 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
       inputs: An tensor whose second dimension will be used as `length`. If
         `None`, the other `length` argument must be specified.
       length: An optional integer specifying the number of positions. If both
-        `inputs` and `length` are spcified, `length` must be equal to the
-        second dimension of `inputs`.
+        `inputs` and `length` are spcified, `length` must be equal to the second
+        dimension of `inputs`.
 
     Returns:
       A tensor in shape of [length, hidden_size].
     """
     if inputs is None and length is None:
-      raise ValueError(
-          "If inputs is None, `length` must be set in "
-          "RelativePositionEmbedding().")
+      raise ValueError("If inputs is None, `length` must be set in "
+                       "RelativePositionEmbedding().")
     if inputs is not None:
       input_shape = tf_utils.get_shape_list(inputs)
       if length is not None and length != input_shape[1]:
         raise ValueError(
-            "If inputs is not None, `length` must equal to input_shape[1]."
-        )
+            "If inputs is not None, `length` must equal to input_shape[1].")
       length = input_shape[1]
     position = tf.cast(tf.range(length), tf.float32)
     num_timescales = self._hidden_size // 2
@@ -197,8 +195,8 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
     inv_timescales = min_timescale * tf.exp(
         tf.cast(tf.range(num_timescales), tf.float32) *
         -log_timescale_increment)
-    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales,
-                                                               0)
-    position_embeddings = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)],
-                                    axis=1)
+    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
+        inv_timescales, 0)
+    position_embeddings = tf.concat(
+        [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
     return position_embeddings
@@ -127,5 +127,6 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
     self.assertAllEqual(output_tensor, expected_output_tensor)
 
 if __name__ == "__main__":
   tf.test.main()
@@ -161,7 +161,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
     self._rezero_a = self.add_weight(
         name="rezero_alpha",
         initializer=tf.keras.initializers.Zeros(),
-        trainable=True, dtype=tf.float32)
+        trainable=True,
+        dtype=tf.float32)
 
     super(ReZeroTransformer, self).build(input_shape)
...