Internal change

PiperOrigin-RevId: 384018258

Internal change
PiperOrigin-RevId: 384018258
0e74158f · A. Unique TensorFlower · 5f23689e · 0e74158f · 0e74158f
Commit 0e74158f authored Jul 10, 2021 by A. Unique TensorFlower
2 changed files
--- a/official/nlp/data/sentence_prediction_dataloader.py
+++ b/official/nlp/data/sentence_prediction_dataloader.py
@@ -222,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
    """Berts preprocess."""
    segments = [record[x] for x in self._text_fields]
    model_inputs = self._text_processor(segments)
-    if self._include_example_id:
+    for key in record:
-      model_inputs['example_id'] = record['example_id']
+      if key not in self._text_fields:
-    model_inputs[self._label_field] = record[self._label_field]
+        model_inputs[key] = record[key]
    return model_inputs
-  def _decode(self, record: tf.Tensor):
+  def name_to_features_spec(self):
-    """Decodes a serialized tf.Example."""
    name_to_features = {}
    for text_field in self._text_fields:
      name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
@@ -237,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
    name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
+    return name_to_features
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    example = tf.io.parse_single_example(record, self.name_to_features_spec())
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:

--- a/official/nlp/data/sentence_prediction_dataloader_test.py
+++ b/official/nlp/data/sentence_prediction_dataloader_test.py
@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features = next(iter(dataset))
    label_field = data_config.label_field
-    self.assertCountEqual(
+    expected_keys = [
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
-        features.keys())
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features = next(iter(dataset))
    label_field = data_config.label_field
-    self.assertCountEqual(
+    expected_keys = [
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
-        features.keys())
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
    dataset = loader.SentencePredictionTextDataLoader(data_config).load()
    features = next(iter(dataset))
    label_field = data_config.label_field
-    self.assertCountEqual(
+    expected_keys = [
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
-        features.keys())
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
    self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))