Commit e0b6ce02 authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 327830072
parent fe30e189
@@ -33,9 +33,14 @@ _UNK_TOKEN = "[UNK]"
 class InputExample(object):
   """A single training/test example for token classification."""

-  def __init__(self, sentence_id, words=None, label_ids=None):
+  def __init__(self,
+               sentence_id,
+               sub_sentence_id=0,
+               words=None,
+               label_ids=None):
     """Constructs an InputExample."""
     self.sentence_id = sentence_id
+    self.sub_sentence_id = sub_sentence_id
     self.words = words if words else []
     self.label_ids = label_ids if label_ids else []
@@ -146,7 +151,7 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
   # Needs additional [CLS] and [SEP] tokens.
   max_length = max_length - 2
   new_examples = []
-  new_example = InputExample(sentence_id=example.sentence_id)
+  new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
   for i, word in enumerate(example.words):
     if any([x < 0 for x in example.label_ids]):
       raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
@@ -160,7 +165,10 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
     if len(subwords) + len(new_example.words) > max_length:
       # Start a new example.
       new_examples.append(new_example)
-      new_example = InputExample(sentence_id=example.sentence_id)
+      last_sub_sentence_id = new_example.sub_sentence_id
+      new_example = InputExample(
+          sentence_id=example.sentence_id,
+          sub_sentence_id=last_sub_sentence_id + 1)

     for j, subword in enumerate(subwords):
       # Use the real label for the first subword, and pad label for
@@ -203,6 +211,7 @@ def _convert_single_example(example, max_seq_length, tokenizer):
   features["segment_ids"] = create_int_feature(segment_ids)
   features["label_ids"] = create_int_feature(label_ids)
   features["sentence_id"] = create_int_feature([example.sentence_id])
+  features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])

   tf_example = tf.train.Example(features=tf.train.Features(feature=features))
   return tf_example
......
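To make the splitting behavior above concrete, here is a small standalone sketch (not part of the change) of how `_tokenize_example` now assigns sub-sentence ids: every overflow chunk keeps the original `sentence_id` and receives the next `sub_sentence_id`. The word/subword counts below are made up for illustration.

# Illustration only: mirrors the counter logic added to _tokenize_example above.
def split_into_chunks(subword_counts, max_length):
  """Returns (sub_sentence_id, num_subwords) pairs for one sentence."""
  chunks = []
  sub_sentence_id, current_size = 0, 0
  for count in subword_counts:
    if current_size + count > max_length:
      # The current chunk is full: close it and start the next sub-sentence.
      chunks.append((sub_sentence_id, current_size))
      sub_sentence_id += 1
      current_size = 0
    current_size += count
  chunks.append((sub_sentence_id, current_size))
  return chunks

# A hypothetical sentence whose five words tokenize into 3, 2, 4, 1 and 5
# subwords, packed into windows of at most 8 subwords:
print(split_into_chunks([3, 2, 4, 1, 5], max_length=8))
# -> [(0, 5), (1, 5), (2, 5)]: three sub-sentences of the same sentence.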
@@ -52,6 +52,7 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
+      name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)

     example = tf.io.parse_single_example(record, name_to_features)
@@ -74,6 +75,8 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       x['sentence_id'] = record['sentence_id']
+      x['sub_sentence_id'] = record['sub_sentence_id']

     y = record['label_ids']
     return (x, y)
......
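For reference, a minimal roundtrip sketch of the record layout the loader handles when `include_sentence_id` is enabled. The feature values and `seq_length=4` are fabricated for this sketch; the feature names and dtypes follow the converter and the parse spec above.

import tensorflow as tf

seq_length = 4  # arbitrary length for this sketch


def _int_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

# Fabricated record for sub-sentence 0 of sentence 3.
record = tf.train.Example(features=tf.train.Features(feature={
    'input_ids': _int_feature([101, 7592, 2088, 102]),
    'input_mask': _int_feature([1, 1, 1, 1]),
    'segment_ids': _int_feature([0, 0, 0, 0]),
    'label_ids': _int_feature([0, 2, 5, 0]),
    'sentence_id': _int_feature([3]),
    'sub_sentence_id': _int_feature([0]),
})).SerializeToString()

# Parse spec matching the decoder when include_sentence_id is True.
name_to_features = {
    'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'label_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'sentence_id': tf.io.FixedLenFeature([], tf.int64),
    'sub_sentence_id': tf.io.FixedLenFeature([], tf.int64),
}
parsed = tf.io.parse_single_example(record, name_to_features)
print(int(parsed['sentence_id']), int(parsed['sub_sentence_id']))  # 3 0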
@@ -16,13 +16,14 @@
 """Tests for official.nlp.data.tagging_data_loader."""

 import os

+from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf

 from official.nlp.data import tagging_dataloader


-def _create_fake_dataset(output_path, seq_length):
+def _create_fake_dataset(output_path, seq_length, include_sentence_id):
   """Creates a fake dataset."""
   writer = tf.io.TFRecordWriter(output_path)
@@ -30,7 +31,7 @@ def _create_fake_dataset(output_path, seq_length):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f

-  for _ in range(100):
+  for i in range(100):
     features = {}
     input_ids = np.random.randint(100, size=(seq_length))
     features['input_ids'] = create_int_feature(input_ids)
@@ -38,32 +39,44 @@ def _create_fake_dataset(output_path, seq_length):
     features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
     features['label_ids'] = create_int_feature(
         np.random.randint(10, size=(seq_length)))
+    if include_sentence_id:
+      features['sentence_id'] = create_int_feature([i])
+      features['sub_sentence_id'] = create_int_feature([0])

     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
   writer.close()


-class TaggingDataLoaderTest(tf.test.TestCase):
+class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

-  def test_load_dataset(self):
+  @parameterized.parameters(True, False)
+  def test_load_dataset(self, include_sentence_id):
     seq_length = 16
     batch_size = 10
     train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
-    _create_fake_dataset(train_data_path, seq_length)
+    _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
     data_config = tagging_dataloader.TaggingDataConfig(
         input_path=train_data_path,
         seq_length=seq_length,
-        global_batch_size=batch_size)
+        global_batch_size=batch_size,
+        include_sentence_id=include_sentence_id)

     dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
     features, labels = next(iter(dataset))
-    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
-                          features.keys())
+
+    expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
+    if include_sentence_id:
+      expected_keys.extend(['sentence_id', 'sub_sentence_id'])
+    self.assertCountEqual(expected_keys, features.keys())
+
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
     self.assertEqual(labels.shape, (batch_size, seq_length))
+    if include_sentence_id:
+      self.assertEqual(features['sentence_id'].shape, (batch_size,))
+      self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))


 if __name__ == '__main__':
......
@@ -214,8 +214,9 @@ class TaggingTask(base_task.Task):
   }


-def predict(task: TaggingTask, params: cfg.DataConfig,
-            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+def predict(task: TaggingTask,
+            params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
   """Predicts on the input data.

   Args:
@@ -224,46 +225,50 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
     model: A keras.Model.

   Returns:
-    A tuple of `predict_ids` and `sentence_ids`, which are list with length
-    of `num_examples`. Each element in `predict_ids` is a sequence of
-    predicted per-word label id, and each element in `sentence_ids` is the
-    sentence id of the corresponding example.
+    A list of tuples. Each tuple contains `sentence_id`, `sub_sentence_id` and
+    a list of predicted ids.
   """

   def predict_step(inputs):
     """Replicated prediction calculation."""
     x, y = inputs
     sentence_ids = x.pop('sentence_id')
+    sub_sentence_ids = x.pop('sub_sentence_id')
     outputs = task.inference_step(x, model)
     predict_ids = outputs['predict_ids']
     label_mask = tf.greater_equal(y, 0)
     return dict(
         predict_ids=predict_ids,
         label_mask=label_mask,
-        sentence_ids=sentence_ids)
+        sentence_ids=sentence_ids,
+        sub_sentence_ids=sub_sentence_ids)

   def aggregate_fn(state, outputs):
     """Concatenates model's outputs."""
     if state is None:
-      state = {'predict_ids': [], 'sentence_ids': []}
+      state = []

-    cur_predict_ids = state['predict_ids']
-    cur_sentence_ids = state['sentence_ids']
-    for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
-        outputs['predict_ids'], outputs['label_mask'], outputs['sentence_ids']):
-      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
-          batch_predict_ids.numpy(), batch_label_mask.numpy(),
-          batch_sentence_ids.numpy()):
-        cur_sentence_ids.append(tmp_sentence_id)
-        cur_predict_ids.append([])
+    for (batch_predict_ids, batch_label_mask, batch_sentence_ids,
+         batch_sub_sentence_ids) in zip(outputs['predict_ids'],
+                                        outputs['label_mask'],
+                                        outputs['sentence_ids'],
+                                        outputs['sub_sentence_ids']):
+      for (tmp_predict_ids, tmp_label_mask, tmp_sentence_id,
+           tmp_sub_sentence_id) in zip(batch_predict_ids.numpy(),
+                                       batch_label_mask.numpy(),
+                                       batch_sentence_ids.numpy(),
+                                       batch_sub_sentence_ids.numpy()):
+        real_predict_ids = []
         assert len(tmp_predict_ids) == len(tmp_label_mask)
         for i in range(len(tmp_predict_ids)):
           # Skip the padding label.
           if tmp_label_mask[i]:
-            cur_predict_ids[-1].append(tmp_predict_ids[i])
+            real_predict_ids.append(tmp_predict_ids[i])
+        state.append((tmp_sentence_id, tmp_sub_sentence_id, real_predict_ids))
     return state

   dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                  task.build_inputs, params)
   outputs = utils.predict(predict_step, aggregate_fn, dataset)
-  return outputs['predict_ids'], outputs['sentence_ids']
+  return sorted(outputs, key=lambda x: (x[0], x[1]))
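Since `predict` now returns one `(sentence_id, sub_sentence_id, predict_ids)` tuple per sub-sentence, sorted by the first two fields, a caller that wants one prediction sequence per original sentence can concatenate the chunks. A small sketch with made-up outputs (not part of the change):

import itertools

# Hypothetical predict() output: one tuple per sub-sentence, already sorted.
results = [
    (0, 0, [1, 2, 3]),
    (1, 0, [4, 5]),
    (1, 1, [6]),  # sentence 1 was long enough to be split into two chunks
]

# Concatenate the chunks of each sentence back into one label sequence.
merged = {
    sentence_id: [label for _, _, ids in group for label in ids]
    for sentence_id, group in itertools.groupby(results, key=lambda r: r[0])
}
print(merged)  # {0: [1, 2, 3], 1: [4, 5, 6]}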
@@ -44,6 +44,7 @@ def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
     features["label_ids"] = create_int_feature(
         np.random.random_integers(-1, num_labels - 1, size=(seq_length)))
     features["sentence_id"] = create_int_feature([i])
+    features["sub_sentence_id"] = create_int_feature([0])

     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
@@ -189,9 +190,9 @@ class TaggingTest(tf.test.TestCase):
         drop_remainder=False,
         include_sentence_id=True)

-    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
-    self.assertLen(predict_ids, num_examples)
-    self.assertLen(sentence_ids, num_examples)
+    results = tagging.predict(task, test_data_config, model)
+    self.assertLen(results, num_examples)
+    self.assertLen(results[0], 3)


 if __name__ == "__main__":
......