Commit e0b6ce02 authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 327830072
parent fe30e189
@@ -33,9 +33,14 @@ _UNK_TOKEN = "[UNK]"
 class InputExample(object):
   """A single training/test example for token classification."""

-  def __init__(self, sentence_id, words=None, label_ids=None):
+  def __init__(self,
+               sentence_id,
+               sub_sentence_id=0,
+               words=None,
+               label_ids=None):
     """Constructs an InputExample."""
     self.sentence_id = sentence_id
+    self.sub_sentence_id = sub_sentence_id
     self.words = words if words else []
     self.label_ids = label_ids if label_ids else []
@@ -146,7 +151,7 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
   # Needs additional [CLS] and [SEP] tokens.
   max_length = max_length - 2
   new_examples = []
-  new_example = InputExample(sentence_id=example.sentence_id)
+  new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
   for i, word in enumerate(example.words):
     if any([x < 0 for x in example.label_ids]):
       raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
@@ -160,7 +165,10 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
     if len(subwords) + len(new_example.words) > max_length:
       # Start a new example.
       new_examples.append(new_example)
-      new_example = InputExample(sentence_id=example.sentence_id)
+      last_sub_sentence_id = new_example.sub_sentence_id
+      new_example = InputExample(
+          sentence_id=example.sentence_id,
+          sub_sentence_id=last_sub_sentence_id + 1)

     for j, subword in enumerate(subwords):
       # Use the real label for the first subword, and pad label for
@@ -203,6 +211,7 @@ def _convert_single_example(example, max_seq_length, tokenizer):
   features["segment_ids"] = create_int_feature(segment_ids)
   features["label_ids"] = create_int_feature(label_ids)
   features["sentence_id"] = create_int_feature([example.sentence_id])
+  features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])

   tf_example = tf.train.Example(features=tf.train.Features(feature=features))
   return tf_example
......
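To make the splitting behavior above concrete, here is a small standalone sketch (not part of the change) of how `_tokenize_example` now assigns sub-sentence ids: every overflow chunk keeps the original `sentence_id` and receives the next `sub_sentence_id`. The word/subword counts below are made up for illustration.

# Illustration only: mirrors the counter logic added to _tokenize_example above.
def split_into_chunks(subword_counts, max_length):
  """Returns (sub_sentence_id, num_subwords) pairs for one sentence."""
  chunks = []
  sub_sentence_id, current_size = 0, 0
  for count in subword_counts:
    if current_size + count > max_length:
      # The current chunk is full: close it and start the next sub-sentence.
      chunks.append((sub_sentence_id, current_size))
      sub_sentence_id += 1
      current_size = 0
    current_size += count
  chunks.append((sub_sentence_id, current_size))
  return chunks

# A hypothetical sentence whose five words tokenize into 3, 2, 4, 1 and 5
# subwords, packed into windows of at most 8 subwords:
print(split_into_chunks([3, 2, 4, 1, 5], max_length=8))
# -> [(0, 5), (1, 5), (2, 5)]: three sub-sentences of the same sentence.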
@@ -52,6 +52,7 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
+      name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)

     example = tf.io.parse_single_example(record, name_to_features)
@@ -74,6 +75,8 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       x['sentence_id'] = record['sentence_id']
+      x['sub_sentence_id'] = record['sub_sentence_id']

     y = record['label_ids']
     return (x, y)
......
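For reference, a minimal roundtrip sketch of the record layout the loader handles when `include_sentence_id` is enabled. The feature values and `seq_length=4` are fabricated for this sketch; the feature names and dtypes follow the converter and the parse spec above.

import tensorflow as tf

seq_length = 4  # arbitrary length for this sketch


def _int_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

# Fabricated record for sub-sentence 0 of sentence 3.
record = tf.train.Example(features=tf.train.Features(feature={
    'input_ids': _int_feature([101, 7592, 2088, 102]),
    'input_mask': _int_feature([1, 1, 1, 1]),
    'segment_ids': _int_feature([0, 0, 0, 0]),
    'label_ids': _int_feature([0, 2, 5, 0]),
    'sentence_id': _int_feature([3]),
    'sub_sentence_id': _int_feature([0]),
})).SerializeToString()

# Parse spec matching the decoder when include_sentence_id is True.
name_to_features = {
    'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'label_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'sentence_id': tf.io.FixedLenFeature([], tf.int64),
    'sub_sentence_id': tf.io.FixedLenFeature([], tf.int64),
}
parsed = tf.io.parse_single_example(record, name_to_features)
print(int(parsed['sentence_id']), int(parsed['sub_sentence_id']))  # 3 0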
@@ -16,13 +16,14 @@
 """Tests for official.nlp.data.tagging_data_loader."""

 import os

+from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf

 from official.nlp.data import tagging_dataloader


-def _create_fake_dataset(output_path, seq_length):
+def _create_fake_dataset(output_path, seq_length, include_sentence_id):
   """Creates a fake dataset."""
   writer = tf.io.TFRecordWriter(output_path)
@@ -30,7 +31,7 @@ def _create_fake_dataset(output_path, seq_length):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f

-  for _ in range(100):
+  for i in range(100):
     features = {}
     input_ids = np.random.randint(100, size=(seq_length))
     features['input_ids'] = create_int_feature(input_ids)
@@ -38,32 +39,44 @@ def _create_fake_dataset(output_path, seq_length):
     features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
     features['label_ids'] = create_int_feature(
         np.random.randint(10, size=(seq_length)))
+    if include_sentence_id:
+      features['sentence_id'] = create_int_feature([i])
+      features['sub_sentence_id'] = create_int_feature([0])

     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
   writer.close()


-class TaggingDataLoaderTest(tf.test.TestCase):
+class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):

-  def test_load_dataset(self):
+  @parameterized.parameters(True, False)
+  def test_load_dataset(self, include_sentence_id):
     seq_length = 16
     batch_size = 10
     train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
-    _create_fake_dataset(train_data_path, seq_length)
+    _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
     data_config = tagging_dataloader.TaggingDataConfig(
         input_path=train_data_path,
         seq_length=seq_length,
-        global_batch_size=batch_size)
+        global_batch_size=batch_size,
+        include_sentence_id=include_sentence_id)

     dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
     features, labels = next(iter(dataset))
-    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
-                          features.keys())
+
+    expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
+    if include_sentence_id:
+      expected_keys.extend(['sentence_id', 'sub_sentence_id'])
+    self.assertCountEqual(expected_keys, features.keys())
+
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
     self.assertEqual(labels.shape, (batch_size, seq_length))
+    if include_sentence_id:
+      self.assertEqual(features['sentence_id'].shape, (batch_size,))
+      self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))


 if __name__ == '__main__':
......
@@ -214,8 +214,9 @@ class TaggingTask(base_task.Task):
   }


-def predict(task: TaggingTask, params: cfg.DataConfig,
-            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+def predict(task: TaggingTask,
+            params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
   """Predicts on the input data.

   Args:
@@ -224,46 +225,50 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
     model: A keras.Model.

   Returns:
-    A tuple of `predict_ids` and `sentence_ids`, which are list with length
-    of `num_examples`. Each element in `predict_ids` is a sequence of
-    predicted per-word label id, and each element in `sentence_ids` is the
-    sentence id of the corresponding example.
+    A list of tuples. Each tuple contains `sentence_id`, `sub_sentence_id` and
+    a list of predicted ids.
   """

   def predict_step(inputs):
     """Replicated prediction calculation."""
     x, y = inputs
     sentence_ids = x.pop('sentence_id')
+    sub_sentence_ids = x.pop('sub_sentence_id')
     outputs = task.inference_step(x, model)
     predict_ids = outputs['predict_ids']
     label_mask = tf.greater_equal(y, 0)
     return dict(
         predict_ids=predict_ids,
         label_mask=label_mask,
-        sentence_ids=sentence_ids)
+        sentence_ids=sentence_ids,
+        sub_sentence_ids=sub_sentence_ids)

   def aggregate_fn(state, outputs):
     """Concatenates model's outputs."""
     if state is None:
-      state = {'predict_ids': [], 'sentence_ids': []}
+      state = []

-    cur_predict_ids = state['predict_ids']
-    cur_sentence_ids = state['sentence_ids']
-    for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
-        outputs['predict_ids'], outputs['label_mask'], outputs['sentence_ids']):
-      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
-          batch_predict_ids.numpy(), batch_label_mask.numpy(),
-          batch_sentence_ids.numpy()):
-        cur_sentence_ids.append(tmp_sentence_id)
-        cur_predict_ids.append([])
+    for (batch_predict_ids, batch_label_mask, batch_sentence_ids,
+         batch_sub_sentence_ids) in zip(outputs['predict_ids'],
+                                        outputs['label_mask'],
+                                        outputs['sentence_ids'],
+                                        outputs['sub_sentence_ids']):
+      for (tmp_predict_ids, tmp_label_mask, tmp_sentence_id,
+           tmp_sub_sentence_id) in zip(batch_predict_ids.numpy(),
+                                       batch_label_mask.numpy(),
+                                       batch_sentence_ids.numpy(),
+                                       batch_sub_sentence_ids.numpy()):
+        real_predict_ids = []
         assert len(tmp_predict_ids) == len(tmp_label_mask)
         for i in range(len(tmp_predict_ids)):
           # Skip the padding label.
           if tmp_label_mask[i]:
-            cur_predict_ids[-1].append(tmp_predict_ids[i])
+            real_predict_ids.append(tmp_predict_ids[i])
+        state.append((tmp_sentence_id, tmp_sub_sentence_id, real_predict_ids))
     return state

   dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                  task.build_inputs, params)
   outputs = utils.predict(predict_step, aggregate_fn, dataset)
-  return outputs['predict_ids'], outputs['sentence_ids']
+  return sorted(outputs, key=lambda x: (x[0], x[1]))
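Since `predict` now returns one `(sentence_id, sub_sentence_id, predict_ids)` tuple per sub-sentence, sorted by the first two fields, a caller that wants one prediction sequence per original sentence can concatenate the chunks. A small sketch with made-up outputs (not part of the change):

import itertools

# Hypothetical predict() output: one tuple per sub-sentence, already sorted.
results = [
    (0, 0, [1, 2, 3]),
    (1, 0, [4, 5]),
    (1, 1, [6]),  # sentence 1 was long enough to be split into two chunks
]

# Concatenate the chunks of each sentence back into one label sequence.
merged = {
    sentence_id: [label for _, _, ids in group for label in ids]
    for sentence_id, group in itertools.groupby(results, key=lambda r: r[0])
}
print(merged)  # {0: [1, 2, 3], 1: [4, 5, 6]}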
@@ -44,6 +44,7 @@ def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
     features["label_ids"] = create_int_feature(
         np.random.random_integers(-1, num_labels - 1, size=(seq_length)))
     features["sentence_id"] = create_int_feature([i])
+    features["sub_sentence_id"] = create_int_feature([0])

     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
@@ -189,9 +190,9 @@ class TaggingTest(tf.test.TestCase):
         drop_remainder=False,
         include_sentence_id=True)

-    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
-    self.assertLen(predict_ids, num_examples)
-    self.assertLen(sentence_ids, num_examples)
+    results = tagging.predict(task, test_data_config, model)
+    self.assertLen(results, num_examples)
+    self.assertLen(results[0], 3)


 if __name__ == "__main__":
......