"tests/vscode:/vscode.git/clone" did not exist on "599163e6453f6d246cb74582a40440968b9a1734"
Commit e0b6ce02 authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 327830072
parent fe30e189
@@ -33,9 +33,14 @@ _UNK_TOKEN = "[UNK]"
 class InputExample(object):
   """A single training/test example for token classification."""
 
-  def __init__(self, sentence_id, words=None, label_ids=None):
+  def __init__(self,
+               sentence_id,
+               sub_sentence_id=0,
+               words=None,
+               label_ids=None):
     """Constructs an InputExample."""
     self.sentence_id = sentence_id
+    self.sub_sentence_id = sub_sentence_id
     self.words = words if words else []
     self.label_ids = label_ids if label_ids else []
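
(Not part of the patch.) For orientation: after this change, one over-long input sentence can map to several `InputExample` objects that share a `sentence_id` and are numbered consecutively by `sub_sentence_id`, as the `_tokenize_example` hunks below show. A minimal sketch, assuming the `InputExample` class above is in scope; all ids, words, and labels are made up:

# Two chunks of one original sentence, roughly as the splitting logic in
# _tokenize_example would produce them (illustrative values only).
chunk_a = InputExample(sentence_id=7, sub_sentence_id=0,
                       words=["first", "chunk"], label_ids=[0, 1])
chunk_b = InputExample(sentence_id=7, sub_sentence_id=1,
                       words=["second", "chunk"], label_ids=[2, 3])

assert chunk_a.sentence_id == chunk_b.sentence_id
assert chunk_b.sub_sentence_id == chunk_a.sub_sentence_id + 1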
@@ -146,7 +151,7 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
   # Needs additional [CLS] and [SEP] tokens.
   max_length = max_length - 2
   new_examples = []
-  new_example = InputExample(sentence_id=example.sentence_id)
+  new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
   for i, word in enumerate(example.words):
     if any([x < 0 for x in example.label_ids]):
       raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
@@ -160,7 +165,10 @@ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
     if len(subwords) + len(new_example.words) > max_length:
       # Start a new example.
       new_examples.append(new_example)
-      new_example = InputExample(sentence_id=example.sentence_id)
+      last_sub_sentence_id = new_example.sub_sentence_id
+      new_example = InputExample(
+          sentence_id=example.sentence_id,
+          sub_sentence_id=last_sub_sentence_id + 1)
     for j, subword in enumerate(subwords):
       # Use the real label for the first subword, and pad label for
@@ -203,6 +211,7 @@ def _convert_single_example(example, max_seq_length, tokenizer):
   features["segment_ids"] = create_int_feature(segment_ids)
   features["label_ids"] = create_int_feature(label_ids)
   features["sentence_id"] = create_int_feature([example.sentence_id])
+  features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])
 
   tf_example = tf.train.Example(features=tf.train.Features(feature=features))
   return tf_example
......
@@ -52,6 +52,7 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
+      name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
 
     example = tf.io.parse_single_example(record, name_to_features)
@@ -74,6 +75,8 @@ class TaggingDataLoader(data_loader.DataLoader):
     }
     if self._include_sentence_id:
       x['sentence_id'] = record['sentence_id']
+      x['sub_sentence_id'] = record['sub_sentence_id']
+
     y = record['label_ids']
     return (x, y)
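
(Not part of the patch.) The writer side above stores `sub_sentence_id` as a single int64 value, and the loader reads it back with a scalar `FixedLenFeature`. A self-contained round-trip sketch under those assumptions; the record built here is fake and only for the demo:

import tensorflow as tf

def _int_feature(values):
  # Same kind of helper as create_int_feature above.
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

# Fake serialized record carrying the two scalar id features.
record = tf.train.Example(features=tf.train.Features(feature={
    'sentence_id': _int_feature([3]),
    'sub_sentence_id': _int_feature([2]),
})).SerializeToString()

# Mirrors the loader's spec for the two ids when include_sentence_id is set.
name_to_features = {
    'sentence_id': tf.io.FixedLenFeature([], tf.int64),
    'sub_sentence_id': tf.io.FixedLenFeature([], tf.int64),
}
example = tf.io.parse_single_example(record, name_to_features)
print(int(example['sentence_id']), int(example['sub_sentence_id']))  # -> 3 2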
......
@@ -16,13 +16,14 @@
 """Tests for official.nlp.data.tagging_data_loader."""
 import os
 
+from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf
 
 from official.nlp.data import tagging_dataloader
 
 
-def _create_fake_dataset(output_path, seq_length):
+def _create_fake_dataset(output_path, seq_length, include_sentence_id):
   """Creates a fake dataset."""
   writer = tf.io.TFRecordWriter(output_path)
@@ -30,7 +31,7 @@ def _create_fake_dataset(output_path, seq_length):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f
 
-  for _ in range(100):
+  for i in range(100):
     features = {}
     input_ids = np.random.randint(100, size=(seq_length))
     features['input_ids'] = create_int_feature(input_ids)
@@ -38,32 +39,44 @@ def _create_fake_dataset(output_path, seq_length):
     features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
     features['label_ids'] = create_int_feature(
         np.random.randint(10, size=(seq_length)))
+    if include_sentence_id:
+      features['sentence_id'] = create_int_feature([i])
+      features['sub_sentence_id'] = create_int_feature([0])
     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
   writer.close()
 
 
-class TaggingDataLoaderTest(tf.test.TestCase):
+class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
 
-  def test_load_dataset(self):
+  @parameterized.parameters(True, False)
+  def test_load_dataset(self, include_sentence_id):
     seq_length = 16
     batch_size = 10
     train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
-    _create_fake_dataset(train_data_path, seq_length)
+    _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
     data_config = tagging_dataloader.TaggingDataConfig(
         input_path=train_data_path,
         seq_length=seq_length,
-        global_batch_size=batch_size)
+        global_batch_size=batch_size,
+        include_sentence_id=include_sentence_id)
 
     dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
     features, labels = next(iter(dataset))
 
-    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
-                          features.keys())
+    expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
+    if include_sentence_id:
+      expected_keys.extend(['sentence_id', 'sub_sentence_id'])
+    self.assertCountEqual(expected_keys, features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
     self.assertEqual(labels.shape, (batch_size, seq_length))
+    if include_sentence_id:
+      self.assertEqual(features['sentence_id'].shape, (batch_size,))
+      self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))
 
 
 if __name__ == '__main__':
......
@@ -214,8 +214,9 @@ class TaggingTask(base_task.Task):
     }
 
 
-def predict(task: TaggingTask, params: cfg.DataConfig,
-            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+def predict(task: TaggingTask,
+            params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
   """Predicts on the input data.
 
   Args:
@@ -224,46 +225,50 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
     model: A keras.Model.
 
   Returns:
-    A tuple of `predict_ids` and `sentence_ids`, which are list with length
-    of `num_examples`. Each element in `predict_ids` is a sequence of
-    predicted per-word label id, and each element in `sentence_ids` is the
-    sentence id of the corresponding example.
+    A list of tuple. Each tuple contains `sentence_id`, `sub_sentence_id` and
+    a list of predicted ids.
   """
 
   def predict_step(inputs):
     """Replicated prediction calculation."""
     x, y = inputs
     sentence_ids = x.pop('sentence_id')
+    sub_sentence_ids = x.pop('sub_sentence_id')
     outputs = task.inference_step(x, model)
     predict_ids = outputs['predict_ids']
     label_mask = tf.greater_equal(y, 0)
     return dict(
         predict_ids=predict_ids,
         label_mask=label_mask,
-        sentence_ids=sentence_ids)
+        sentence_ids=sentence_ids,
+        sub_sentence_ids=sub_sentence_ids)
 
   def aggregate_fn(state, outputs):
     """Concatenates model's outputs."""
     if state is None:
-      state = {'predict_ids': [], 'sentence_ids': []}
-    cur_predict_ids = state['predict_ids']
-    cur_sentence_ids = state['sentence_ids']
-    for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
-        outputs['predict_ids'], outputs['label_mask'], outputs['sentence_ids']):
-      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
-          batch_predict_ids.numpy(), batch_label_mask.numpy(),
-          batch_sentence_ids.numpy()):
-        cur_sentence_ids.append(tmp_sentence_id)
-        cur_predict_ids.append([])
+      state = []
+
+    for (batch_predict_ids, batch_label_mask, batch_sentence_ids,
+         batch_sub_sentence_ids) in zip(outputs['predict_ids'],
+                                        outputs['label_mask'],
+                                        outputs['sentence_ids'],
+                                        outputs['sub_sentence_ids']):
+      for (tmp_predict_ids, tmp_label_mask, tmp_sentence_id,
+           tmp_sub_sentence_id) in zip(batch_predict_ids.numpy(),
+                                       batch_label_mask.numpy(),
+                                       batch_sentence_ids.numpy(),
+                                       batch_sub_sentence_ids.numpy()):
+        real_predict_ids = []
+        assert len(tmp_predict_ids) == len(tmp_label_mask)
         for i in range(len(tmp_predict_ids)):
           # Skip the padding label.
          if tmp_label_mask[i]:
-            cur_predict_ids[-1].append(tmp_predict_ids[i])
+            real_predict_ids.append(tmp_predict_ids[i])
+        state.append((tmp_sentence_id, tmp_sub_sentence_id, real_predict_ids))
     return state
 
   dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                  task.build_inputs, params)
   outputs = utils.predict(predict_step, aggregate_fn, dataset)
-  return outputs['predict_ids'], outputs['sentence_ids']
+  return sorted(outputs, key=lambda x: (x[0], x[1]))
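
(Not part of the patch.) `predict` now returns one tuple per sub-sentence, already sorted by `(sentence_id, sub_sentence_id)`, so a caller can stitch chunk predictions back into one label sequence per original sentence. A sketch with made-up ids:

import itertools

# Shape of what predict() returns: (sentence_id, sub_sentence_id, predict_ids).
results = [
    (0, 0, [1, 2]),   # sentence 0, first chunk
    (0, 1, [4, 5]),   # sentence 0, second chunk
    (1, 0, [3]),      # sentence 1, only chunk
]

merged = {}
for sentence_id, group in itertools.groupby(results, key=lambda x: x[0]):
  # Chunks arrive in sub_sentence_id order, so concatenating them restores
  # the per-word label sequence of the original sentence.
  merged[sentence_id] = [label for _, _, ids in group for label in ids]

assert merged == {0: [1, 2, 4, 5], 1: [3]}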
@@ -44,6 +44,7 @@ def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
     features["label_ids"] = create_int_feature(
         np.random.random_integers(-1, num_labels - 1, size=(seq_length)))
     features["sentence_id"] = create_int_feature([i])
+    features["sub_sentence_id"] = create_int_feature([0])
 
     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
@@ -189,9 +190,9 @@ class TaggingTest(tf.test.TestCase):
         drop_remainder=False,
         include_sentence_id=True)
-    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
-    self.assertLen(predict_ids, num_examples)
-    self.assertLen(sentence_ids, num_examples)
+    results = tagging.predict(task, test_data_config, model)
+    self.assertLen(results, num_examples)
+    self.assertLen(results[0], 3)
 
 
 if __name__ == "__main__":
......