"docker/vscode:/vscode.git/clone" did not exist on "4b4565077c12626e20197233d25b71a2b8504e67"
Commit 2284f823 authored by Chen Chen, committed by A. Unique TensorFlower

Support creating fine-tuning data for the tagging task (XTREME's udpos/panx).

PiperOrigin-RevId: 318829996
parent db39ef82
--- a/official/nlp/data/create_finetuning_data.py
+++ b/official/nlp/data/create_finetuning_data.py
@@ -32,14 +32,16 @@ from official.nlp.data import sentence_retrieval_lib
 from official.nlp.data import squad_lib as squad_lib_wp
 # sentence-piece tokenizer based squad_lib
 from official.nlp.data import squad_lib_sp
+from official.nlp.data import tagging_data_lib
 
 FLAGS = flags.FLAGS
 
+# TODO(chendouble): consider moving each task to its own binary.
 flags.DEFINE_enum(
     "fine_tuning_task_type", "classification",
-    ["classification", "regression", "squad", "retrieval"],
+    ["classification", "regression", "squad", "retrieval", "tagging"],
     "The name of the BERT fine tuning task for which data "
-    "will be generated..")
+    "will be generated.")
 
 # BERT classification specific flags.
 flags.DEFINE_string(
@@ -56,9 +58,6 @@ flags.DEFINE_enum("classification_task_name", "MNLI",
                   "only and for XNLI is all languages combined. Same for "
                   "PAWS-X.")
 
-flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
-                  "The name of sentence retrieval task for scoring")
-
 # XNLI task specific flag.
 flags.DEFINE_string(
     "xnli_language", "en",
@@ -71,6 +70,14 @@ flags.DEFINE_string(
     "Language of trainig data for PAWS-X task. If the value is 'all', the data "
     "of all languages will be used for training.")
 
+# Retrieva task specific flags
+flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
+                  "The name of sentence retrieval task for scoring")
+
+# Tagging task specific flags
+flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
+                  "The name of BERT tagging (token classification) task.")
+
 # BERT Squad task specific flags.
 flags.DEFINE_string(
     "squad_data_file", None,
@@ -284,6 +291,34 @@ def generate_retrieval_dataset():
       FLAGS.max_seq_length)
 
 
+def generate_tagging_dataset():
+  """Generates tagging dataset."""
+  processors = {
+      "panx": tagging_data_lib.PanxProcessor,
+      "udpos": tagging_data_lib.UdposProcessor,
+  }
+  task_name = FLAGS.tagging_task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % task_name)
+
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  elif FLAGS.tokenizer_impl == "sentence_piece":
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+  else:
+    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)
+
+  processor = processors[task_name]()
+  return tagging_data_lib.generate_tf_record_from_data_file(
+      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
+      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
+      FLAGS.test_data_output_path, processor_text_fn)
+
+
 def main(_):
   if FLAGS.tokenizer_impl == "word_piece":
     if not FLAGS.vocab_file:
@@ -304,8 +339,11 @@ def main(_):
     input_meta_data = generate_regression_dataset()
   elif FLAGS.fine_tuning_task_type == "retrieval":
     input_meta_data = generate_retrieval_dataset()
-  else:
+  elif FLAGS.fine_tuning_task_type == "squad":
     input_meta_data = generate_squad_dataset()
+  else:
+    assert FLAGS.fine_tuning_task_type == "tagging"
+    input_meta_data = generate_tagging_dataset()
 
   tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
   with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:

New file: official/nlp/data/tagging_data_lib.py
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os
from absl import logging
import tensorflow as tf
from official.nlp.data import classifier_data_lib
# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1
# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"
class InputExample(object):
  """A single training/test example for token classification."""

  def __init__(self, sentence_id, words=None, label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []

  def add_word_and_label_id(self, word, label_id):
    """Adds word and label_id pair in the example."""
    self.words.append(word)
    self.label_ids.append(label_id)
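As a quick illustration (with hypothetical label ids), an example can be built up one token/label pair at a time:

# Illustrative sketch: label ids here assume the panx label list, where
# "B-PER" is index 1 and "O" is index 0.
example = InputExample(sentence_id=0)
example.add_word_and_label_id("John", 1)   # hypothetical B-PER
example.add_word_and_label_id("runs", 0)   # hypothetical O
assert example.words == ["John", "runs"]
assert example.label_ids == [1, 0]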
def _read_one_file(file_name, label_list):
  """Reads one file and returns a list of `InputExample` instances."""
  lines = tf.io.gfile.GFile(file_name, "r").readlines()
  examples = []
  label_id_map = {label: i for i, label in enumerate(label_list)}
  sentence_id = 0
  example = InputExample(sentence_id=0)
  for line in lines:
    line = line.strip("\n")
    if line:
      # The format is: <token>\t<label> for train/dev set and <token> for test.
      items = line.split("\t")
      assert len(items) == 2 or len(items) == 1
      token = items[0].strip()
      # Assign a dummy label_id for test set
      label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
      example.add_word_and_label_id(token, label_id)
    else:
      # Empty line indicates a new sentence.
      if example.words:
        examples.append(example)
        sentence_id += 1
        example = InputExample(sentence_id=sentence_id)

  if example.words:
    examples.append(example)
  return examples
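A minimal sketch of the file format this expects, assuming a throwaway path under /tmp and the panx label list: each line is "<token>\t<label>", and a blank line ends a sentence (test files contain only tokens).

# Hypothetical train-en.tsv content and the resulting InputExamples.
sample = "John\tB-PER\nlives\tO\nhere\tO\n\nParis\tB-LOC\n\n"
with tf.io.gfile.GFile("/tmp/train-en.tsv", "w") as f:  # illustrative path
  f.write(sample)

examples = _read_one_file(
    "/tmp/train-en.tsv",
    ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"])
assert len(examples) == 2
assert examples[0].words == ["John", "lives", "here"]
assert examples[0].label_ids == [1, 0, 0]  # indices into the label list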
class PanxProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Panx data set."""
  supported_languages = [
      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
      "tr", "et", "fi", "hu"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

  @staticmethod
  def get_processor_name():
    return "panx"
class UdposProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Udpos data set."""
  supported_languages = [
      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]

  @staticmethod
  def get_processor_name():
    return "udpos"
def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  """Tokenizes words and breaks long example into short ones."""
  # Needs additional [CLS] and [SEP] tokens.
  max_length = max_length - 2
  new_examples = []
  new_example = InputExample(sentence_id=example.sentence_id)
  for i, word in enumerate(example.words):
    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
    if (not subwords or len(subwords) > max_length) and word:
      subwords = [_UNK_TOKEN]

    if len(subwords) + len(new_example.words) > max_length:
      # Start a new example.
      new_examples.append(new_example)
      new_example = InputExample(sentence_id=example.sentence_id)

    for j, subword in enumerate(subwords):
      # Use the real label for the first subword, and the padding label for
      # the remaining subwords.
      subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
      new_example.add_word_and_label_id(subword, subword_label)

  if new_example.words:
    new_examples.append(new_example)
  return new_examples
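A minimal sketch of the splitting behavior, using a hypothetical toy tokenizer (not a real WordPiece vocabulary) so the example stays self-contained:

# Toy tokenizer (illustrative only): splits each word into two-character
# pieces, e.g. "weekend" -> ["we", "ek", "en", "d"].
class _ToyTokenizer(object):

  def tokenize(self, word):
    return [word[i:i + 2] for i in range(0, len(word), 2)]

long_example = InputExample(
    sentence_id=0,
    words=["what", "a", "great", "weekend"],
    label_ids=[7, 5, 9, 10])
# max_length=6 leaves room for 4 subwords per piece (two slots are reserved
# for [CLS]/[SEP]), so the sentence is broken into three InputExamples; only
# the first subword of each word keeps its real label, the rest get -1.
pieces = _tokenize_example(long_example, max_length=6, tokenizer=_ToyTokenizer())
assert len(pieces) == 3
assert pieces[0].words == ["wh", "at", "a"]
assert pieces[0].label_ids == [7, -1, 5]
assert all(p.sentence_id == 0 for p in pieces)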
def _convert_single_example(example, max_seq_length, tokenizer):
  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
  tokens = ["[CLS]"]
  tokens.extend(example.words)
  tokens.append("[SEP]")
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  label_ids = [_PADDING_LABEL_ID]
  if any([x < 0 for x in example.label_ids]):
    raise ValueError("Unexpected negative label_id: %s" % example.label_ids)

  label_ids.extend(example.label_ids)
  label_ids.append(_PADDING_LABEL_ID)
  segment_ids = [0] * len(input_ids)
  input_mask = [1] * len(input_ids)

  # Pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    label_ids.append(_PADDING_LABEL_ID)

  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  features = collections.OrderedDict()
  features["input_ids"] = create_int_feature(input_ids)
  features["input_mask"] = create_int_feature(input_mask)
  features["segment_ids"] = create_int_feature(segment_ids)
  features["label_ids"] = create_int_feature(label_ids)
  features["sentence_id"] = create_int_feature([example.sentence_id])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example
def write_example_to_file(examples,
                          tokenizer,
                          max_seq_length,
                          output_file,
                          text_preprocessing=None):
  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.

  Note that `text_preprocessing`, if provided, is applied to each word before
  tokenization. Also, if the length of a sentence (plus the special [CLS] and
  [SEP] tokens) exceeds `max_seq_length`, the long sentence will be broken into
  multiple short examples. For example:

  Example (text_preprocessing=lowercase, max_seq_length=5)
  words:        ["What", "a", "great", "weekend"]
  labels:       [     7,   5,       9,        10]
  sentence_id:  0
  preprocessed: ["what", "a", "great", "weekend"]
  tokenized:    ["what", "a", "great", "week", "##end"]

  will result in two tf.example protos:

  tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
  label_ids:   [     -1,      7,   5,       9,      -1]
  input_mask:  [      1,      1,   1,       1,       1]
  segment_ids: [      0,      0,   0,       0,       0]
  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
  sentence_id: 0

  tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
  label_ids:   [     -1,     10,      -1,      -1,      -1]
  input_mask:  [      1,      1,       1,       1,       0]
  segment_ids: [      0,      0,       0,       0,       0]
  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
  sentence_id: 0

  Note the use of -1 in `label_ids` to indicate that a token should not be
  considered for classification (e.g., trailing ## wordpieces or special
  tokens). Token classification models should accordingly ignore these when
  calculating loss, metrics, etc.

  Args:
    examples: A list of `InputExample` instances.
    tokenizer: The tokenizer to be applied on the data.
    max_seq_length: Maximum length of generated sequences.
    output_file: The name of the output tfrecord file.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    The total number of tf.train.Example protos written to file.
  """
  tf.io.gfile.makedirs(os.path.dirname(output_file))
  writer = tf.io.TFRecordWriter(output_file)
  num_tokenized_examples = 0
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)

    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
                                           text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(per_tokenized_example,
                                           max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
  return num_tokenized_examples
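To sanity-check an output file, the records can be parsed back with a feature spec mirroring _convert_single_example; the path and sequence length below are illustrative assumptions:

# Illustrative sketch: reading back a file produced by write_example_to_file.
# The feature names mirror _convert_single_example; the path and the
# max_seq_length value are assumptions for this example.
max_seq_length = 128
name_to_features = {
    "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "sentence_id": tf.io.FixedLenFeature([1], tf.int64),
}

dataset = tf.data.TFRecordDataset("/tmp/panx_train.tf_record")
parsed = dataset.map(
    lambda record: tf.io.parse_single_example(record, name_to_features))
for features in parsed.take(1):
  print(features["label_ids"])  # -1 marks positions excluded from the loss.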
def token_classification_meta_data(train_data_size,
                                   max_seq_length,
                                   num_labels,
                                   eval_data_size=None,
                                   test_data_size=None,
                                   label_list=None,
                                   processor_type=None):
  """Creates metadata for tagging (token classification) datasets."""
  meta_data = {
      "train_data_size": train_data_size,
      "max_seq_length": max_seq_length,
      "num_labels": num_labels,
      "task_type": "tagging",
      "label_type": "int",
      "label_shape": [max_seq_length],
  }
  if eval_data_size:
    meta_data["eval_data_size"] = eval_data_size
  if test_data_size:
    meta_data["test_data_size"] = test_data_size
  if label_list:
    meta_data["label_list"] = label_list
  if processor_type:
    meta_data["processor_type"] = processor_type
  return meta_data
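For a hypothetical panx run, the returned metadata would look roughly like this (the example counts are made up):

# Illustrative output for assumed dataset sizes.
meta = token_classification_meta_data(
    train_data_size=20000,
    max_seq_length=128,
    num_labels=7,
    eval_data_size=10000,
    label_list=["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"],
    processor_type="panx")
# meta == {
#     "train_data_size": 20000, "max_seq_length": 128, "num_labels": 7,
#     "task_type": "tagging", "label_type": "int", "label_shape": [128],
#     "eval_data_size": 10000,
#     "label_list": [...], "processor_type": "panx",
# }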
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      tokenizer,
                                      max_seq_length,
                                      train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data."""
  common_kwargs = dict(tokenizer=tokenizer, max_seq_length=max_seq_length,
                       text_preprocessing=text_preprocessing)

  train_examples = processor.get_train_examples(data_dir)
  train_data_size = write_example_to_file(
      train_examples, output_file=train_data_output_path, **common_kwargs)

  eval_examples = processor.get_dev_examples(data_dir)
  eval_data_size = write_example_to_file(
      eval_examples, output_file=eval_data_output_path, **common_kwargs)

  test_input_data_examples = processor.get_test_examples(data_dir)
  test_data_size = {}
  for language, examples in test_input_data_examples.items():
    test_data_size[language] = write_example_to_file(
        examples,
        output_file=test_data_output_path.format(language),
        **common_kwargs)

  labels = processor.get_labels()
  meta_data = token_classification_meta_data(
      train_data_size,
      max_seq_length,
      len(labels),
      eval_data_size,
      test_data_size,
      label_list=labels,
      processor_type=processor.get_processor_name())
  return meta_data
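Putting it together, a hedged end-to-end sketch of producing panx records directly from this library: the paths, vocab file, and sequence length are assumptions, and the tokenization import is assumed to match the one used by create_finetuning_data.py.

# End-to-end sketch (illustrative paths/values). This mirrors what the new
# generate_tagging_dataset() in create_finetuning_data.py does with flags.
from official.nlp.bert import tokenization  # assumed import path

processor = PanxProcessor()
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)  # assumed vocab path

input_meta_data = generate_tf_record_from_data_file(
    processor=processor,
    data_dir="/path/to/panx",  # expects train-en.tsv, dev-en.tsv, test-<lang>.tsv
    tokenizer=tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/panx_train.tf_record",
    eval_data_output_path="/tmp/panx_eval.tf_record",
    # "{}" is filled with each test language, e.g. panx_test_ar.tf_record.
    test_data_output_path="/tmp/panx_test_{}.tf_record",
    text_preprocessing=tokenization.convert_to_unicode)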