ModelZoo / ResNet50_tensorflow, commit 2284f823

Authored Jun 29, 2020 by Chen Chen; committed by A. Unique TensorFlower on Jun 29, 2020.

Support create fine-tuning data for tagging task. (XTREME's udpos/panx)

PiperOrigin-RevId: 318829996
Parent: db39ef82
Showing 2 changed files with 391 additions and 6 deletions:

  official/nlp/data/create_finetuning_data.py   +44  -6
  official/nlp/data/tagging_data_lib.py         +347 -0
official/nlp/data/create_finetuning_data.py
@@ -32,14 +32,16 @@ from official.nlp.data import sentence_retrieval_lib
 from official.nlp.data import squad_lib as squad_lib_wp
 # sentence-piece tokenizer based squad_lib
 from official.nlp.data import squad_lib_sp
+from official.nlp.data import tagging_data_lib
 
 FLAGS = flags.FLAGS
 
+# TODO(chendouble): consider moving each task to its own binary.
 flags.DEFINE_enum(
     "fine_tuning_task_type", "classification",
-    ["classification", "regression", "squad", "retrieval"],
+    ["classification", "regression", "squad", "retrieval", "tagging"],
     "The name of the BERT fine tuning task for which data "
-    "will be generated..")
+    "will be generated.")
 
 # BERT classification specific flags.
 flags.DEFINE_string(
@@ -56,9 +58,6 @@ flags.DEFINE_enum("classification_task_name", "MNLI",
                   "only and for XNLI is all languages combined. Same for "
                   "PAWS-X.")
 
-flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
-                  "The name of sentence retrieval task for scoring")
-
 # XNLI task specific flag.
 flags.DEFINE_string(
     "xnli_language", "en",
@@ -71,6 +70,14 @@ flags.DEFINE_string(
     "Language of trainig data for PAWS-X task. If the value is 'all', the data "
     "of all languages will be used for training.")
 
+# Retrieva task specific flags
+flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
+                  "The name of sentence retrieval task for scoring")
+
+# Tagging task specific flags
+flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
+                  "The name of BERT tagging (token classification) task.")
+
 # BERT Squad task specific flags.
 flags.DEFINE_string(
     "squad_data_file", None,
@@ -284,6 +291,34 @@ def generate_retrieval_dataset():
       FLAGS.max_seq_length)
 
 
+def generate_tagging_dataset():
+  """Generates tagging dataset."""
+  processors = {
+      "panx": tagging_data_lib.PanxProcessor,
+      "udpos": tagging_data_lib.UdposProcessor,
+  }
+  task_name = FLAGS.tagging_task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % task_name)
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  elif FLAGS.tokenizer_impl == "sentence_piece":
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+  else:
+    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)
+
+  processor = processors[task_name]()
+  return tagging_data_lib.generate_tf_record_from_data_file(
+      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
+      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
+      FLAGS.test_data_output_path, processor_text_fn)
+
+
 def main(_):
   if FLAGS.tokenizer_impl == "word_piece":
     if not FLAGS.vocab_file:
@@ -304,8 +339,11 @@ def main(_):
     input_meta_data = generate_regression_dataset()
   elif FLAGS.fine_tuning_task_type == "retrieval":
     input_meta_data = generate_retrieval_dataset()
-  else:
+  elif FLAGS.fine_tuning_task_type == "squad":
     input_meta_data = generate_squad_dataset()
+  else:
+    assert FLAGS.fine_tuning_task_type == "tagging"
+    input_meta_data = generate_tagging_dataset()
 
   tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
   with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
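For orientation, a minimal sketch (not part of the commit) of how the new flags are expected to be combined. It drives the same code path programmatically that the command line would; all file paths are hypothetical placeholders, and the test output path carries a "{}" slot because the tagging pipeline writes one test file per language.

# Illustrative only: parse hypothetical flag values and run the new tagging path.
import sys
from absl import flags
from official.nlp.data import create_finetuning_data

FLAGS = flags.FLAGS
FLAGS(sys.argv[:1] + [
    "--fine_tuning_task_type=tagging",
    "--tagging_task_name=panx",
    "--tokenizer_impl=word_piece",
    "--vocab_file=/path/to/vocab.txt",   # placeholder
    "--input_data_dir=/path/to/panx",    # placeholder
    "--max_seq_length=128",
    "--train_data_output_path=/tmp/panx_train.tf_record",
    "--eval_data_output_path=/tmp/panx_eval.tf_record",
    "--test_data_output_path=/tmp/panx_test_{}.tf_record",
])
input_meta_data = create_finetuning_data.generate_tagging_dataset()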
official/nlp/data/tagging_data_lib.py  0 → 100644 (new file)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os

from absl import logging
import tensorflow as tf

from official.nlp.data import classifier_data_lib

# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1

# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"
class InputExample(object):
  """A single training/test example for token classification."""

  def __init__(self, sentence_id, words=None, label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []

  def add_word_and_label_id(self, word, label_id):
    """Adds word and label_id pair in the example."""
    self.words.append(word)
    self.label_ids.append(label_id)
def _read_one_file(file_name, label_list):
  """Reads one file and returns a list of `InputExample` instances."""
  lines = tf.io.gfile.GFile(file_name, "r").readlines()
  examples = []
  label_id_map = {label: i for i, label in enumerate(label_list)}
  sentence_id = 0
  example = InputExample(sentence_id=0)
  for line in lines:
    line = line.strip("\n")
    if line:
      # The format is: <token>\t<label> for train/dev set and <token> for test.
      items = line.split("\t")
      assert len(items) == 2 or len(items) == 1
      token = items[0].strip()

      # Assign a dummy label_id for test set
      label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
      example.add_word_and_label_id(token, label_id)
    else:
      # Empty line indicates a new sentence.
      if example.words:
        examples.append(example)
        sentence_id += 1
        example = InputExample(sentence_id=sentence_id)

  if example.words:
    examples.append(example)
  return examples
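For reference, a minimal sketch (not part of the commit) of the file format `_read_one_file` above expects: one `<token>\t<label>` pair per line, with a blank line between sentences. The demo path is a placeholder, and the label list is the Panx label set defined further down in this file.

import tensorflow as tf

# Hypothetical two-sentence train-style file; tabs separate token and label.
_DEMO = "John\tB-PER\nSmith\tI-PER\nvisited\tO\nParis\tB-LOC\n\nHello\tO\n"
with tf.io.gfile.GFile("/tmp/demo-train-en.tsv", "w") as f:
  f.write(_DEMO)

panx_labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
examples = _read_one_file("/tmp/demo-train-en.tsv", panx_labels)
# examples[0].words     -> ["John", "Smith", "visited", "Paris"]
# examples[0].label_ids -> [1, 2, 0, 3]
# examples[1].words     -> ["Hello"], with sentence_id == 1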
class PanxProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Panx data set."""
  supported_languages = [
      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
      "tr", "et", "fi", "hu"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

  @staticmethod
  def get_processor_name():
    return "panx"
class UdposProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Udpos data set."""
  supported_languages = [
      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]

  @staticmethod
  def get_processor_name():
    return "udpos"
def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  """Tokenizes words and breaks long example into short ones."""
  # Needs additional [CLS] and [SEP] tokens.
  max_length = max_length - 2
  new_examples = []
  new_example = InputExample(sentence_id=example.sentence_id)
  for i, word in enumerate(example.words):
    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
    if (not subwords or len(subwords) > max_length) and word:
      subwords = [_UNK_TOKEN]
    if len(subwords) + len(new_example.words) > max_length:
      # Start a new example.
      new_examples.append(new_example)
      new_example = InputExample(sentence_id=example.sentence_id)
    for j, subword in enumerate(subwords):
      # Use the real label for the first subword, and pad label for
      # the remainings.
      subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
      new_example.add_word_and_label_id(subword, subword_label)

  if new_example.words:
    new_examples.append(new_example)
  return new_examples
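A small illustrative sketch (not part of the commit) of `_tokenize_example`: the toy `FakeTokenizer` below stands in for a real WordPiece tokenizer to show how continuation sub-words receive `_PADDING_LABEL_ID` and how an over-long sentence is split into several `InputExample`s sharing one `sentence_id`.

class FakeTokenizer(object):
  """Illustrative stand-in for a WordPiece tokenizer: splits words on '-'."""

  def tokenize(self, word):
    return word.split("-")

example = InputExample(sentence_id=0,
                       words=["New-York", "is", "big"],
                       label_ids=[3, 0, 0])

short = _tokenize_example(example, max_length=10, tokenizer=FakeTokenizer())
# short[0].words     -> ["New", "York", "is", "big"]
# short[0].label_ids -> [3, -1, 0, 0]   (continuation sub-word gets -1)

split = _tokenize_example(example, max_length=4, tokenizer=FakeTokenizer())
# max_length=4 leaves room for only 2 sub-words per example after [CLS]/[SEP],
# so the sentence is broken into two InputExamples with the same sentence_id:
# split[0].words -> ["New", "York"], split[1].words -> ["is", "big"]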
def _convert_single_example(example, max_seq_length, tokenizer):
  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
  tokens = ["[CLS]"]
  tokens.extend(example.words)
  tokens.append("[SEP]")
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  label_ids = [_PADDING_LABEL_ID]
  if any([x < 0 for x in example.label_ids]):
    raise ValueError("Unexpected negative label_id: %s" % example.label_ids)

  label_ids.extend(example.label_ids)
  label_ids.append(_PADDING_LABEL_ID)

  segment_ids = [0] * len(input_ids)
  input_mask = [1] * len(input_ids)

  # Pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    label_ids.append(_PADDING_LABEL_ID)

  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  features = collections.OrderedDict()
  features["input_ids"] = create_int_feature(input_ids)
  features["input_mask"] = create_int_feature(input_mask)
  features["segment_ids"] = create_int_feature(segment_ids)
  features["label_ids"] = create_int_feature(label_ids)
  features["sentence_id"] = create_int_feature([example.sentence_id])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example
def write_example_to_file(examples,
                          tokenizer,
                          max_seq_length,
                          output_file,
                          text_preprocessing=None):
  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.

  Note that the words inside each example will be tokenized and be applied by
  `text_preprocessing` if available. Also, if the length of sentence (plus
  special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
  will be broken into multiple short examples. For example:

  Example (text_preprocessing=lowercase, max_seq_length=5)
    words:        ["What", "a", "great", "weekend"]
    labels:       [     7,   5,       9,        10]
    sentence_id:  0
    preprocessed: ["what", "a", "great", "weekend"]
    tokenized:    ["what", "a", "great", "week", "##end"]

  will result in two tf.example protos:

    tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
    label_ids:   [     -1,      7,   5,       9,      -1]
    input_mask:  [      1,      1,   1,       1,       1]
    segment_ids: [      0,      0,   0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

    tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
    label_ids:   [     -1,     10,      -1,      -1,      -1]
    input_mask:  [      1,      1,       1,       1,       0]
    segment_ids: [      0,      0,       0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

  Note the use of -1 in `label_ids` to indicate that a token should not be
  considered for classification (e.g., trailing ## wordpieces or special
  token). Token classification models should accordingly ignore these when
  calculating loss, metrics, etc...

  Args:
    examples: A list of `InputExample` instances.
    tokenizer: The tokenizer to be applied on the data.
    max_seq_length: Maximum length of generated sequences.
    output_file: The name of the output tfrecord file.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    The total number of tf.train.Example proto written to file.
  """
  tf.io.gfile.makedirs(os.path.dirname(output_file))
  writer = tf.io.TFRecordWriter(output_file)
  num_tokenized_examples = 0
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)

    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
                                           text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(per_tokenized_example,
                                           max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
  return num_tokenized_examples
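For readers of the generated files, a hedged sketch (not part of the commit) of how the fixed-length records written by `write_example_to_file` could be parsed back; the feature names and lengths mirror `_convert_single_example`, and the record path is a placeholder.

import tensorflow as tf

max_seq_length = 128  # must match the value used when writing

# Feature spec mirroring the features built in _convert_single_example.
name_to_features = {
    "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "sentence_id": tf.io.FixedLenFeature([1], tf.int64),
}

def _decode(record):
  return tf.io.parse_single_example(record, name_to_features)

# "/tmp/panx_train.tf_record" is a hypothetical output path.
dataset = tf.data.TFRecordDataset("/tmp/panx_train.tf_record").map(_decode)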
def token_classification_meta_data(train_data_size,
                                   max_seq_length,
                                   num_labels,
                                   eval_data_size=None,
                                   test_data_size=None,
                                   label_list=None,
                                   processor_type=None):
  """Creates metadata for tagging (token classification) datasets."""
  meta_data = {
      "train_data_size": train_data_size,
      "max_seq_length": max_seq_length,
      "num_labels": num_labels,
      "task_type": "tagging",
      "label_type": "int",
      "label_shape": [max_seq_length],
  }
  if eval_data_size:
    meta_data["eval_data_size"] = eval_data_size
  if test_data_size:
    meta_data["test_data_size"] = test_data_size
  if label_list:
    meta_data["label_list"] = label_list
  if processor_type:
    meta_data["processor_type"] = processor_type
  return meta_data
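As an illustration (not part of the file), metadata for a hypothetical panx run could look like this; the dataset sizes are made-up placeholders, while `num_labels`, `label_list`, and `processor_type` follow from `PanxProcessor`.

meta_data = token_classification_meta_data(
    train_data_size=20000,         # placeholder count
    max_seq_length=128,
    num_labels=7,                  # len(PanxProcessor().get_labels())
    eval_data_size=10000,          # placeholder count
    test_data_size={"en": 10000},  # placeholder, keyed by language
    label_list=["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"],
    processor_type="panx")
# meta_data["task_type"]   -> "tagging"
# meta_data["label_shape"] -> [128]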
def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
                                      max_seq_length, train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data."""
  common_kwargs = dict(
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      text_preprocessing=text_preprocessing)
  train_examples = processor.get_train_examples(data_dir)
  train_data_size = write_example_to_file(
      train_examples, output_file=train_data_output_path, **common_kwargs)

  eval_examples = processor.get_dev_examples(data_dir)
  eval_data_size = write_example_to_file(
      eval_examples, output_file=eval_data_output_path, **common_kwargs)

  test_input_data_examples = processor.get_test_examples(data_dir)
  test_data_size = {}
  for language, examples in test_input_data_examples.items():
    test_data_size[language] = write_example_to_file(
        examples, output_file=test_data_output_path.format(language),
        **common_kwargs)

  labels = processor.get_labels()
  meta_data = token_classification_meta_data(
      train_data_size,
      max_seq_length,
      len(labels),
      eval_data_size,
      test_data_size,
      label_list=labels,
      processor_type=processor.get_processor_name())
  return meta_data
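Finally, a hedged end-to-end sketch (not part of the commit) of calling the library directly, which is essentially what the new `generate_tagging_dataset` in create_finetuning_data.py does with flag values. The paths are placeholders, and the tokenizer import assumes the BERT tokenization module that create_finetuning_data.py already uses.

from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib

# Placeholder paths; test_data_output_path must contain a '{}' slot because
# generate_tf_record_from_data_file formats it with each test language.
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)
meta_data = tagging_data_lib.generate_tf_record_from_data_file(
    processor=tagging_data_lib.PanxProcessor(),
    data_dir="/path/to/panx",
    tokenizer=tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/panx_train.tf_record",
    eval_data_output_path="/tmp/panx_eval.tf_record",
    test_data_output_path="/tmp/panx_test_{}.tf_record",
    text_preprocessing=tokenization.convert_to_unicode)
print(meta_data["train_data_size"], meta_data["num_labels"])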