"git@developer.sourcefind.cn:change/sglang.git" did not exist on "0b24af4d7984c165adba4032bcd7fd62630b5c48"
Commit d4f5c193 authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 317691679
parent 1357ce19
@@ -130,3 +130,22 @@ class QADevDataConfig(cfg.DataConfig):
   is_training: bool = False
   seq_length: int = 384
   drop_remainder: bool = False
+
+
+@dataclasses.dataclass
+class TaggingDataConfig(cfg.DataConfig):
+  """Data config for tagging (tasks/tagging)."""
+  input_path: str = ""
+  global_batch_size: int = 48
+  is_training: bool = True
+  seq_length: int = 384
+
+
+@dataclasses.dataclass
+class TaggingDevDataConfig(cfg.DataConfig):
+  """Dev Data config for tagging (tasks/tagging)."""
+  input_path: str = ""
+  global_batch_size: int = 48
+  is_training: bool = False
+  seq_length: int = 384
+  drop_remainder: bool = False
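For orientation, a minimal sketch of constructing the new config with non-default values; the input path is a hypothetical placeholder, not from this commit:

train_data = TaggingDataConfig(
    input_path='/tmp/tagging_train.tf_record',  # hypothetical path
    seq_length=128,
    global_batch_size=32)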
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional
import tensorflow as tf
from official.core import input_reader
class TaggingDataLoader:
  """A class to load dataset for tagging (e.g., NER and POS) task."""

  def __init__(self, params):
    self._params = params
    self._seq_length = params.seq_length

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.data.Dataset."""
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
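Given a TaggingDataConfig like the one sketched earlier, the loader would be driven roughly as follows; a sketch, with the record file and resulting shapes as assumptions:

loader = TaggingDataLoader(train_data)  # train_data: the config sketched above
dataset = loader.load()
features, label_ids = next(iter(dataset))
# features['input_word_ids'], features['input_mask'], features['input_type_ids']:
#   int32 tensors of shape [global_batch_size, seq_length]
# label_ids: int32 tensor of the same shape, one tag id per token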
@@ -55,6 +55,7 @@ class BertTokenClassifier(tf.keras.Model):
                dropout_rate=0.1,
                **kwargs):
     self._self_setattr_tracking = False
+    self._network = network
     self._config = {
         'network': network,
         'num_classes': num_classes,
@@ -84,6 +85,10 @@ class BertTokenClassifier(tf.keras.Model):
     super(BertTokenClassifier, self).__init__(
         inputs=inputs, outputs=predictions, **kwargs)
 
+  @property
+  def checkpoint_items(self):
+    return dict(encoder=self._network)
+
   def get_config(self):
     return self._config
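The new `checkpoint_items` property is what enables encoder-only restoration from a pretraining checkpoint; a minimal sketch of the pattern, mirroring `TaggingTask.initialize` later in this commit (the checkpoint path is a hypothetical placeholder):

model = task.build_model()  # a BertTokenClassifier
ckpt = tf.train.Checkpoint(**model.checkpoint_items)  # i.e. Checkpoint(encoder=...)
status = ckpt.restore('/path/to/pretrained_ckpt')  # hypothetical path
status.expect_partial().assert_existing_objects_matched()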
@@ -24,6 +24,7 @@ from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.bert import input_pipeline
 from official.nlp.configs import encoders
 from official.nlp.modeling import models
+from official.nlp.tasks import utils
 
 
 @dataclasses.dataclass
@@ -57,19 +58,7 @@ class QuestionAnsweringTask(base_task.Task):
 
   def build_model(self):
     if self._hub_module:
-      # TODO(lehou): maybe add the hub_module building logic to a util function.
-      input_word_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_word_ids')
-      input_mask = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_mask')
-      input_type_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_type_ids')
-      bert_model = hub.KerasLayer(self._hub_module, trainable=True)
-      pooled_output, sequence_output = bert_model(
-          [input_word_ids, input_mask, input_type_ids])
-      encoder_network = tf.keras.Model(
-          inputs=[input_word_ids, input_mask, input_type_ids],
-          outputs=[sequence_output, pooled_output])
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
     else:
       encoder_network = encoders.instantiate_encoder_from_cfg(
           self.task_config.network)
@@ -27,6 +27,7 @@ from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import bert
 from official.nlp.data import sentence_prediction_dataloader
 from official.nlp.modeling import losses as loss_lib
+from official.nlp.tasks import utils
 
 
 @dataclasses.dataclass
@@ -67,18 +68,7 @@ class SentencePredictionTask(base_task.Task):
 
   def build_model(self):
     if self._hub_module:
-      input_word_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_word_ids')
-      input_mask = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_mask')
-      input_type_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_type_ids')
-      bert_model = hub.KerasLayer(self._hub_module, trainable=True)
-      pooled_output, sequence_output = bert_model(
-          [input_word_ids, input_mask, input_type_ids])
-      encoder_from_hub = tf.keras.Model(
-          inputs=[input_word_ids, input_mask, input_type_ids],
-          outputs=[sequence_output, pooled_output])
+      encoder_from_hub = utils.get_encoder_from_hub(self._hub_module)
       return bert.instantiate_bertpretrainer_from_cfg(
           self.task_config.network, encoder_network=encoder_from_hub)
     else:
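Both build_model refactors above replace the previously inlined TF-Hub wiring with a call to the shared helper `utils.get_encoder_from_hub`, added in official/nlp/tasks/utils.py at the end of this commit.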
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tagging (e.g., NER/POS) task."""
import logging
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import encoders
from official.nlp.data import tagging_data_loader
from official.nlp.modeling import models
from official.nlp.tasks import utils
@dataclasses.dataclass
class TaggingConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
  init_checkpoint: str = ''
  hub_module_url: str = ''
  network: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
  num_classes: int = 0
  # The ignored label id will not contribute to the loss. A word may be
  # tokenized into multiple word-piece tokens; we usually assign the real
  # label id to the first token of the word and `ignore_label_id` to the
  # remaining tokens.
  ignore_label_id: int = 0
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()
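To illustrate the labeling convention (the tokenization and tag ids below are invented for the example): if "Washington" is split into the word pieces Wash, ##ing, ##ton, only the first piece carries the real tag id:

tokens:     [CLS]  Wash  ##ing  ##ton  [SEP]
label_ids:    0      5     0      0      0     (with ignore_label_id=0)

Only the position labeled 5 contributes to the loss and metrics defined below.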
@base_task.register_task_cls(TaggingConfig)
class TaggingTask(base_task.Task):
  """Task object for tagging (e.g., NER or POS)."""

  def __init__(self, params=cfg.TaskConfig):
    super(TaggingTask, self).__init__(params)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
                       '`init_checkpoint` can be specified.')
    if params.num_classes == 0:
      raise ValueError('TaggingConfig.num_classes cannot be 0.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
    else:
      self._hub_module = None

  def build_model(self):
    if self._hub_module:
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
          self.task_config.network)

    return models.BertTokenClassifier(
        network=encoder_network,
        num_classes=self.task_config.num_classes,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=self.task_config.network.initializer_range),
        dropout_rate=self.task_config.network.dropout_rate,
        output='logits')

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    model_outputs = tf.cast(model_outputs, tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, model_outputs, from_logits=True)
    # `ignore_label_id` will not contribute to loss.
    label_weights = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    numerator_loss = tf.reduce_sum(loss * label_weights)
    denominator_loss = tf.reduce_sum(label_weights)
    loss = tf.math.divide_no_nan(numerator_loss, denominator_loss)
    return loss
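A small worked example of the masked loss, with invented numbers: if the per-token cross-entropy values are [0.2, 1.0, 0.6, 0.4] and the labels are [5, 0, 7, 0] with ignore_label_id=0, then label_weights is [1, 0, 1, 0], the numerator is 0.2 + 0.6 = 0.8, the denominator is 2, and the loss is 0.4. divide_no_nan returns 0 when every position is ignored.

# Hypothetical values, for illustration only.
per_token_loss = tf.constant([0.2, 1.0, 0.6, 0.4])
label_weights = tf.constant([1.0, 0.0, 1.0, 0.0])  # labels [5, 0, 7, 0]
masked_loss = tf.math.divide_no_nan(
    tf.reduce_sum(per_token_loss * label_weights),
    tf.reduce_sum(label_weights))  # -> 0.8 / 2 = 0.4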
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for the tagging task."""
    if params.input_path == 'dummy':

      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
        y = tf.ones((1, params.seq_length), dtype=tf.int32)
        return (x, y)

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
      dataset = dataset.map(
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

    dataset = tagging_data_loader.TaggingDataLoader(params).load(input_context)
    return dataset

  def build_metrics(self, training=None):
    del training
    # TODO(chendouble): evaluate using seqeval's f1/precision/recall.
    return [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]

  def process_metrics(self, metrics, labels, model_outputs):
    # `ignore_label_id` will not contribute to metrics.
    sample_weight = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    for metric in metrics:
      metric.update_state(labels, model_outputs, sample_weight)

  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
    # `ignore_label_id` will not contribute to metrics.
    sample_weight = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    compiled_metrics.update_state(labels, model_outputs, sample_weight)

  def initialize(self, model):
    """Loads a pretrained checkpoint (if it exists) and then trains from iteration 0."""
    ckpt_dir_or_file = self.task_config.init_checkpoint
    if tf.io.gfile.isdir(ckpt_dir_or_file):
      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
    if not ckpt_dir_or_file:
      return

    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    status = ckpt.restore(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.tagging."""
import functools
import os
import tensorflow as tf
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.tasks import tagging
class TaggingTest(tf.test.TestCase):

  def setUp(self):
    super(TaggingTest, self).setUp()
    self._encoder_config = encoders.TransformerEncoderConfig(
        vocab_size=30522, num_layers=1)
    self._train_data_config = bert.TaggingDataConfig(
        input_path="dummy", seq_length=128, global_batch_size=1)

  def _run_task(self, config):
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()

    strategy = tf.distribute.get_strategy()
    dataset = strategy.experimental_distribute_datasets_from_function(
        functools.partial(task.build_inputs, config.train_data))

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

  def test_task(self):
    # Saves a checkpoint.
    encoder = encoders.instantiate_encoder_from_cfg(self._encoder_config)
    ckpt = tf.train.Checkpoint(encoder=encoder)
    saved_path = ckpt.save(self.get_temp_dir())

    config = tagging.TaggingConfig(
        init_checkpoint=saved_path,
        network=self._encoder_config,
        train_data=self._train_data_config,
        num_classes=3)
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)
    task.initialize(model)

  def test_task_with_fit(self):
    config = tagging.TaggingConfig(
        network=self._encoder_config,
        train_data=self._train_data_config,
        num_classes=3)
    task = tagging.TaggingTask(config)
    model = task.build_model()
    model = task.compile_model(
        model,
        optimizer=tf.keras.optimizers.SGD(lr=0.1),
        train_step=task.train_step,
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
    dataset = task.build_inputs(config.train_data)
    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
    self.assertIn("loss", logs.history)
    self.assertIn("accuracy", logs.history)

  def _export_bert_tfhub(self):
    bert_config = configs.BertConfig(
        vocab_size=30522,
        hidden_size=16,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=1)
    _, encoder = export_tfhub.create_bert_model(bert_config)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(model=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
    with tf.io.gfile.GFile(vocab_file, "w") as f:
      f.write("dummy content")

    hub_destination = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
                                   hub_destination, vocab_file)
    return hub_destination

  def test_task_with_hub(self):
    hub_module_url = self._export_bert_tfhub()
    config = tagging.TaggingConfig(
        hub_module_url=hub_module_url,
        network=self._encoder_config,
        num_classes=4,
        train_data=self._train_data_config)
    self._run_task(config)


if __name__ == "__main__":
  tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common utils for tasks."""
import tensorflow as tf
import tensorflow_hub as hub
def get_encoder_from_hub(hub_module: str) -> tf.keras.Model:
  """Gets an encoder from hub."""
  input_word_ids = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_mask')
  input_type_ids = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_type_ids')
  hub_layer = hub.KerasLayer(hub_module, trainable=True)
  pooled_output, sequence_output = hub_layer(
      [input_word_ids, input_mask, input_type_ids])
  return tf.keras.Model(
      inputs=[input_word_ids, input_mask, input_type_ids],
      outputs=[sequence_output, pooled_output])
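A sketch of calling the new helper directly; the hub handle is a placeholder, and note the tasks in this commit pass it the result of hub.load() (hub.KerasLayer accepts a loaded module as well as a handle string):

hub_module = hub.load('/path/or/url/to/bert_hub_module')  # hypothetical handle
encoder = get_encoder_from_hub(hub_module)
# encoder([input_word_ids, input_mask, input_type_ids])
#   -> [sequence_output, pooled_output]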