# sentence_prediction_dataloader.py
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the sentence prediction (classification) task."""
from typing import Mapping, Optional

import dataclasses
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory


# Maps the `label_type` config string to the TensorFlow dtype used when
# parsing the `label_ids` feature of each record.
LABEL_TYPES_MAP = {
    'int': tf.int64,
    'float': tf.float32,
}


@dataclasses.dataclass
class SentencePredictionDataConfig(cfg.DataConfig):
  """Data config for sentence prediction task (tasks/sentence_prediction).

  Attributes:
    input_path: Not read directly in this file; the whole config is handed to
      `input_reader.InputReader` (presumably the location of the input
      records -- confirm against `InputReader`).
    global_batch_size: Not read directly in this file; forwarded to
      `input_reader.InputReader` through the config.
    is_training: Not read directly in this file; forwarded to
      `input_reader.InputReader` through the config.
    seq_length: Fixed length of the `input_ids`, `input_mask` and
      `segment_ids` features parsed from each record.
    label_type: Key into `LABEL_TYPES_MAP` ('int' or 'float') selecting the
      dtype used to parse the `label_ids` feature.
    include_example_id: Whether to parse an `example_id` feature and pass it
      through to the model inputs.
  """
  input_path: str = ''
  global_batch_size: int = 32
  is_training: bool = True
  seq_length: int = 128
  label_type: str = 'int'
  # Whether to include the example id number.
  include_example_id: bool = False


@data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
class SentencePredictionDataLoader(data_loader.DataLoader):
  """A class to load dataset for sentence prediction (classification) task.

  Registered with `data_loader_factory` as the loader for
  `SentencePredictionDataConfig`. Records are decoded from serialized
  tf.Examples and reshaped into `(features, label)` pairs.
  """

  def __init__(self, params):
    """Stores the config and caches the fields used while decoding/parsing.

    Args:
      params: The data config. `seq_length`, `include_example_id` and
        `label_type` are read by this class; the whole object is also passed
        to `input_reader.InputReader` in `load`.
    """
    self._params = params
    self._seq_length = params.seq_length
    self._include_example_id = params.include_example_id

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example.

    Args:
      record: A serialized tf.Example.

    Returns:
      A dict mapping feature names (`input_ids`, `input_mask`, `segment_ids`,
      `label_ids`, and `example_id` when enabled) to tensors, with every
      int64 tensor cast to int32.

    Raises:
      KeyError: If the configured `label_type` is not a key of
        `LABEL_TYPES_MAP`.
    """
    label_type = LABEL_TYPES_MAP[self._params.label_type]
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([], label_type),
    }
    if self._include_example_id:
      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)

    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32. A new dict is built rather than rewriting
    # `example` while iterating over it.
    return {
        name: tf.cast(tensor, tf.int32) if tensor.dtype == tf.int64 else tensor
        for name, tensor in example.items()
    }

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model.

    Args:
      record: A mapping of decoded features, as returned by `_decode`.

    Returns:
      A tuple `(x, y)`. `x` is a dict with keys `input_word_ids`,
      `input_mask` and `input_type_ids` (plus `example_id` when enabled);
      `y` is the `label_ids` tensor.
    """
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    if self._include_example_id:
      x['example_id'] = record['example_id']

    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.data.Dataset.

    Args:
      input_context: Optional distribution input context, forwarded unchanged
        to `input_reader.InputReader.read`.

    Returns:
      The result of `InputReader.read` configured with this loader's
      `_decode` and `_parse` functions.
    """
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)