# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLUE processors and helpers """

import logging
import os

from transformers.file_utils import is_tf_available
from utils_hans import DataProcessor, InputExample, InputFeatures

# TensorFlow is imported only when it is available, so this module can still
# be imported in environments without it.
if is_tf_available():
    import tensorflow as tf

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def hans_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: Key into ``glue_processors`` / ``glue_output_modes`` (e.g. ``"hans"``). Required when
            ``examples`` is a ``tf.data.Dataset``, because the task processor is used to decode each element.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
            Defaults to the task processor's labels when ``task`` is given.
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
            Defaults to the task's output mode when ``task`` is given.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    Raises:
        ValueError: If ``examples`` is a ``tf.data.Dataset`` but no ``task`` was given, or if
            ``output_mode`` is ``classification`` and no label list could be determined.
        KeyError: If ``task`` is not a registered task, or (while processing an example) if
            ``output_mode`` is neither ``classification`` nor ``regression``.
        AssertionError: If a padded sequence does not end up with exactly ``max_length`` items.

    """
    is_tf_dataset = is_tf_available() and isinstance(examples, tf.data.Dataset)

    processor = None
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s", label_list, task)
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s", output_mode, task)

    # Fail early with a clear message instead of hitting an unbound `processor`
    # (NameError) on the first dataset element.
    if is_tf_dataset and processor is None:
        raise ValueError("`task` must be provided when `examples` is a tf.data.Dataset")
    # Without labels every classification example would silently map to id 0.
    if output_mode == "classification" and label_list is None:
        raise ValueError("`label_list` (or a `task` providing one) is required for classification")

    # Regression never reads the label map, so a missing label list is fine there.
    label_map = {label: i for i, label in enumerate(label_list or [])}

    # Value marking a real token / a padding position in the attention mask.
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d", ex_index)
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # Only real tokens are attended to; padding positions get the opposite mask value.
        attention_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length. A negative length yields empty padding,
        # which the length assertions below then report.
        padding_length = max_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [pad_mask_value] * padding_length
        segment_padding = [pad_token_segment_id] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            attention_mask = mask_padding + attention_mask
            token_type_ids = segment_padding + token_type_ids
        else:
            input_ids = input_ids + ids_padding
            attention_mask = attention_mask + mask_padding
            token_type_ids = token_type_ids + segment_padding

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length
        )
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length
        )

        if output_mode == "classification":
            # Labels missing from the label list fall back to id 0.
            label = label_map.get(example.label, 0)
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)
        pairID = str(example.pairID)

        if ex_index < 10:
            logger.info("*** Example ***")
            logger.info("text_a: %s", example.text_a)
            logger.info("text_b: %s", example.text_b)
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)", example.label, label)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
                pairID=pairID,
            )
        )

    if is_tf_dataset:

        def gen():
            # Yields (model inputs, label) pairs; pairID is not part of the dataset output.
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features


class HansProcessor(DataProcessor):
    """Processor for the HANS data set."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from a dict of tensors (see base class)."""
        guid = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """Read ``heuristics_train_set.txt`` from ``data_dir`` (see base class)."""
        train_path = os.path.join(data_dir, "heuristics_train_set.txt")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """Read ``heuristics_evaluation_set.txt`` from ``data_dir`` (see base class)."""
        dev_path = os.path.join(data_dir, "heuristics_evaluation_set.txt")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_labels(self):
        """Return the label names used for this task (see base class)."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Turn parsed rows into ``InputExample`` objects for the training and dev sets.

        The first row is skipped. Column 0 feeds the guid, columns 5 and 6 are
        the two texts, column 7 is the pair id (a leading ``"ex"`` is removed)
        and the last column is the label.
        """
        collected = []
        for row_number, row in enumerate(lines):
            if row_number == 0:
                # First row is skipped (presumably the column header).
                continue
            example_guid = "%s-%s" % (set_type, row[0])
            first_text = row[5]
            second_text = row[6]
            raw_pair_id = row[7]
            if raw_pair_id.startswith("ex"):
                pair_id = raw_pair_id[2:]
            else:
                pair_id = raw_pair_id
            gold_label = row[-1]
            collected.append(
                InputExample(
                    guid=example_guid,
                    text_a=first_text,
                    text_b=second_text,
                    label=gold_label,
                    pairID=pair_id,
                )
            )
        return collected


# Number of labels for each supported task.
glue_tasks_num_labels = {"hans": 3}

# Processor class to instantiate for each supported task.
glue_processors = {"hans": HansProcessor}

# Output mode ("classification" or "regression") for each supported task.
glue_output_modes = {"hans": "classification"}