# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """

import csv
import glob
import json
import logging
import os
from typing import List

import tqdm

from transformers import PreTrainedTokenizer


# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for multiple choice"""

    def __init__(self, example_id, question, contexts, endings, label=None):
        """Constructs an InputExample.

        The arguments are stored unchanged as attributes of the same name.

        Args:
            example_id: Unique id for the example.
            question: string. The untokenized text of the second sequence (question).
            contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
            endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.example_id = example_id
        self.question = question
        self.contexts = contexts
        self.endings = endings
        self.label = label


class InputFeatures(object):
    """Model-ready features for one multiple choice example."""

    def __init__(self, example_id, choices_features, label):
        """Constructs an InputFeatures.

        Args:
            example_id: Id of the example these features belong to; stored as is.
            choices_features: iterable of `(input_ids, input_mask, segment_ids)` triples,
                one per choice. Each triple is stored as a dict with the keys
                "input_ids", "input_mask" and "segment_ids".
            label: Label of the example; stored as is.
        """
        self.example_id = example_id
        self.choices_features = [
            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label


class DataProcessor(object):
    """Base class for data converters for multiple choice data sets.

    Every method raises `NotImplementedError`; subclasses are expected to override them.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class RaceProcessor(DataProcessor):
    """Processor for the RACE data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._load_split(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._load_split(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._load_split(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _load_split(self, data_dir, set_type):
        """Reads the `high` and `middle` sub-directories of one split and builds its examples."""
        high = self._read_txt(os.path.join(data_dir, "{}/high".format(set_type)))
        middle = self._read_txt(os.path.join(data_dir, "{}/middle".format(set_type)))
        return self._create_examples(high + middle, set_type)

    def _read_txt(self, input_dir):
        """Loads every file matching `<input_dir>/*txt` as JSON.

        Returns:
            A list with one parsed JSON object per file. Each object additionally gets
            a "race_id" entry holding the path of the file it was read from.
        """
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the train, dev and test sets.

        Args:
            lines: list of dicts as returned by `_read_txt`; the keys "race_id", "article",
                "answers", "questions" and "options" are read.
            set_type: name of the split; used as prefix of every example id.

        Returns:
            A list with one `InputExample` per entry of "answers". All questions of one
            article share the same example id.
        """
        examples = []
        for data_raw in lines:
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i, answer in enumerate(data_raw["answers"]):
                # Answers are letters; "A" maps to label "0", "B" to "1", and so on.
                truth = str(ord(answer) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples


class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class.

        Raises:
            ValueError: always. `_create_examples` reads a label column from every row,
                and the error message states that the SWAG test file does not have one.
        """
        logger.info("LOOKING AT {} test".format(data_dir))
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
            "setting!"
        )

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        """Reads a CSV file and returns all of its rows (header included) as lists of str."""
        # newline="" lets the csv module handle line endings itself, so quoted fields
        # that contain newlines are parsed correctly.
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets.

        Args:
            lines: rows of the CSV file; the first row holds the column names.
            type: name of the split. For "train" the last column must be named "label".

        Returns:
            A list with one `InputExample` per data row.

        Raises:
            ValueError: if `type` is "train" and the header row is missing or its last
                column is not "label".
        """
        header = lines[0] if lines else []
        # Slicing keeps the check safe for an empty file or an empty header row.
        if type == "train" and header[-1:] != ["label"]:
            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            )
            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples


class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    # There are two types of answer keys ("A"-"D" and "1"-"4"); both map to a 0-based index.
    _ANSWER_KEY_TO_INDEX = {"A": 0, "B": 1, "C": 2, "D": 3, "1": 0, "2": 1, "3": 2, "4": 3}

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        """Returns the raw lines of `input_file`; each line is parsed later as one JSON object."""
        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
            return lines

    def _create_examples(self, lines, type):
        """Creates examples for the train, dev and test sets.

        Args:
            lines: iterable of str, each holding one JSON object with the keys "id",
                "answerKey" and "question" (which in turn holds "stem" and "choices").
                Blank lines are skipped.
            type: name of the split; "train" enables extra sanity checks on the result.

        Returns:
            A list of `InputExample`, one per question that has exactly four choices.
            Questions with any other number of choices are dropped and only counted.

        Raises:
            ValueError: if the answer key of a four-choice question is not one of
                "A"-"D" or "1"-"4".
        """
        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            if not line.strip():
                continue
            data_raw = json.loads(line.strip("\n"))
            question_choices = data_raw["question"]
            options = question_choices["choices"]
            num_options = len(options)
            if num_options == 3:
                three_choice += 1
                continue
            if num_options == 5:
                five_choice += 1
                continue
            if num_options != 4:
                other_choices += 1
                continue
            four_choice += 1

            answer_key = data_raw["answerKey"]
            # Exact-match lookup: a substring test would also accept "" or "AB".
            label_index = self._ANSWER_KEY_TO_INDEX.get(answer_key)
            if label_index is None:
                logger.info("truth ERROR! %s", str(answer_key))
                raise ValueError("Unsupported ARC answer key: {!r}".format(answer_key))

            examples.append(
                InputExample(
                    example_id=data_raw["id"],
                    question=question_choices["stem"],
                    contexts=[
                        options[0]["para"].replace("_", ""),
                        options[1]["para"].replace("_", ""),
                        options[2]["para"].replace("_", ""),
                        options[3]["para"].replace("_", ""),
                    ],
                    endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
                    label=str(label_index),
                )
            )

        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
        logger.info("len examples: %s", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples


def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Converts a list of `InputExample` into a list of `InputFeatures`.

    For every (context, ending) pair of an example, the context is encoded as the first
    sequence. The second sequence is the question with every "_" replaced by the ending
    when the question contains "_", otherwise the question followed by a space and the
    ending. The encoded ids, attention mask and token type ids are then padded to
    `max_length`.

    Args:
        examples: the examples to convert.
        label_list: all possible labels; a label is mapped to its index in this list.
        max_length: length every feature sequence is padded to; also passed to
            `tokenizer.encode_plus` as `max_length`.
        tokenizer: tokenizer whose `encode_plus` output provides "input_ids" and
            "token_type_ids".
        pad_token_segment_id: value used to pad the token type ids.
        pad_on_left: if True, padding is prepended instead of appended.
        pad_token: value used to pad the input ids.
        mask_padding_with_zero: if True, the attention mask is 1 for real tokens and 0
            for padding; if False, the values are inverted.

    Returns:
        One `InputFeatures` per example, in the same order.
    """

    label_map = {label: i for i, label in enumerate(label_list)}
    real_token_mask = 1 if mask_padding_with_zero else 0
    padding_mask = 0 if mask_padding_with_zero else 1

    features = []
    # `enumerate` has no length, so the total is passed explicitly for the progress bar.
    progress = tqdm.tqdm(enumerate(examples), total=len(examples), desc="convert examples to features")
    for ex_index, example in progress:
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))
        choices_features = []
        for context, ending in zip(example.contexts, example.endings):
            text_a = context
            if "_" in example.question:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options,"
                    "you need to try to use a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [real_token_mask] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([padding_mask] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([padding_mask] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            # These fail when the tokenizer returned more than `max_length` tokens.
            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        # Log the first two examples in full for debugging.
        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
            # The label belongs to the example, so it is logged once rather than per choice.
            logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features


# Maps a task name to the processor class that reads its data set.
processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}


# Number of answer choices (labels) per task. This must be a dict: written with commas
# instead of colons it would be a set of task names and the number 4.
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4}