# utils_multiple_choice.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """

from __future__ import absolute_import, division, print_function


import logging
import os
import sys
from io import open
import json
import csv
import glob
import tqdm
# Names below are referenced in the type annotations of this module's functions.
from typing import List
from transformers import PreTrainedTokenizer
31
32
33
34
35
36
37
38


# Module-level logger; the processors and the feature conversion below report progress through it.
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for multiple choice.

    This is a plain value holder: the constructor stores its arguments
    unchanged and performs no validation.
    """

    def __init__(self, example_id, question, contexts, endings, label=None):
        """Constructs an InputExample.

        Args:
            example_id: Unique id for the example.
            question: string. The untokenized text of the second sequence (question).
            contexts: list of str. The untokenized text of the first sequence
                (context of corresponding question), one entry per choice.
            endings: list of str. multiple choice's options. Its length must be
                equal to contexts' length (this is not checked here).
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.example_id = example_id
        self.question = question
        self.contexts = contexts
        self.endings = endings
        self.label = label


class InputFeatures(object):
    """Model-ready features for one multiple choice example.

    Each choice is stored as a dict with the keys ``"input_ids"``,
    ``"input_mask"`` and ``"segment_ids"``.
    """

    def __init__(self, example_id, choices_features, label):
        """Constructs an InputFeatures.

        Args:
            example_id: Id of the example these features were built from.
            choices_features: iterable of ``(input_ids, input_mask, segment_ids)``
                triples, one per choice. Each triple is unpacked into a dict.
            label: The label stored for the example (kept as given).
        """
        self.example_id = example_id
        self.choices_features = [
            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label


class DataProcessor(object):
    """Base class for data converters for multiple choice data sets.

    Every method here raises ``NotImplementedError``; concrete processors are
    expected to override all of them.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class RaceProcessor(DataProcessor):
    """Processor for the RACE data set.

    Each split is read from two sub-directories, ``<split>/high`` and
    ``<split>/middle``, containing one JSON file per article.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._load_split(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._load_split(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._load_split(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _load_split(self, data_dir, set_type):
        """Reads the high and middle files of one split and builds its examples."""
        high = self._read_txt(os.path.join(data_dir, "{}/high".format(set_type)))
        middle = self._read_txt(os.path.join(data_dir, "{}/middle".format(set_type)))
        return self._create_examples(high + middle, set_type)

    def _read_txt(self, input_dir):
        """Loads every file matching ``*txt`` in `input_dir` as JSON.

        Returns:
            list of dict, one per file, with the file path added under the
            ``"race_id"`` key.
        """
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets.

        Args:
            lines: list of dict as returned by `_read_txt`; each must provide
                ``"race_id"``, ``"article"``, ``"answers"``, ``"questions"``
                and ``"options"``.
            set_type: split name, used as a prefix of the example id.

        Returns:
            list of `InputExample`, one per question. All questions of one
            article share the same example id.
        """
        examples = []
        for data_raw in lines:
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i, answer in enumerate(data_raw["answers"]):
                # Answers are letters; "A" becomes label "0", "B" becomes "1", ...
                truth = str(ord(answer) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples

153

154
class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class.

        Raises:
            ValueError: always. The SWAG test file has no label column, which
                `_create_examples` requires.
        """
        logger.info("LOOKING AT {} test".format(data_dir))
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
            "setting!"
        )

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        """Reads a CSV file and returns all of its rows, header included."""
        # newline="" lets the csv module handle line endings inside quoted fields itself.
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets.

        Args:
            lines: CSV rows; the first row holds the column names and is skipped.
            type: split name. For ``"train"`` the last header column must be
                ``"label"``.

        Raises:
            ValueError: if `type` is ``"train"`` and the file is empty or its
                last column is not ``"label"``.
        """
        if type == "train" and (not lines or lines[0][-1] != "label"):
            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            )
            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples


class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    # There are two types of answer keys (letters and 1-based digits).
    # Both are normalized to a 0-based index. Matching is exact, so "" or "AB"
    # is rejected instead of slipping through a substring test.
    _ANSWER_KEY_TO_INDEX = {"A": 0, "B": 1, "C": 2, "D": 3, "1": 0, "2": 1, "3": 2, "4": 3}

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        """Returns the raw lines of a JSON-lines file (each still a string)."""
        with open(input_file, "r", encoding="utf-8") as fin:
            return fin.readlines()

    def _create_examples(self, lines, type):
        """Creates examples for the training, dev and test sets.

        Only questions with exactly four choices are kept; the others are
        counted and dropped.

        Args:
            lines: iterable of JSON strings, one question per line. Blank
                lines are skipped.
            type: split name. For ``"train"`` more than one example must remain.

        Returns:
            list of `InputExample` whose label is the 0-based answer index as
            a string.

        Raises:
            ValueError: if an answer key is not one of A-D / 1-4, or if the
                train split yields fewer than two examples.
        """
        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            if not line.strip():
                continue
            data_raw = json.loads(line.strip("\n"))
            question_choices = data_raw["question"]
            options = question_choices["choices"]
            num_choices = len(options)
            if num_choices == 3:
                three_choice += 1
                continue
            elif num_choices == 5:
                five_choice += 1
                continue
            elif num_choices != 4:
                other_choices += 1
                continue
            four_choice += 1

            answer_key = data_raw["answerKey"]
            truth_index = self._ANSWER_KEY_TO_INDEX.get(answer_key)
            if truth_index is None:
                logger.info("truth ERROR! %s", str(answer_key))
                raise ValueError("Unsupported ARC answer key: {!r}".format(answer_key))

            examples.append(
                InputExample(
                    example_id=data_raw["id"],
                    question=question_choices["stem"],
                    # Each choice carries its own paragraph under "para"; underscores are removed from it.
                    contexts=[option["para"].replace("_", "") for option in options],
                    endings=[option["text"] for option in options],
                    label=str(truth_index),
                )
            )

        if type == "train" and len(examples) <= 1:
            raise ValueError("For training, more than one four-choice example is required.")
        logger.info("len examples: %s", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples


302
303
304
305
306
307
308
309
310
311
312
313
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`

    Every (context, ending) pair of an example is encoded with
    `tokenizer.encode_plus` and padded to `max_length`.

    Args:
        examples: examples to convert.
        label_list: all labels; a label is mapped to its position in this list.
        max_length: length every encoded choice is padded to (and passed to
            the tokenizer as `max_length`).
        tokenizer: must provide `encode_plus` returning a mapping with
            ``"input_ids"`` and ``"token_type_ids"``.
        pad_token_segment_id: value used to pad the token type ids.
        pad_on_left: pad at the start of the sequences instead of the end.
        pad_token: value used to pad the input ids.
        mask_padding_with_zero: if True the attention mask is 1 for real
            tokens and 0 for padding; if False the values are swapped.

    Returns:
        One `InputFeatures` per example, in input order.

    Raises:
        KeyError: if an example's label is not in `label_list`.
    """

    label_map = {label: i for i, label in enumerate(label_list)}
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    # enumerate() has no length, so the total is passed for the progress bar.
    progress = tqdm.tqdm(enumerate(examples), total=len(examples), desc="convert examples to features")
    for (ex_index, example) in progress:
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))
        choices_features = []
        for context, ending in zip(example.contexts, example.endings):
            text_a = context
            if "_" in example.question:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options, "
                    "you need to try to use a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [real_mask_value] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            ids_padding = [pad_token] * padding_length
            mask_padding = [pad_mask_value] * padding_length
            segment_padding = [pad_token_segment_id] * padding_length
            if pad_on_left:
                input_ids = ids_padding + input_ids
                attention_mask = mask_padding + attention_mask
                token_type_ids = segment_padding + token_type_ids
            else:
                input_ids = input_ids + ids_padding
                attention_mask = attention_mask + mask_padding
                token_type_ids = token_type_ids + segment_padding

            # These fail if the tokenizer returned more than max_length tokens.
            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
            # The label belongs to the example, so it is logged once rather than per choice.
            logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features


378
# Registry mapping a task name to the processor class that loads its data.
processors = {
    "race": RaceProcessor,
    "swag": SwagProcessor,
    "arc": ArcProcessor,
}
379
380


381
# Number of answer choices per task. This was previously written with commas
# instead of colons, which built the set {"race", 4, "swag", "arc"} and made
# lookups by task name impossible.
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4}