utils_multiple_choice.py 14.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
17
18


Aymeric Augustin's avatar
Aymeric Augustin committed
19
20
21
import csv
import glob
import json
22
23
24
import logging
import os
from io import open
25
from typing import List
Aymeric Augustin's avatar
Aymeric Augustin committed
26
27
28

import tqdm

29
from transformers import PreTrainedTokenizer
30
31
32
33
34
35
36
37


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for a multiple choice task."""

    def __init__(self, example_id, question, contexts, endings, label=None):
        """Constructs an InputExample.

        Args:
            example_id: Unique id for the example.
            question: string. The untokenized text of the second sequence (question).
            contexts: list of str. The untokenized text of the first sequence
                (context of corresponding question).
            endings: list of str. The multiple choice options. Its length must be
                equal to the length of ``contexts`` (not enforced here).
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.example_id = example_id
        self.question = question
        self.contexts = contexts
        self.endings = endings
        self.label = label


class InputFeatures(object):
    """Model-ready features for one multiple choice example."""

    def __init__(self, example_id, choices_features, label):
        """Constructs an InputFeatures.

        Args:
            example_id: Id of the example these features were built from.
            choices_features: iterable of ``(input_ids, input_mask, segment_ids)``
                triples, one per answer choice. Each triple is stored as a dict
                keyed by ``"input_ids"``, ``"input_mask"`` and ``"segment_ids"``.
            label: The label associated with the example.
        """
        self.example_id = example_id
        self.choices_features = [
            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label


class DataProcessor(object):
    """Base class for data converters for multiple choice data sets.

    Every method here raises ``NotImplementedError``; concrete processors are
    expected to override them.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class RaceProcessor(DataProcessor):
    """Processor for the RACE data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._load_split(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._load_split(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._load_split(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _load_split(self, data_dir, set_type):
        """Reads `<data_dir>/<set_type>/high` then `.../middle` and builds the examples."""
        high = self._read_txt(os.path.join(data_dir, set_type + "/high"))
        middle = self._read_txt(os.path.join(data_dir, set_type + "/middle"))
        return self._create_examples(high + middle, set_type)

    def _read_txt(self, input_dir):
        """Loads every file matching `<input_dir>/*txt` as JSON.

        The path of each file is stored in the loaded dict under "race_id".
        """
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

    def _create_examples(self, lines, set_type):
        """Creates one `InputExample` per question of every loaded article.

        Args:
            lines: list of dicts as returned by `_read_txt`; the keys read here
                are "race_id", "article", "answers", "questions" and "options".
            set_type: name of the split ("train", "dev" or "test"), used as the
                prefix of the example id.
        """
        examples = []
        for data_raw in lines:
            # All questions of one article share the same id (split + file path).
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i, answer in enumerate(data_raw["answers"]):
                # Answers are letters; "A" becomes label "0", "B" becomes "1", ...
                truth = str(ord(answer) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article] * 4,  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples

152

153
class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """Always raises: `_create_examples` reads a label column for every row.

        Raises:
            ValueError: unconditionally.
        """
        logger.info("LOOKING AT {} test".format(data_dir))
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
            "setting!"
        )

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        """Reads a UTF-8 CSV file and returns all of its rows, header included."""
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates one `InputExample` per data row.

        Args:
            lines: CSV rows; the first row holds the column names and is skipped.
            type: name of the split. For "train" the last header column must be
                "label".

        Raises:
            ValueError: if `type` is "train" and the header has no trailing
                "label" column.
        """
        if type == "train" and lines[0][-1] != "label":
            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                # In the swag dataset, the common beginning of each choice is
                # stored in "sent2".
                question=line[5],
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            )
            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples


class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        """Returns the raw lines of a UTF-8 file (one JSON document per line)."""
        with open(input_file, "r", encoding="utf-8") as fin:
            return fin.readlines()

    def _create_examples(self, lines, type):
        """Creates examples from JSON lines, keeping only four-choice questions.

        Args:
            lines: iterable of JSON strings. Each document is read for
                "question" (with "stem" and "choices"), "answerKey" and "id";
                each choice is read for "para" and "text".
            type: name of the split; for "train" the result is sanity-checked.

        Raises:
            AssertionError: if a four-choice question has an unrecognized
                answer key, or if the "train" split yields fewer than two
                examples.
        """
        # There are two types of labels (letters and 1-based digits). Both are
        # normalized to a 0-based index. An exact-match lookup is used so that
        # only single-character keys are accepted; substring tests against
        # "ABCD" / "1234" would also accept "" or "AB".
        label_index = {"A": 0, "B": 1, "C": 2, "D": 3, "1": 0, "2": 1, "3": 2, "4": 3}

        def normalize(truth):
            index = label_index.get(truth)
            if index is None:
                logger.info("truth ERROR! %s", str(truth))
            return index

        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            data_raw = json.loads(line.strip("\n"))
            question_choices = data_raw["question"]
            options = question_choices["choices"]
            num_choices = len(options)
            if num_choices == 3:
                three_choice += 1
                continue
            if num_choices == 5:
                five_choice += 1
                continue
            if num_choices != 4:
                other_choices += 1
                continue
            four_choice += 1

            truth = str(normalize(data_raw["answerKey"]))
            assert truth != "None", "unrecognized answerKey"
            question = question_choices["stem"]
            example_id = data_raw["id"]
            examples.append(
                InputExample(
                    example_id=example_id,
                    question=question,
                    contexts=[option["para"].replace("_", "") for option in options],
                    endings=[option["text"] for option in options],
                    label=truth,
                )
            )

        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
        logger.info("len examples: %s", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples


295
296
297
298
299
300
301
302
303
304
305
306
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """Converts `InputExample`s into padded `InputFeatures`.

    For every (context, ending) pair of an example, the context is encoded as
    the first sequence and the question combined with the ending as the second
    one. If the question contains "_", every "_" is replaced by the ending;
    otherwise the ending is appended after a space.

    Args:
        examples: the examples to convert.
        label_list: all possible labels; a label is mapped to its position in
            this list.
        max_length: length every encoded sequence is padded to; also passed to
            `tokenizer.encode_plus` as `max_length`.
        tokenizer: tokenizer whose `encode_plus` result provides "input_ids"
            and "token_type_ids".
        pad_token_segment_id: value used to pad the token type ids.
        pad_on_left: pad at the beginning instead of the end.
        pad_token: value used to pad the input ids.
        mask_padding_with_zero: if True the attention mask is 1 for real tokens
            and 0 for padding; if False the two values are swapped.

    Returns:
        One `InputFeatures` per example, in input order.

    Raises:
        KeyError: if an example's label is not in `label_list`.
        AssertionError: if an encoded sequence is not exactly `max_length` long
            after padding (i.e. the tokenizer returned more than `max_length`
            ids).
    """
    label_map = {label: i for i, label in enumerate(label_list)}
    real_mask, pad_mask = (1, 0) if mask_padding_with_zero else (0, 1)

    features = []
    # tqdm wraps the list itself (not the enumerate object) so it can read the
    # total and show real progress.
    for ex_index, example in enumerate(tqdm.tqdm(examples, desc="convert examples to features")):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        choices_features = []
        for context, ending in zip(example.contexts, example.endings):
            text_a = context
            if "_" in example.question:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length)
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options, "
                    "you need to try to use a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask marks real tokens; only real tokens are attended to.
            attention_mask = [real_mask] * len(input_ids)

            # Pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            ids_padding = [pad_token] * padding_length
            mask_padding = [pad_mask] * padding_length
            type_padding = [pad_token_segment_id] * padding_length
            if pad_on_left:
                input_ids = ids_padding + input_ids
                attention_mask = mask_padding + attention_mask
                token_type_ids = type_padding + token_type_ids
            else:
                input_ids = input_ids + ids_padding
                attention_mask = attention_mask + mask_padding
                token_type_ids = token_type_ids + type_padding

            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
            # The label belongs to the example, so it is logged once, not per choice.
            logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label))

    return features


371
# Maps a task name to the processor class that loads its data.
processors = {
    "race": RaceProcessor,
    "swag": SwagProcessor,
    "arc": ArcProcessor,
}
372
373


374
# Maps a task name to its number of labels. Written with colons so it is a
# dict; the comma-separated form builds a set and cannot be indexed by task.
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4}