squad.py 32.5 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Aymeric Augustin's avatar
Aymeric Augustin committed
15
import json
Lysandre's avatar
Lysandre committed
16
import os
erenup's avatar
erenup committed
17
from functools import partial
Aymeric Augustin's avatar
Aymeric Augustin committed
18
19
20
21
from multiprocessing import Pool, cpu_count

import numpy as np
from tqdm import tqdm
Lysandre's avatar
Lysandre committed
22

Aymeric Augustin's avatar
Aymeric Augustin committed
23
from ...file_utils import is_tf_available, is_torch_available
Sylvain Gugger's avatar
Sylvain Gugger committed
24
from ...models.bert.tokenization_bert import whitespace_tokenize
25
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
Lysandre Debut's avatar
Lysandre Debut committed
26
from ...utils import logging
27
from .utils import DataProcessor
Aymeric Augustin's avatar
Aymeric Augustin committed
28

LysandreJik's avatar
LysandreJik committed
29

30
# Tokenizer families (class name minus "Tokenizer", lower-cased) that insert two separator
# tokens between the two sequences of a pair; they need one extra added token to be counted.
MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}
32
33


LysandreJik's avatar
Cleanup  
LysandreJik committed
34
if is_torch_available():
LysandreJik's avatar
LysandreJik committed
35
36
    import torch
    from torch.utils.data import TensorDataset
Lysandre's avatar
Lysandre committed
37
38
39
40

if is_tf_available():
    import tensorflow as tf

Lysandre Debut's avatar
Lysandre Debut committed
41
logger = logging.get_logger(__name__)
Lysandre's avatar
Lysandre committed
42

Lysandre's avatar
Lysandre committed
43
44

def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
LysandreJik's avatar
LysandreJik committed
45
46
47
48
49
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
Lysandre's avatar
Lysandre committed
50
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
LysandreJik's avatar
LysandreJik committed
51
52
53
54
55
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

Lysandre's avatar
Lysandre committed
56

LysandreJik's avatar
LysandreJik committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index

Lysandre's avatar
Lysandre committed
76

LysandreJik's avatar
LysandreJik committed
77
78
79
def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # if len(doc_spans) == 1:
Lysandre's avatar
Lysandre committed
80
    # return True
LysandreJik's avatar
LysandreJik committed
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"]:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index

Lysandre's avatar
Lysandre committed
98

LysandreJik's avatar
LysandreJik committed
99
100
101
102
def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False
Lysandre's avatar
wip  
Lysandre committed
103

104

105
106
107
def squad_convert_example_to_features(
    example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
):
    """
    Convert a single example into the list of features (one per document span) fed to a model.

    The tokenizer is read from the module-level global ``tokenizer``, which must have been set beforehand by
    :func:`squad_convert_example_to_features_init` (this is what the worker-pool initializer does).

    Args:
        example: the example to convert; its ``doc_tokens``, ``question_text``, ``answer_text``,
            ``start_position``, ``end_position``, ``is_impossible`` and ``qas_id`` attributes are read.
        max_seq_length: maximum length of each encoded sequence.
        doc_stride: number of document tokens between the starts of two consecutive spans.
        max_query_length: maximum number of query tokens kept.
        padding_strategy: forwarded as ``padding`` to ``tokenizer.encode_plus``.
        is_training: whether answer positions should be computed.

    Returns:
        A list of ``SquadFeatures``. The list is empty when ``is_training`` is set, the example is answerable
        and the annotated answer cannot be found in the document tokens.
    """
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
            return []

    # Sub-tokenize every whitespace token and keep the mapping in both directions.
    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        if tokenizer.__class__.__name__ in [
            "RobertaTokenizer",
            "LongformerTokenizer",
            "BartTokenizer",
            "RobertaTokenizerFast",
            "LongformerTokenizerFast",
            "BartTokenizerFast",
        ]:
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    spans = []

    truncated_query = tokenizer.encode(
        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
    )

    # Tokenizers that insert 2 SEP tokens in-between <context> & <question> need special handling
    # in the way they compute the mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):

        # Define the side we want to truncate / pad and the text/pair sorting
        if tokenizer.padding_side == "right":
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND.value
        else:
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST.value

        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            texts,
            pairs,
            truncation=truncation,
            padding=padding_strategy,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        # Strip the padding so that `tokens` only holds real tokens.
        input_ids = encoded_dict["input_ids"]
        if tokenizer.pad_token_id in input_ids:
            if tokenizer.padding_side == "right":
                non_padded_ids = input_ids[: input_ids.index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = len(input_ids) - 1 - input_ids[::-1].index(tokenizer.pad_token_id)
                non_padded_ids = input_ids[last_padding_id_position + 1 :]
        else:
            non_padded_ids = input_ids

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        # Stop once the tokenizer reports nothing left to encode.
        if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index, doc_span in enumerate(spans):
        for j in range(doc_span["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (
                j
                if tokenizer.padding_side == "left"
                else doc_span["truncated_query_with_special_tokens_length"] + j
            )
            doc_span["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
        # Original TF implem also keep the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
        else:
            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0

        # `input_ids` is a plain list: comparing it directly to an int yields a single `False`, so it has
        # to be turned into an array for the comparison to be element-wise.
        pad_token_indices = np.where(np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = not (tok_start_position >= doc_start and tok_end_position <= doc_end)

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features

312

313
def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
    """
    Store ``tokenizer_for_convert`` in the module-level global ``tokenizer``.

    Used as the ``initializer`` of the worker pool so that :func:`squad_convert_example_to_features`, which reads
    that global, has a tokenizer available in every worker process.
    """
    global tokenizer
    tokenizer = tokenizer_for_convert

317
318

def squad_convert_examples_to_features(
    examples,
    tokenizer,
    max_seq_length,
    doc_stride,
    max_query_length,
    is_training,
    padding_strategy="max_length",
    return_dataset=False,
    threads=1,
    tqdm_enabled=True,
):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model. It is
    model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        padding_strategy: Default to "max_length". Which padding strategy to use
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
        threads: number of worker processes to use (capped at the number of CPUs).
        tqdm_enabled: Default True. Set to False to disable the progress bars.

    Returns:
        - if ``return_dataset`` is neither ``"pt"`` nor ``"tf"``: list of
          :class:`~transformers.data.processors.squad.SquadFeatures`
        - if ``return_dataset == "pt"``: a tuple ``(features, dataset)`` where ``dataset`` is a ``TensorDataset``
        - if ``return_dataset == "tf"``: a ``tf.data.Dataset`` built from a generator over the features

    Raises:
        RuntimeError: if ``return_dataset`` requests a framework that is not installed.

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """
    threads = min(threads, cpu_count())
    # Each worker receives the tokenizer once through the initializer rather than with every task.
    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
        annotate_ = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            padding_strategy=padding_strategy,
            is_training=is_training,
        )
        features = list(
            tqdm(
                p.imap(annotate_, examples, chunksize=32),
                total=len(examples),
                desc="convert squad examples to features",
                disable=not tqdm_enabled,
            )
        )

    # Flatten the per-example lists and assign the ids that could not be set inside the workers.
    # Examples that produced no feature are skipped and do not consume an example index.
    new_features = []
    unique_id = 1000000000
    example_index = 0
    for example_features in tqdm(
        features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled
    ):
        if not example_features:
            continue
        for example_feature in example_features:
            example_feature.example_index = example_index
            example_feature.unique_id = unique_id
            new_features.append(example_feature)
            unique_id += 1
        example_index += 1
    features = new_features

    if return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)

        if not is_training:
            all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
            )
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
                all_is_impossible,
            )

        return features, dataset
    elif return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")

        def gen():
            for i, ex in enumerate(features):
                inputs = {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}
                # token_type_ids are only emitted for features that carry them.
                if ex.token_type_ids is not None:
                    inputs["token_type_ids"] = ex.token_type_ids
                inputs["feature_index"] = i
                inputs["qas_id"] = ex.qas_id

                yield (
                    inputs,
                    {
                        "start_positions": ex.start_position,
                        "end_positions": ex.end_position,
                        "cls_index": ex.cls_index,
                        "p_mask": ex.p_mask,
                        "is_impossible": ex.is_impossible,
                    },
                )

        # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
        input_types = {"input_ids": tf.int32, "attention_mask": tf.int32}
        input_shapes = {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}
        if "token_type_ids" in tokenizer.model_input_names:
            input_types["token_type_ids"] = tf.int32
            input_shapes["token_type_ids"] = tf.TensorShape([None])
        input_types["feature_index"] = tf.int64
        input_types["qas_id"] = tf.string
        input_shapes["feature_index"] = tf.TensorShape([])
        input_shapes["qas_id"] = tf.TensorShape([])

        train_types = (
            input_types,
            {
                "start_positions": tf.int64,
                "end_positions": tf.int64,
                "cls_index": tf.int64,
                "p_mask": tf.int32,
                "is_impossible": tf.int32,
            },
        )
        train_shapes = (
            input_shapes,
            {
                "start_positions": tf.TensorShape([]),
                "end_positions": tf.TensorShape([]),
                "cls_index": tf.TensorShape([]),
                "p_mask": tf.TensorShape([None]),
                "is_impossible": tf.TensorShape([]),
            },
        )

        return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
    else:
        return features
Lysandre's avatar
Lysandre committed
540

Lysandre's avatar
wip  
Lysandre committed
541

Lysandre's avatar
Lysandre committed
542
class SquadProcessor(DataProcessor):
    """
    Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
    version 2.0 of SQuAD, respectively.
    """

    # Default file names; subclasses must set them.
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        """
        Build a :class:`SquadExample` from one TFDS record.

        In training mode only the first answer is used (as ``answer_text`` / ``start_position_character``); in
        evaluation mode every answer is kept in ``answers`` and the single-answer fields are ``None``.
        """
        if not evaluate:
            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
            answers = []
        else:
            answers = [
                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
            ]

            answer = None
            answer_start = None

        return SquadExample(
            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
            question_text=tensor_dict["question"].numpy().decode("utf-8"),
            context_text=tensor_dict["context"].numpy().decode("utf-8"),
            answer_text=answer,
            start_position_character=answer_start,
            title=tensor_dict["title"].numpy().decode("utf-8"),
            answers=answers,
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
        """
        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
            evaluate: Boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples::

            >>> import tensorflow_datasets as tfds
            >>> dataset = tfds.load("squad")
            >>> processor = SquadV1Processor()

            >>> training_examples = processor.get_examples_from_dataset(dataset, evaluate=False)
            >>> evaluation_examples = processor.get_examples_from_dataset(dataset, evaluate=True)
        """

        if evaluate:
            dataset = dataset["validation"]
        else:
            dataset = dataset["train"]

        return [self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate) for tensor_dict in tqdm(dataset)]

    def _load_input_data(self, data_dir, default_file, filename):
        """Read a SQuAD json file from ``data_dir`` and return the content of its ``"data"`` entry."""
        if data_dir is None:
            data_dir = ""

        if default_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(
            os.path.join(data_dir, default_file if filename is None else filename), "r", encoding="utf-8"
        ) as reader:
            return json.load(reader)["data"]

    def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        Raises:
            ValueError: if the class does not define ``train_file``.
        """
        input_data = self._load_input_data(data_dir, self.train_file, filename)
        return self._create_examples(input_data, "train")

    def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation example from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        Raises:
            ValueError: if the class does not define ``dev_file``.
        """
        input_data = self._load_input_data(data_dir, self.dev_file, filename)
        return self._create_examples(input_data, "dev")

    def _create_examples(self, input_data, set_type):
        """
        Turn the ``"data"`` list of a SQuAD json file into a list of :class:`SquadExample`.

        When ``set_type`` is ``"train"`` only the first answer of each answerable question is used; otherwise all
        answers are passed through in ``answers``.
        """
        is_training = set_type == "train"
        examples = []
        for entry in tqdm(input_data):
            title = entry["title"]
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position_character = None
                    answer_text = None
                    answers = []

                    # The key is looked up with a default of False, so records without it count as answerable.
                    is_impossible = qa.get("is_impossible", False)
                    if not is_impossible:
                        if is_training:
                            answer = qa["answers"][0]
                            answer_text = answer["text"]
                            start_position_character = answer["answer_start"]
                        else:
                            answers = qa["answers"]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers,
                    )
                    examples.append(example)
        return examples

Lysandre's avatar
Lysandre committed
685

Lysandre's avatar
Lysandre committed
686
687
688
689
690
691
692
693
class SquadV1Processor(SquadProcessor):
    """:class:`SquadProcessor` bound to the SQuAD v1.1 train/dev file names."""

    # File names of the v1.1 splits. NOTE(review): presumably looked up by the parent class inside a
    # user-supplied data directory -- confirm against SquadProcessor.
    train_file = "train-v1.1.json"
    dev_file = "dev-v1.1.json"


class SquadV2Processor(SquadProcessor):
    """:class:`SquadProcessor` bound to the SQuAD v2.0 train/dev file names."""

    # File names of the v2.0 splits. NOTE(review): presumably looked up by the parent class inside a
    # user-supplied data directory -- confirm against SquadProcessor.
    train_file = "train-v2.0.json"
    dev_file = "dev-v2.0.json"
Lysandre's avatar
Lysandre committed
694

LysandreJik's avatar
LysandreJik committed
695

696
class SquadExample:
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
            When left to None, the example gets its own new empty list.
        is_impossible: False by default, set to True if the example has no possible answer.

    Attributes set in addition to the arguments above:
        doc_tokens: The context split on whitespace.
        char_to_word_offset: For every character of the context, the index in ``doc_tokens`` of the word it belongs
            to (whitespace characters point to the preceding word, or to -1 before the first word).
        start_position, end_position: Word indices (in ``doc_tokens``) of the first and last word of the answer.
            Both are 0 when ``start_position_character`` is None or the example is impossible.
    """

    def __init__(
        self,
        qas_id,
        question_text,
        context_text,
        answer_text,
        start_position_character,
        title,
        answers=None,
        is_impossible=False,
    ):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        # Never use a mutable default: a shared ``[]`` would be aliased by every example built without ``answers``,
        # so mutating one example's answers would silently change all the others.
        self.answers = [] if answers is None else answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # Split on whitespace so that different tokens may be attributed to their original position.
        for c in self.context_text:
            if _is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    # First character after whitespace (or at the very start): open a new word.
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Word-level start and end positions are only computed when an answer start character is provided and the
        # example is answerable; otherwise both keep their default value of 0.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            # Clamp the index of the last answer character to the context length, in case the answer text
            # runs past the end of the context.
            self.end_position = char_to_word_offset[
                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
            ]
LysandreJik's avatar
LysandreJik committed
757
758


759
class SquadFeatures:
    """
    Model-ready features of a single SQuAD example. They are model-specific and are built from a
    :class:`~transformers.data.processors.squad.SquadExample` by
    :func:`~transformers.data.processors.squad.squad_convert_examples_to_features`.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer.
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature
            object. If a token does not have its maximum context in this feature object, it means that another
            feature object has more information related to that token and should be prioritized over this feature
            for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
        is_impossible: whether the example has no possible answer.
        qas_id: optional identifier of the question/answer pair this feature comes from.
        encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods.
    """

    def __init__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        cls_index,
        p_mask,
        example_index,
        unique_id,
        paragraph_len,
        token_is_max_context,
        tokens,
        token_to_orig_map,
        start_position,
        end_position,
        is_impossible,
        qas_id: str = None,
        encoding: BatchEncoding = None,
    ):
        # Tensors-to-be that are fed to the model.
        self.input_ids, self.attention_mask, self.token_type_ids = input_ids, attention_mask, token_type_ids
        self.cls_index, self.p_mask = cls_index, p_mask

        # Bookkeeping needed to map predictions back to the original example and text.
        self.example_index, self.unique_id = example_index, unique_id
        self.paragraph_len, self.token_is_max_context = paragraph_len, token_is_max_context
        self.tokens, self.token_to_orig_map = tokens, token_to_orig_map

        # Answer span labels and example identity.
        self.start_position, self.end_position = start_position, end_position
        self.is_impossible, self.qas_id = is_impossible, qas_id

        # Optional tokenizer output kept alongside the features.
        self.encoding = encoding

Lysandre's avatar
Lysandre committed
824

825
class SquadResult:
    """
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
        start_top_index: Optional. When it is not None, it is stored on the result together with ``end_top_index``
            and ``cls_logits``; when it is None, none of these three attributes is set.
        end_top_index: Optional, only stored when ``start_top_index`` is provided.
        cls_logits: Optional, only stored when ``start_top_index`` is provided.
    """

    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.unique_id = unique_id

        # Test against None explicitly instead of relying on truthiness: array-like values may raise on bool()
        # (ambiguous truth value), and a legitimately falsy value such as an empty list must not be dropped.
        if start_top_index is not None:
            self.start_top_index = start_top_index
            self.end_top_index = end_top_index
            self.cls_logits = cls_logits