# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
user-facing encoding methods), SpecialTokensMixin (hosts the special tokens logic) and BatchEncoding (wraps the
dictionary of outputs with special methods for the fast tokenizers).
"""

import copy
import json
import os
import re
import warnings
from collections import OrderedDict, UserDict
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np
from packaging import version

from . import __version__
from .dynamic_module_utils import custom_object_save
from .utils import (
    ExplicitEnum,
    PaddingStrategy,
    PushToHubMixin,
    TensorType,
    add_end_docstrings,
    cached_file,
    copy_func,
    download_url,
    extract_commit_hash,
    is_flax_available,
    is_offline_mode,
    is_remote_url,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    logging,
    to_py_obj,
    torch_required,
)
from .utils.generic import _is_jax, _is_numpy, _is_tensorflow, _is_torch, _is_torch_device


if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf
    if is_flax_available():
        import jax.numpy as jnp  # noqa: F401


if is_tokenizers_available():
    from tokenizers import AddedToken
    from tokenizers import Encoding as EncodingFast
else:

    @dataclass(frozen=True, eq=True)
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
        way it should behave.
        """

        content: str = field(default_factory=str)
        single_word: bool = False
        lstrip: bool = False
        rstrip: bool = False
        normalized: bool = True

        def __getstate__(self):
            return self.__dict__

    @dataclass
    class EncodingFast:
        """This is dummy class because without the `tokenizers` library we don't have these objects anyway"""

        pass


logger = logging.get_logger(__name__)

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]


# Slow tokenizers used to be saved in three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")


class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in
    an IDE.
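
    Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer checkpoint is available; the string
    value and the enum member request the same behavior):

    ```python
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    enc = tokenizer("first sequence", "second sequence", truncation="longest_first", max_length=8)
    enc = tokenizer("first sequence", "second sequence", truncation=TruncationStrategy.LONGEST_FIRST, max_length=8)
    ```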
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (`int`): Index of the first character in the original string.
        end (`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (`int`): Index of the first token in the span.
        end (`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int


class BatchEncoding(UserDict):
    """
    Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
    [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and
    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
            ('input_ids', 'attention_mask', etc.).
        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
            space to token space, the `tokenizers.Encoding` instance or list of instances (for batches) holds this
            information.
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers into PyTorch/TensorFlow/NumPy tensors at
            initialization.
        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
        n_sequences (`Optional[int]`, *optional*):
            The number of sequences used to generate each sample in this [`BatchEncoding`] (`1` for a single sequence,
            `2` for a pair of sequences).
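
    Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer checkpoint is available):

    ```python
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    encoding = tokenizer(["Hello world", "How are you?"])  # returns a BatchEncoding

    encoding["input_ids"]  # dictionary-style access to the encoded inputs
    encoding[0]  # backend `tokenizers.Encoding` of the first sample (fast tokenizers only)
    encoding.is_fast  # True, since a fast tokenizer produced this BatchEncoding
    ```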
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        self._encodings = encoding

        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        self._n_sequences = n_sequences

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def n_sequences(self) -> Optional[int]:
        """
        `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
        sentences)
        """
        return self._n_sequences

    @property
    def is_fast(self) -> bool:
        """
        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
        or not.
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError

    def __getstate__(self):
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        `Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns `None`
        if the input was tokenized through Python (i.e., not through a fast tokenizer).
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
        integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[str]`: The list of tokens at that index.
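
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Hello world")
        encoding.tokens()  # e.g. ['[CLS]', 'hello', 'world', '[SEP]']
        ```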
        """
        if not self._encodings:
            raise ValueError(
                "tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        return self._encodings[batch_index].tokens

    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to the id of their original sentences:

            - `None` for special tokens added around or between sequences,
            - `0` for tokens corresponding to words in the first sequence,
            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
            by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
            sequence.
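
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("What is love?", "Baby don't hurt me")
        encoding.sequence_ids()  # None for special tokens, 0 for the first sequence, 1 for the second
        ```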
        """
        if not self._encodings:
            raise ValueError(
                "sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        return self._encodings[batch_index].sequence_ids

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError(
                "words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        return self.word_ids(batch_index)

    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
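
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available; the exact split
        depends on the checkpoint's vocabulary):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Tokenizers are great")
        encoding.word_ids()  # e.g. [None, 0, 0, 0, 1, 2, None] -- sub-words share the index of their word
        ```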
        """
        if not self._encodings:
            raise ValueError(
                "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        return self._encodings[batch_index].word_ids

    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
        for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair

        Can be called as:

        - `self.token_to_sequence(token_index)` if batch size is 1
        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the sequence the token belongs to (`0` or `1`).
        """

        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_sequence(token_index)

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding to (i.e. comprising) an encoded token in a sequence of the batch.

        Can be called as:

        - `self.token_to_word(token_index)` if batch size is 1
        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the word in the input sequence.
        """

        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
          1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            Optional [`~tokenization_utils_base.TokenSpan`]: Span of tokens in the encoded sequence. Returns `None` if
            no tokens correspond to the word.
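
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available; the exact span
        depends on the checkpoint's vocabulary):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Tokenizers are great")
        encoding.word_to_tokens(0)  # e.g. TokenSpan(start=1, end=4): the sub-tokens of the first word
        ```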
        """

        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return TokenSpan(*span) if span is not None else None

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - `self.token_to_chars(token_index)` if batch size is 1
        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
                the sequence.

        Returns:
            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or `None` if the token
            (e.g. <s>, </s>) doesn't correspond to any chars in the original string.
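
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Hello world")
        encoding.token_to_chars(1)  # e.g. CharSpan(start=0, end=5), the characters of "Hello"
        ```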
        """

        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        span_indices = self._encodings[batch_index].token_to_chars(token_index)

        return CharSpan(*span_indices) if span_indices is not None else None

    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - `self.char_to_token(char_index)` if batch size is 1
        - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the sequence.
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.


        Returns:
            `int`: Index of the token.
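
        Example (a minimal sketch, assuming the `bert-base-uncased` fast tokenizer is available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Hello world")
        encoding.char_to_token(6)  # index of the token covering the character "w", e.g. 2
        ```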
        """

        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)

    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - `self.word_to_chars(word_index)` if batch size is 1
        - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
            are NamedTuples with:

                - start: index of the first character associated to the token in the original string
                - end: index of the character following the last character associated to the token in the original
                  string
        """

        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.

        Can be called as:

        - `self.char_to_word(char_index)` if batch size is 1
        - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the original string.
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
                original string.
            sequence_index (`int`, *optional*, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.


        Returns:
            `int` or `List[int]`: Index or indices of the associated word(s) in the original string.
        """

        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index, sequence_index)

    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
            prepend_batch_axis (`bool`, *optional*, defaults to `False`):
                Whether or not to add the batch dimension during the conversion.
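
        Example (a minimal sketch, assuming PyTorch is installed and the `bert-base-uncased` fast tokenizer is
        available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer(["Hello world", "Hi"], padding=True)
        encoding.convert_to_tensors(tensor_type="pt")  # converts the values in place and returns self
        encoding["input_ids"].shape  # now a torch.Tensor
        ```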
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW:
            if not is_tf_available():
                raise ImportError(
                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
                )
            import tensorflow as tf

            as_tensor = tf.constant
            is_tensor = tf.is_tensor
        elif tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        elif tensor_type == TensorType.JAX:
            if not is_flax_available():
                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
            import jax.numpy as jnp  # noqa: F811

            as_tensor = jnp.array
            is_tensor = _is_jax
        else:
            as_tensor = np.asarray
            is_tensor = _is_numpy
        # (mfuntowicz: This code is unreachable)
        # else:
        #     raise ImportError(
        #         f"Unable to convert output to tensors format {tensor_type}"
        #     )

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)

                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
                    # # at-least2d
                    # if tensor.ndim > 2:
                    #     tensor = tensor.squeeze(0)
                    # elif tensor.ndim < 2:
                    #     tensor = tensor[None, :]

                    self[key] = tensor
            except:  # noqa E722
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    )
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding with"
                    " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
                    f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
                    " expected)."
                )

        return self

    @torch_required
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling `v.to(device)` (PyTorch only).

        Args:
            device (`str` or `torch.device`): The device to put the tensors on.

        Returns:
            [`BatchEncoding`]: The same instance after modification.
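
        Example (a minimal sketch, assuming PyTorch with CUDA and the `bert-base-uncased` fast tokenizer are
        available):

        ```python
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        encoding = tokenizer("Hello world", return_tensors="pt")
        encoding = encoding.to("cuda")  # all tensor values are moved to the GPU
        ```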
        """

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class SpecialTokensMixin:
    """
    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
    special tokens. In particular, this class holds the attributes which can be used to directly access these special
    tokens in a model-independent manner and allows setting and updating the special tokens.

    Args:
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the beginning of a sentence.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the end of a sentence.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing an out-of-vocabulary token.
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
            A tuple or a list of additional special tokens.
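
    Example (a minimal sketch, assuming the `bert-base-uncased` checkpoint is available; the mixin attributes are
    exposed on every tokenizer):

    ```python
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.cls_token  # '[CLS]'
    tokenizer.mask_token_id  # id of the mask token in the vocabulary
    tokenizer.additional_special_tokens  # [] unless extra special tokens were added
    ```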
    """

    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]

    def __init__(self, verbose=True, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose

        # We directly set the hidden value to allow initialization with special tokens
        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
        # TODO clean this up at some point (probably by switching to fast tokenizers)
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(
                        isinstance(t, (str, AddedToken)) for t in value
                    ), "One of the tokens is not a string or an AddedToken"
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")

    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
        `tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            `int`: The number of tokens added in the vocabulary during the operation.
        """
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
        of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Using `add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
        `'</s>'`).

        Args:
            special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assigns the index of the `unk_token` to them).

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        ```python
        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2Model.from_pretrained("gpt2")

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
        ```"""
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

            if self.verbose:
                logger.info(f"Assigning {value} to the {key} key of the tokenizer")
            setattr(self, key, value)

            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, (str, AddedToken)) for t in value
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                assert isinstance(
                    value, (str, AddedToken)
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens

    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from the length of the current vocabulary and will be isolated before the tokenization
        algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
        not treated in the same way.

        Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
        of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Args:
            new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
                token to let you personalize its behavior: whether this token should only match against a single word,
                whether this token should strip all potential whitespaces on the left side, whether this token should
                strip all potential whitespaces on the right side, etc.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for `tokenizers.AddedToken` in the HuggingFace tokenizers library.

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```"""
        if not new_tokens:
            return 0

        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]

        return self._add_tokens(new_tokens, special_tokens=special_tokens)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        raise NotImplementedError

    @property
    def bos_token(self) -> str:
        """
        `str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None:
            if self.verbose:
                logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        `str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None:
            if self.verbose:
                logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        `str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None:
            if self.verbose:
                logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
        having been set.
        """
        if self._sep_token is None:
            if self.verbose:
                logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        `str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None:
            if self.verbose:
                logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
        depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None:
            if self.verbose:
                logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.
        """
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
        set.
        """
        if self._additional_special_tokens is None:
            if self.verbose:
                logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]

    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    @property
    def bos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not
        been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns `None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        `int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
        leveraging self-attention along the full depth of the model.

        Returns `None` if the token has not been set.
        """
        if self._cls_token is None:
            return None
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns `None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
        been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

    @bos_token_id.setter
    def bos_token_id(self, value):
        self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None

    @eos_token_id.setter
    def eos_token_id(self, value):
        self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None

    @unk_token_id.setter
    def unk_token_id(self, value):
        self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None

    @sep_token_id.setter
    def sep_token_id(self, value):
        self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None

    @pad_token_id.setter
    def pad_token_id(self, value):
        self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None

    @cls_token_id.setter
    def cls_token_id(self, value):
        self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None

    @mask_token_id.setter
    def mask_token_id(self, value):
        self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]

    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
        `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Convert potential tokens of `tokenizers.AddedToken` type to string.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = (
                    type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value)
                    if isinstance(attr_value, (list, tuple))
                    else str(attr_value)
                )
        return set_attr
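
    # Illustrative output of the property above (values assume a BERT-style checkpoint such as
    # "bert-base-uncased"; other tokenizers will differ):
    #
    #     tokenizer.special_tokens_map
    #     # {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
    #     #  'cls_token': '[CLS]', 'mask_token': '[MASK]'}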

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping
        special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr
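
    # Sketch of why the extended variant matters (assumes the `tokenizers` library is installed):
    # an `AddedToken` keeps its stripping options here, while `special_tokens_map` flattens it to a
    # plain string.
    #
    #     tokenizer.mask_token = AddedToken("<mask>", lstrip=True)
    #     tokenizer.special_tokens_map_extended["mask_token"]  # AddedToken("<mask>", lstrip=True, ...)
    #     tokenizer.special_tokens_map["mask_token"]           # "<mask>"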

    @property
    def all_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of `tokenizers.AddedToken` type to string.
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
        attributes.

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        all_toks = []
        set_attr = self.special_tokens_map_extended
        for attr_value in set_attr.values():
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(OrderedDict.fromkeys(all_toks))
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        """
        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids
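
    # Worked example of the properties above (ids assume "bert-base-uncased"; they are shown only to
    # make the relation concrete):
    #
    #     tokenizer.all_special_tokens  # ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
    #     tokenizer.all_special_ids     # [100, 102, 0, 101, 103]
    #     # i.e. tokenizer.convert_tokens_to_ids(tokenizer.all_special_tokens)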


ENCODE_KWARGS_DOCSTRING = r"""
            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to encode the sequences with the special tokens relative to their model.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            max_length (`int`, *optional*):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet) truncation/padding to a maximum length will be deactivated.
            stride (`int`, *optional*, defaults to 0):
                If set to a number along with `max_length`, the overflowing tokens returned when
                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                returned to provide some overlap between truncated and overflowing sequences. The value of this
                argument defines the number of overlapping tokens.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
"""

ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (`bool`, *optional*):
                Whether to return token type IDs. If left to the default, will return the token type IDs according to
                the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are token type IDs?](../glossary#token-type-ids)
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
                of returning overflowing tokens.
            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                Whether or not to return special tokens mask information.
            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
                Whether or not to return `(char_start, char_end)` for each token.

                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`]; if using
                Python's tokenizer, this method will raise `NotImplementedError`.
            return_length  (`bool`, *optional*, defaults to `False`):
                Whether or not to return the lengths of the encoded inputs.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
            **kwargs: passed to the `self.tokenize()` method

        Return:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.

              [What are input IDs?](../glossary#input-ids)

            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
              if *"token_type_ids"* is in `self.model_input_names`).

              [What are token type IDs?](../glossary#token-type-ids)

            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

              [What are attention masks?](../glossary#attention-mask)

            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
            - **length** -- The length of the inputs (when `return_length=True`)
"""

INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)

        - **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
          vocabulary file required by the model, and as associated values, the filename for saving the associated file
          (string).
        - **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
          low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
          associated pretrained vocabulary file.
        - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
          of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
          or `None` if the model has no maximum input size.
        - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
          pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
          with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] method.
        - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
          Should be `'right'` or `'left'`.
        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
          applied. Should be `'right'` or `'left'`.

    Args:
        model_max_length (`int`, *optional*):
            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
            loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
            value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
            default to VERY_LARGE_INTEGER (`int(1e30)`).
        padding_side (`str`, *optional*):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        truncation_side (`str`, *optional*):
            The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        model_input_names (`List[string]`, *optional*):
            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
            `"attention_mask"`). Default value is picked from the class attribute of the same name.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
            `self.bos_token_id`.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the end of a sentence. Will be associated to `self.eos_token` and
            `self.eos_token_id`.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
            `self.unk_token_id`.
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance). Will be
            associated to `self.sep_token` and `self.sep_token_id`.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`.
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the class of the input (used by BERT for instance). Will be associated to
            `self.cls_token` and `self.cls_token_id`.
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
            tokenization process. Will be associated to `self.additional_special_tokens` and
            `self.additional_special_tokens_ids`.
"""


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
    """
    Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].

    Handles shared (mostly boilerplate) methods for those two classes.
    """

    vocab_files_names: Dict[str, str] = {}
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    max_model_input_sizes: Dict[str, Optional[int]] = {}
    _auto_class: Optional[str] = None

    # first name has to correspond to main model input name
    # to make sure `tokenizer.pad(...)` works correctly
    model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
    padding_side: str = "right"
    truncation_side: str = "right"
    slow_tokenizer_class = None

    def __init__(self, **kwargs):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
        self.name_or_path = kwargs.pop("name_or_path", "")
        self._processor_class = kwargs.pop("processor_class", None)

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

        # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
        # is changed.
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        if self.padding_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
            )

        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
        if self.truncation_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.truncation_side}"
            )

        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

        self.deprecation_warnings = (
            {}
        )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
        self._in_target_context_manager = False
        super().__init__(**kwargs)

    @property
    def max_len_single_sentence(self) -> int:
        """
        `int`: The maximum length of a sentence that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)

    @property
    def max_len_sentences_pair(self) -> int:
        """
        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)
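
    # Worked example for the two properties above, assuming a BERT-like tokenizer with
    # model_max_length == 512 that adds [CLS]/[SEP]:
    #
    #     tokenizer.num_special_tokens_to_add(pair=False)  # 2 -> max_len_single_sentence == 510
    #     tokenizer.num_special_tokens_to_add(pair=True)   # 3 -> max_len_sentences_pair == 509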

    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # For backward compatibility, allow attempts to set 'max_len_single_sentence'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
            if not self.deprecation_warnings.get("max_len_single_sentence", False):
                logger.warning(
                    "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
                )
            self.deprecation_warnings["max_len_single_sentence"] = True
        else:
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )

    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # For backward compatibility, allow attempts to set 'max_len_sentences_pair'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
                logger.warning(
                    "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
                )
            self.deprecation_warnings["max_len_sentences_pair"] = True
        else:
            raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")

    def _set_processor_class(self, processor_class: str):
        """Sets processor class as an attribute."""
        self._processor_class = processor_class

    def __repr__(self) -> str:
        return (
            f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}',"
            f" vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast},"
            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
            f" special_tokens={self.special_tokens_map_extended})"
        )

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
        vocab.

        Returns:
            `Dict[str, int]`: The vocabulary.
        """
        raise NotImplementedError()
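
    # Usage sketch for subclass implementations of `get_vocab` (illustrative values only):
    #
    #     vocab = tokenizer.get_vocab()            # e.g. {"[PAD]": 0, ..., "[UNK]": 100, ...}
    #     id_to_token = {i: t for t, i in vocab.items()}
    #     assert vocab[tokenizer.unk_token] == tokenizer.unk_token_id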

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        r"""
        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
        tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                  `./my_model_directory/vocab.txt`.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if
                the standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the vocabulary files and override the cached versions if
                they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            use_auth_token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
                when running `huggingface-cli login` (stored in `~/.huggingface`).
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether or not to only rely on local files and not to attempt to download any files.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__` method.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
                `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__` for more details.

        <Tip>

        Passing `use_auth_token=True` is required when you want to use a private model.

        </Tip>

        Examples:

        ```python
        # We can't instantiate the base class *PreTrainedTokenizerBase* directly, so let's show our examples on a derived class: BertTokenizer
        # Download vocabulary from huggingface.co and cache.
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Download vocabulary from huggingface.co (user-uploaded) and cache.
        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise, use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
        assert tokenizer.unk_token == "<unk>"
        ```"""
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", None)
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)
        commit_hash = kwargs.pop("_commit_hash", None)

        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        vocab_files = {}
        init_configuration = {}

        is_local = os.path.isdir(pretrained_model_name_or_path)
        if os.path.isfile(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
                )
            warnings.warn(
                f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
                "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
                FutureWarning,
            )
            file_id = list(cls.vocab_files_names.keys())[0]

            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
            }
            vocab_files = {**cls.vocab_files_names, **additional_files_names}

            if "tokenizer_file" in vocab_files:
                # Try to get the tokenizer config to see if there are versioned tokenizer files.
                fast_tokenizer_file = FULL_TOKENIZER_FILE
                resolved_config_file = cached_file(
                    pretrained_model_name_or_path,
                    TOKENIZER_CONFIG_FILE,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=resume_download,
                    proxies=proxies,
                    use_auth_token=use_auth_token,
                    revision=revision,
                    local_files_only=local_files_only,
                    subfolder=subfolder,
                    user_agent=user_agent,
                    _raise_exceptions_for_missing_entries=False,
                    _raise_exceptions_for_connection_errors=False,
                    _commit_hash=commit_hash,
                )
                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
                if resolved_config_file is not None:
                    with open(resolved_config_file, encoding="utf-8") as reader:
                        tokenizer_config = json.load(reader)
                        if "fast_tokenizer_files" in tokenizer_config:
                            fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
                vocab_files["tokenizer_file"] = fast_tokenizer_file

        # Get files from url, cache, or disk depending on the case
        resolved_vocab_files = {}
        unresolved_files = []
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
            elif os.path.isfile(file_path):
                resolved_vocab_files[file_id] = file_path
            elif is_remote_url(file_path):
                resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
            else:
                resolved_vocab_files[file_id] = cached_file(
                    pretrained_model_name_or_path,
                    file_path,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    use_auth_token=use_auth_token,
                    user_agent=user_agent,
                    revision=revision,
                    subfolder=subfolder,
                    _raise_exceptions_for_missing_entries=False,
                    _raise_exceptions_for_connection_errors=False,
                    _commit_hash=commit_hash,
                )
                commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash)

        if len(unresolved_files) > 0:
            logger.info(
                f"Can't load following files from cache: {unresolved_files} and cannot check if these "
                "files are necessary for the tokenizer to operate."
            )

        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
            raise EnvironmentError(
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
                f"containing all relevant files for a {cls.__name__} tokenizer."
            )

        for file_id, file_path in vocab_files.items():
            if file_id not in resolved_vocab_files:
                continue

            if is_local:
                logger.info(f"loading file {file_path}")
            else:
                logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")

        return cls._from_pretrained(
            resolved_vocab_files,
            pretrained_model_name_or_path,
            init_configuration,
            *init_inputs,
            use_auth_token=use_auth_token,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            _commit_hash=commit_hash,
            **kwargs,
        )

    @classmethod
    def _from_pretrained(
        cls,
        resolved_vocab_files,
        pretrained_model_name_or_path,
        init_configuration,
        *init_inputs,
        use_auth_token=None,
        cache_dir=None,
        local_files_only=False,
        _commit_hash=None,
        **kwargs
    ):
        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
        # file or if `from_slow` is set to True.
        from_slow = kwargs.get("from_slow", False)
        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
                copy.deepcopy(init_configuration),
                *init_inputs,
                use_auth_token=use_auth_token,
                cache_dir=cache_dir,
                local_files_only=local_files_only,
                _commit_hash=_commit_hash,
                **(copy.deepcopy(kwargs)),
            )
        else:
            slow_tokenizer = None

        # Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
            config_tokenizer_class = init_kwargs.get("tokenizer_class")
            init_kwargs.pop("tokenizer_class", None)
            init_kwargs.pop("auto_map", None)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
        else:
            config_tokenizer_class = None
            init_kwargs = init_configuration

        if config_tokenizer_class is None:
            from .models.auto.configuration_auto import AutoConfig  # tests_ignore

            # Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
            try:
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    use_auth_token=use_auth_token,
                    cache_dir=cache_dir,
                    local_files_only=local_files_only,
                    _commit_hash=_commit_hash,
                )
                config_tokenizer_class = config.tokenizer_class
            except (OSError, ValueError, KeyError):
                # skip if an error occurred.
                config = None
            if config_tokenizer_class is None:
                # Third attempt. If we have not yet found the original type of the tokenizer we are loading,
                # we see if we can infer it from the type of the configuration file.
                from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES  # tests_ignore

                if hasattr(config, "model_type"):
                    model_type = config.model_type
                else:
                    # Fallback: use pattern matching on the string.
                    model_type = None
                    for pattern in TOKENIZER_MAPPING_NAMES.keys():
                        if pattern in str(pretrained_model_name_or_path):
                            model_type = pattern
                            break

                if model_type is not None:
                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
                        model_type, (None, None)
                    )
                    if config_tokenizer_class is None:
                        config_tokenizer_class = config_tokenizer_class_fast

        if config_tokenizer_class is not None:
            if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
                logger.warning(
                    "The tokenizer class you load from this checkpoint is not the same type as the class this"
                    " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
                    f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
                    f" from is '{cls.__name__}'."
                )

        # Update with newly provided kwargs
        init_kwargs.update(kwargs)

        # Convert AddedTokens serialized as dict to class instances
        def convert_added_tokens(obj: Union[AddedToken, Any]):
            if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
                obj.pop("__type")
                return AddedToken(**obj)
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v) for k, v in obj.items()}
            return obj

        init_kwargs = convert_added_tokens(init_kwargs)
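
        # Illustrative round trip handled by `convert_added_tokens` above (field values are just an
        # example): an entry loaded from tokenizer_config.json as
        #     {"__type": "AddedToken", "content": "<mask>", "lstrip": True, "rstrip": False,
        #      "single_word": False, "normalized": True}
        # is turned back into AddedToken("<mask>", lstrip=True, rstrip=False, single_word=False, normalized=True).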

        # Set max length if needed
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
            # if we're using a pretrained model, ensure the tokenizer
            # won't index sequences longer than the number of positional embeddings

            model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
            if model_max_length is not None and isinstance(model_max_length, (int, float)):

                model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
                # TODO(PVP) - uncomment following line in Transformers v5
                # init_kwargs["model_max_length"] = model_max_length
                # TODO(PVP) - remove in Transformers v5
                # ---
                init_kwargs["model_max_length"] = cls._eventually_correct_t5_max_length(
                    pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")
                )
                # ---

        # Merge resolved_vocab_files arguments in init_kwargs.
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer

        init_kwargs["name_or_path"] = pretrained_model_name_or_path

        # Instantiate tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
        except OSError:
            raise OSError(
                "Unable to load vocabulary from file. "
                "Please check that the provided vocabulary is accessible and not corrupted."
            )

        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
        # Removed: Now done at the base class level
        # tokenizer.init_inputs = init_inputs
        # tokenizer.init_kwargs = init_kwargs

        # If there is a complementary special token map, load it
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
            for key, value in special_tokens_map.items():
                if key in kwargs and kwargs[key]:
                    # This value has already been redefined by the kwargs
                    # We keep this new value and ignore the one stored in the special_tokens_map_file

                    continue

                if isinstance(value, dict):
                    value = AddedToken(**value)
                elif isinstance(value, list):
                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                setattr(tokenizer, key, value)

        # Add supplementary tokens.
        special_tokens = tokenizer.all_special_tokens
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                added_tok_encoder = json.load(added_tokens_handle)

            # Sort added tokens by index
            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))

            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
            # individual tokens would repeatedly rebuild a trie, which can be slow.
            is_last_special = None
            tokens = []

            for token, index in added_tok_encoder_sorted:
                current_index = len(tokenizer) + len(tokens)
                if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids(token) != index:
                    # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
                    # index is the current length of the tokenizer (not in vocabulary)
                    raise ValueError(
                        f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
                        f"{index}."
                    )
                elif not has_tokenizer_file and index != current_index:
                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
                    # current length of the tokenizer.
                    raise ValueError(
                        f"Non-consecutive added token '{token}' found. "
                        f"Should have index {current_index} but has index {index} in saved vocabulary."
                    )

                is_special = bool(token in special_tokens)
                if is_last_special is None or is_last_special == is_special:
                    tokens.append(token)
                else:
                    tokenizer.add_tokens(tokens, special_tokens=is_last_special)
                    tokens = [token]
                is_last_special = is_special

            if tokens:
                tokenizer.add_tokens(tokens, special_tokens=is_last_special)

        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
        added_tokens = tokenizer.sanitize_special_tokens()
        if added_tokens:
            logger.warning_advice(
                "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                " fine-tuned or trained."
            )

        return tokenizer

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        # This method should be deleted in Transformers v5
        # Its only purpose is to potentially throw a warning
        # that incorrectly defined max lengths of T5's tokenizer are used
        # which we will correct in Transformers v5.
        return max_model_length

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs,
    ) -> Tuple[str]:
        """
        Save the full tokenizer state.

        This method makes sure the full tokenizer can then be re-loaded using the
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.

        <Tip warning={true}>

        This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
        modifying `tokenizer.do_lower_case` after creation).

        </Tip>

        Args:
            save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
            legacy_format (`bool`, *optional*):
                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
                format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
                added_tokens file.

                If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
                "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
                loaded in the corresponding "slow" tokenizer.

                If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
                error is raised.
            filename_prefix (`str`, *optional*):
                A prefix to add to the names of the files saved by the tokenizer.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            A tuple of `str`: The files saved.
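
        Example (a minimal sketch; `AutoTokenizer` and the local path `./my_tokenizer` are illustrative choices, not
        requirements of this method):

        ```python
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> # Writes the tokenizer config, special tokens map and vocabulary files into the directory
        >>> saved_files = tokenizer.save_pretrained("./my_tokenizer")
        >>> # The same directory can then be passed back to `from_pretrained` to restore the tokenizer
        >>> reloaded = AutoTokenizer.from_pretrained("./my_tokenizer")
        ```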
        """
        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id, token = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        special_tokens_map_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )

        tokenizer_config = copy.deepcopy(self.init_kwargs)
        if len(self.init_inputs) > 0:
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
        for file_id in self.vocab_files_names.keys():
            tokenizer_config.pop(file_id, None)

        # Sanitize AddedTokens
        def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
            if isinstance(obj, AddedToken):
                out = obj.__getstate__()
                if add_type_field:
                    out["__type"] = "AddedToken"
                return out
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
            return obj

        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
        tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

        # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
        tokenizer_class = self.__class__.__name__
        # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
        if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
            tokenizer_class = tokenizer_class[:-4]
        tokenizer_config["tokenizer_class"] = tokenizer_class
        if getattr(self, "_auto_map", None) is not None:
            tokenizer_config["auto_map"] = self._auto_map
        if getattr(self, "_processor_class", None) is not None:
            tokenizer_config["processor_class"] = self._processor_class

        # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=tokenizer_config)

        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

        # Sanitize AddedTokens in special_tokens_map
        write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"Special tokens file saved in {special_tokens_map_file}")

        file_names = (tokenizer_config_file, special_tokens_map_file)

        save_files = self._save_pretrained(
            save_directory=save_directory,
            file_names=file_names,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
        )

        if push_to_hub:
            self._upload_modified_files(
                save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
            )

        return save_files

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`] method.
        """
        if legacy_format is False:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        save_directory = str(save_directory)

        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
        added_vocab = self.get_added_vocab()
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                f.write(out_str)
                logger.info(f"added tokens file saved in {added_tokens_file}")

        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

        return file_names + vocab_files + (added_tokens_file,)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
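
        Example (an illustrative sketch; this base method is abstract, so the call assumes a concrete tokenizer class
        loaded from a real checkpoint, and the target directory is an arbitrary choice):

        ```python
        >>> import os
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> os.makedirs("./vocab_dir", exist_ok=True)
        >>> # Returns a tuple with the paths of the written vocabulary files
        >>> vocab_files = tokenizer.save_vocabulary("./vocab_dir", filename_prefix="my-prefix")
        ```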
        """
        raise NotImplementedError

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.

        Args:
            text (`str`):
                The sequence to be encoded.
            pair (`str`, *optional*):
                A second sequence to be encoded with the first.
            add_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to add the special tokens associated with the corresponding model.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method. See details in
                [`~PreTrainedTokenizerBase.__call__`]

        Returns:
            `List[str]`: The list of tokens.
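
        Example (a small sketch; the checkpoint name is an arbitrary choice and the exact tokens depend on the
        tokenizer's vocabulary):

        ```python
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> # Splits the text into vocabulary tokens, e.g. lower-cased word pieces for this checkpoint
        >>> tokens = tokenizer.tokenize("Hello world!")
        ```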
        """
        raise NotImplementedError

    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
            **kwargs: Passed along to the `.tokenize()` method.
        """,
        """
        Returns:
            `List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the text.
        """,
    )
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.

        Same as doing `self.convert_tokens_to_ids(self.tokenize(text))`.

        Args:
            text (`str`, `List[str]` or `List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
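
        Example (a minimal sketch; `bert-base-uncased` is only an illustrative checkpoint):

        ```python
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> # Returns a plain list of token ids, including the model's special tokens ([CLS]/[SEP] here)
        >>> ids = tokenizer.encode("Hello world", add_special_tokens=True)
        >>> text = tokenizer.decode(ids)
        ```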
        """
        encoded_inputs = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoded_inputs["input_ids"]

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        raise NotImplementedError

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                        " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
                        " tokenizer you can select this strategy more precisely by providing a specific strategy to"
                        " `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                if verbose:
                    if max_length is not None and (truncation is False or truncation == "do_not_truncate"):
                        warnings.warn(
                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                            "To pad to max length, use `padding='max_length'`."
                        )
                    if old_pad_to_max_length is not False:
                        warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.")
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
                    " `truncation=True` to truncate examples to a max length. You can give a specific length with"
                    " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
                    " size of the model (e.g. 512 for Bert).  If you have pairs of inputs, you can give a specific"
                    " truncation strategy selected among `truncation='only_first'` (will only truncate the first"
                    " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
                    " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
                    " in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no"
                                " predefined maximum length. Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has"
                                " no predefined maximum length. Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
        sequences.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
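
        Example (a short sketch of common usage; the checkpoint name and sentences are placeholders, and
        `return_tensors="pt"` assumes PyTorch is installed):

        ```python
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> # Single sentence -> dict-like `BatchEncoding` with `input_ids` and `attention_mask`
        >>> encoding = tokenizer("Hello world!")
        >>> # Batch of sentence pairs, padded to the longest entry and returned as PyTorch tensors
        >>> batch = tokenizer(
        ...     ["Hello world!", "How are you?"],
        ...     ["A first paired sentence.", "A second paired sentence."],
        ...     padding=True,
        ...     truncation=True,
        ...     return_tensors="pt",
        ... )
        ```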
        """
        # To avoid duplicating
        all_kwargs = dict(
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
        )
        all_kwargs.update(kwargs)
        if text is None and text_target is None:
            raise ValueError("You need to specify either `text` or `text_target`.")
        if text is not None:
            # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
            # input mode in this case.
            if not self._in_target_context_manager:
                self._switch_to_input_mode()
            encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
        if text_target is not None:
            self._switch_to_target_mode()
            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
        # Leave back tokenizer in input mode
        self._switch_to_input_mode()

        if text_target is None:
            return encodings
        elif text is None:
            return target_encodings
        else:
            encodings["labels"] = target_encodings["input_ids"]
            return encodings

    def _call_one(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        # Input type checking for clearer error
        def _is_valid_text_input(t):
            if isinstance(t, str):
                # Strings are fine
                return True
            elif isinstance(t, (list, tuple)):
                # List are fine as long as they are...
                if len(t) == 0:
                    # ... empty
                    return True
                elif isinstance(t[0], str):
                    # ... list of strings
                    return True
                elif isinstance(t[0], (list, tuple)):
                    # ... list with an empty list or with a list of strings
                    return len(t[0]) == 0 or isinstance(t[0][0], str)
                else:
                    return False
            else:
                return False

        if not _is_valid_text_input(text):
            raise ValueError(
                "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )

        if text_pair is not None and not _is_valid_text_input(text_pair):
            raise ValueError(
                "text pair input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )

        if is_split_into_words:
            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
        else:
            is_batched = isinstance(text, (list, tuple))

        if is_batched:
            if isinstance(text_pair, str):
                raise TypeError(
                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
                    " `text`."
                )
            if text_pair is not None and len(text) != len(text_pair):
                raise ValueError(
                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
                    f" {len(text_pair)}."
                )
            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
            return self.batch_encode_plus(
                batch_text_or_text_pairs=batch_text_or_text_pairs,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )
        else:
            return self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in `encode_plus`).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
        `self.pad_token_id` and `self.pad_token_type_id`).

        Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
        text followed by a call to the `pad` method to get a padded encoding.

        <Tip>

        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
                the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
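
        Example (a minimal sketch of using `pad` as a collate function; the feature dicts below are toy values and
        `return_tensors="pt"` assumes PyTorch is installed):

        ```python
        >>> from transformers import AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> features = [{"input_ids": [101, 7592, 102]}, {"input_ids": [101, 7592, 2088, 999, 102]}]
        >>> # Pads every entry to the length of the longest one and adds the matching attention masks
        >>> batch = tokenizer.pad(features, padding=True, return_tensors="pt")
        ```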
        """
        if self.__class__.__name__.endswith("Fast"):
            if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
                logger.warning_advice(
                    f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
                    " using the `__call__` method is faster than using a method to encode the text followed by a call"
                    " to the `pad` method to get a padded encoding."
                )
                self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

        # If we have a list of dicts, let's convert it to a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method "
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if not required_input:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            for item in required_input:
                if len(item) != 0:
                    first_element = item[0]
                    break
        # At this stage, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_available() and _is_tensorflow(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_available() and _is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    "Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

        # Convert padding_strategy to a PaddingStrategy member
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
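        # Pad each example to `max_length` individually, then regroup the outputs key by key so that
        # every list has a uniform length and can be converted to tensors by `BatchEncoding` below.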
        for i in range(batch_size):
            inputs = {k: v[i] for k, v in encoded_inputs.items()}
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The token type ids.
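
        Example (a minimal sketch of this base implementation, where `tokenizer` stands for any instantiated
        tokenizer; model-specific tokenizers override this method to account for their special tokens):

        ```python
        tokenizer.create_token_type_ids_from_sequences([11, 12], [21, 22, 23])
        # -> [0, 0, 1, 1, 1] with this default implementation
        ```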
        """
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The model input with special tokens.
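
        Example (illustrative of this default implementation, which simply concatenates the sequences;
        `tokenizer` stands for any instantiated tokenizer that does not override this method):

        ```python
        tokenizer.build_inputs_with_special_tokens([1, 2], [3, 4])
        # -> [1, 2, 3, 4]
        ```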
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with a user-defined stride) for overflowing tokens. Please note that for *pair_ids*
        different from `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return
        overflowing tokens; such a combination of arguments will raise an error.

        Args:
            ids (`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
                `convert_tokens_to_ids` methods.
            pair_ids (`List[int]`, *optional*):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
                and `convert_tokens_to_ids` methods.
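
        Example (a minimal sketch; assumes `tokenizer` is an instantiated tokenizer and that the ids were
        produced with its `tokenize` and `convert_tokens_to_ids` methods):

        ```python
        ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
        encoded = tokenizer.prepare_for_model(ids, add_special_tokens=True)
        # encoded["input_ids"] now contains `ids` wrapped with the model's special tokens
        ```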
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0

        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        if (
            return_overflowing_tokens
            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
            and pair_ids is not None
        ):
            raise ValueError(
                "Not possible to return overflowing tokens for pair of sequences with the "
                "`longest_first`. Please select another truncation strategy than `longest_first`, "
                "for instance `only_second` or `only_first`."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation_strategy,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

        # Build output dictionary
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Check lengths
        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

        # Padding
        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding_strategy.value,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )

        return batch_outputs

    def truncate_sequences(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ) -> Tuple[List[int], List[int], List[int]]:
        """
        Truncates a sequence pair in-place following the strategy.

        Args:
            ids (`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
                `convert_tokens_to_ids` methods.
            pair_ids (`List[int]`, *optional*):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
                and `convert_tokens_to_ids` methods.
            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
                The strategy to follow for truncation. Can be:

                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will truncate
                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
                  batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'do_not_truncate'`: No truncation (i.e., can output batch with sequence lengths greater
                  than the model maximum admissible input size).
            stride (`int`, *optional*, defaults to 0):
                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
            overflowing tokens. Note: The *longest_first* strategy returns an empty list of overflowing tokens if a pair
            of sequences (or a batch of pairs) is provided.
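
        Example (illustrative, assuming an instantiated `tokenizer` with the default `truncation_side="right"`):

        ```python
        ids, pair_ids, overflowing = tokenizer.truncate_sequences(
            [1, 2, 3, 4, 5], num_tokens_to_remove=2, truncation_strategy="longest_first"
        )
        # ids -> [1, 2, 3], pair_ids -> None, overflowing -> [4, 5]
        ```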
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []

        if not isinstance(truncation_strategy, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation_strategy)

        overflowing_tokens = []
        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
        ):
            if len(ids) > num_tokens_to_remove:
                window_len = min(len(ids), stride + num_tokens_to_remove)
                if self.truncation_side == "left":
                    overflowing_tokens = ids[:window_len]
                    ids = ids[num_tokens_to_remove:]
                elif self.truncation_side == "right":
                    overflowing_tokens = ids[-window_len:]
                    ids = ids[:-num_tokens_to_remove]
                else:
                    raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")

            else:
                error_msg = (
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the first sequence has a length {len(ids)}. "
                )
                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
                    error_msg = (
                        error_msg
                        + "Please select another truncation strategy than "
                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
                    )
                logger.error(error_msg)
        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
            logger.warning(
                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
                "truncation strategy. So the returned list will always be empty even if some "
                "tokens have been removed."
            )
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    if self.truncation_side == "right":
                        ids = ids[:-1]
                    elif self.truncation_side == "left":
                        ids = ids[1:]
                    else:
                        raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
                else:
                    if self.truncation_side == "right":
                        pair_ids = pair_ids[:-1]
                    elif self.truncation_side == "left":
                        pair_ids = pair_ids[1:]
                    else:
                        raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
            if len(pair_ids) > num_tokens_to_remove:
                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
                if self.truncation_side == "right":
                    overflowing_tokens = pair_ids[-window_len:]
                    pair_ids = pair_ids[:-num_tokens_to_remove]
                elif self.truncation_side == "left":
                    overflowing_tokens = pair_ids[:window_len]
                    pair_ids = pair_ids[num_tokens_to_remove:]
                else:
                    raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the second sequence has a length {len(pair_ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    "for instance 'longest_first' or 'only_first'."
                )

        return (ids, pair_ids, overflowing_tokens)

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding side is defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
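
        Example (a minimal sketch of right padding on a single encoding; assumes the default
        `padding_side="right"`, that `input_ids` is the first entry of `self.model_input_names`, that
        `attention_mask` is among them, and that `pad_token_id` is 0):

        ```python
        tokenizer._pad({"input_ids": [5, 6, 7]}, max_length=5, padding_strategy=PaddingStrategy.MAX_LENGTH)
        # -> {"input_ids": [5, 6, 7, 0, 0], "attention_mask": [1, 1, 1, 0, 0]}
        ```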
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))

        return encoded_inputs

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens into a single string. The simplest way to do it is `" ".join(tokens)` but we
        often want to remove sub-word tokenization artifacts at the same time.

        Args:
            tokens (`List[str]`): The tokens to join into a string.

        Returns:
            `str`: The joined tokens.
        """
        raise NotImplementedError

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                Whether or not to clean up the tokenization spaces.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
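
        Example (illustrative; `generated_ids` stands for any batch of token ids, e.g. the output of a
        generation call, and the exact strings depend on the tokenizer's vocabulary):

        ```python
        texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```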
        """
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]

    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                Whether or not to clean up the tokenization spaces.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
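
        Example (a minimal sketch with an instantiated `tokenizer`; the round-trip is only exact for
        tokenizers whose normalization is lossless):

        ```python
        ids = tokenizer.encode("Hello world")
        text = tokenizer.decode(ids, skip_special_tokens=True)
        ```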
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)

        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        raise NotImplementedError

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
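
        Example (illustrative; assumes `input_ids` already includes the model's special tokens, as required by
        this base implementation):

        ```python
        mask = tokenizer.get_special_tokens_mask(input_ids, already_has_special_tokens=True)
        # e.g. [1, 0, ..., 0, 1] for a single sequence wrapped in a BOS/CLS and an EOS/SEP token
        ```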
        """
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument. "
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """
        Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.

        Args:
            out_string (`str`): The text to clean up.

        Returns:
            `str`: The cleaned-up string.
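
        Example (using only the replacement rules applied below):

        ```python
        PreTrainedTokenizerBase.clean_up_tokenization("Do n't worry , it 's fine !")
        # -> "Don't worry, it's fine!"
        ```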
        """
        out_string = (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )
        return out_string

    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        """
        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
        corresponding model

        Args:
            ids (`List[int]`): The ids produced by the tokenization
            max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
            verbose (`bool`): Whether or not to print more information and warnings.

        """
        if max_length is None and len(ids) > self.model_max_length and verbose:
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors"
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

    def _switch_to_input_mode(self):
        """
        Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
        """
        pass

    def _switch_to_target_mode(self):
        """
        Private method to put the tokenizer in target mode (when it has different modes for input/outputs)
        """
        pass

    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
        sequence-to-sequence models that need a slightly different processing for the labels.
        """
        warnings.warn(
            "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
            "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
            "your input texts if you use the same keyword arguments, or in a separate call."
        )
        self._switch_to_target_mode()
        self._in_target_context_manager = True
        yield
        self._in_target_context_manager = False
        self._switch_to_input_mode()

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """
        Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
        library are already mapped with `AutoTokenizer`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
                The auto class to register this new tokenizer with.
        """
        if not isinstance(auto_class, str):
            auto_class = auto_class.__name__

        import transformers.models.auto as auto_module

        if not hasattr(auto_module, auto_class):
            raise ValueError(f"{auto_class} is not a valid auto class.")

        cls._auto_class = auto_class

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: Optional[str] = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepare model inputs for translation. For best performance, translate one sentence at a time.

        Arguments:
            src_texts (`List[str]`):
                List of documents to summarize or source language texts.
            tgt_texts (`list`, *optional*):
                List of summaries or target language texts.
            max_length (`int`, *optional*):
                Controls the maximum length for encoder inputs (documents to summarize or source language texts) If
                left unset or set to `None`, this will use the predefined model maximum length if a maximum length is
                required by one of the truncation/padding parameters. If the model has no specific maximum input length
                (like XLNet) truncation/padding to a maximum length will be deactivated.
            max_target_length (`int`, *optional*):
                Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
                to `None`, this will use the max_length value.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `True`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            **kwargs:
                Additional keyword arguments passed along to `self.__call__`.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to the encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

            The full set of keys `[input_ids, attention_mask, labels]` will only be returned if tgt_texts is passed.
            Otherwise, input_ids and attention_mask will be the only keys.
        """
        # docstyle-ignore
        formatted_warning = """
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
"""
        warnings.warn(formatted_warning, FutureWarning)
        # mBART-specific kwargs that should be ignored by other models.
        kwargs.pop("src_lang", None)
        kwargs.pop("tgt_lang", None)
        if max_length is None:
            max_length = self.model_max_length
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs
        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        with self.as_target_tokenizer():
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs


def get_fast_tokenizer_file(tokenization_files: List[str]) -> str:
    """
    Get the tokenization file to use for this version of transformers.

    Args:
        tokenization_files (`List[str]`): The list of available tokenizer files.

    Returns:
        `str`: The tokenization file to use.
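
    Example (illustrative; assumes the versioned naming scheme `tokenizer.<version>.json` matched by
    `_re_tokenizer_file`, and an installed `transformers` version of at least 4.0.0):

    ```python
    get_fast_tokenizer_file(["tokenizer.json", "tokenizer.4.0.0.json"])
    # -> "tokenizer.4.0.0.json"; an older transformers version would fall back to "tokenizer.json"
    ```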
    """
    tokenizer_files_map = {}
    for file_name in tokenization_files:
        search = _re_tokenizer_file.search(file_name)
        if search is not None:
            v = search.groups()[0]
            tokenizer_files_map[v] = file_name
    available_versions = sorted(tokenizer_files_map.keys(), key=version.parse)

    # Defaults to FULL_TOKENIZER_FILE and then tries to look at some newer versions.
    tokenizer_file = FULL_TOKENIZER_FILE
    transformers_version = version.parse(__version__)
    for v in available_versions:
        if version.parse(v) <= transformers_version:
            tokenizer_file = tokenizer_files_map[v]
        else:
            # No point going further since the versions are sorted.
            break

    return tokenizer_file


# To update the docstring, we need to copy the method, otherwise we change the original docstring.
PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
    object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
)