tokenization_utils_base.py 178 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Sylvain Gugger's avatar
Sylvain Gugger committed
15
16
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
17
fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
Sylvain Gugger's avatar
Sylvain Gugger committed
18
of output with special method for the Fast tokenizers)
19
20
21
22
23
"""

import copy
import json
import os
24
import re
25
import warnings
26
from collections import OrderedDict, UserDict
27
from collections.abc import Mapping, Sized
28
from contextlib import contextmanager
29
from dataclasses import dataclass, field
Sylvain Gugger's avatar
Sylvain Gugger committed
30
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
31
32

import numpy as np
33
from packaging import version
34

35
from . import __version__
36
from .dynamic_module_utils import custom_object_save
37
from .utils import (
38
39
    ExplicitEnum,
    PaddingStrategy,
Sylvain Gugger's avatar
Sylvain Gugger committed
40
    PushToHubMixin,
41
    TensorType,
42
    add_end_docstrings,
43
    add_model_info_to_auto_map,
44
    cached_file,
45
    copy_func,
46
    download_url,
47
    extract_commit_hash,
48
    is_flax_available,
49
50
    is_jax_tensor,
    is_numpy_array,
51
    is_offline_mode,
52
    is_remote_url,
53
    is_tf_available,
54
    is_tf_tensor,
55
    is_tokenizers_available,
56
    is_torch_available,
57
58
    is_torch_device,
    is_torch_tensor,
59
    logging,
60
    requires_backends,
61
    to_py_obj,
62
63
64
)


Sylvain Gugger's avatar
Sylvain Gugger committed
65
66
67
68
69
70
71
72
if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf
    if is_flax_available():
        import jax.numpy as jnp  # noqa: F401

73
74
75
76
77
78
79
80

if is_tokenizers_available():
    # The fast (Rust-backed) `tokenizers` library is installed: use its real classes.
    from tokenizers import AddedToken
    from tokenizers import Encoding as EncodingFast
else:
    # Pure-Python stand-ins so the rest of this module can reference the same names.

    @dataclass(frozen=True, eq=True)
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
        way it should behave.
        """

        # Field names and defaults mirror `tokenizers.AddedToken` — TODO confirm against the
        # installed `tokenizers` version when bumping that dependency.
        content: str = field(default_factory=str)
        single_word: bool = False
        lstrip: bool = False
        rstrip: bool = False
        normalized: bool = True

        def __getstate__(self):
            # Pickle the plain attribute dict of the frozen dataclass.
            return self.__dict__

    @dataclass
    class EncodingFast:
        """This is dummy class because without the `tokenizers` library we don't have these objects anyway"""

        pass

101

Lysandre Debut's avatar
Lysandre Debut committed
102
logger = logging.get_logger(__name__)
103
104
105
106
107
108
109
110
111
112
113
114
115

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]


116
# Slow tokenizers used to be saved in three separated files
117
118
119
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
120
121

# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
122
FULL_TOKENIZER_FILE = "tokenizer.json"
123
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
124
125
126


class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in
    an IDE.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (`int`): Index of the first character in the original string.
        end (`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (`int`): Index of the first token in the span.
        end (`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int


class BatchEncoding(UserDict):
Sylvain Gugger's avatar
Sylvain Gugger committed
165
    """
166
167
168
    Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
    [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and
    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).
Sylvain Gugger's avatar
Sylvain Gugger committed
169
170
171
172
173

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
174
        data (`dict`):
175
176
            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
            ('input_ids', 'attention_mask', etc.).
177
        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
178
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
179
            space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this
180
            information.
181
        tensor_type (`Union[None, str, TensorType]`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
182
183
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
184
185
186
        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
        n_sequences (`Optional[int]`, *optional*):
187
188
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
189
190
191
192
193
194
195
196
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

        # Normalize a single encoding into a one-element list so downstream code can
        # always index encodings by batch item.
        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        self._encodings = encoding

        # When not given explicitly, infer the number of sequences per sample from the
        # first fast-tokenizer encoding (if any).
        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        self._n_sequences = n_sequences

        # Eagerly convert the stored lists to the requested tensor framework
        # (no modification is done when tensor_type is None).
        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

213
214
215
    @property
    def n_sequences(self) -> Optional[int]:
        """
        `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
        sentences)
        """
        return self._n_sequences
221

222
    @property
    def is_fast(self) -> bool:
        """
        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
        or not.
        """
        # Fast tokenizers are the only ones that attach `tokenizers.Encoding` objects.
        return self._encodings is not None

Sylvain Gugger's avatar
Sylvain Gugger committed
230
231
    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        Index into the batch encoding.

        - `str` key: return the field of the underlying dict associated to `item`
          ('input_ids', 'attention_mask', etc.).
        - `int` key: return the `tokenizers.Encoding` of the batch item at that index.
        - `slice` key: return a dict with every field restricted to that slice (only
          reached when there are no fast-tokenizer encodings, since non-string keys
          are routed to the encodings first).
        """
        if isinstance(item, str):
            return self.data[item]
        if self._encodings is not None:
            return self._encodings[item]
        if isinstance(item, slice):
            return {key: values[item] for key, values in self.data.items()}
        raise KeyError(
            "Invalid key. Only three types of key are available: "
            "(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
        )

    def __getattr__(self, item: str):
        # Expose dict entries as attributes (e.g. `encoding.input_ids`); raising
        # AttributeError (not KeyError) keeps hasattr()/getattr() semantics intact.
        if item in self.data:
            return self.data[item]
        raise AttributeError

258
259
260
261
262
263
264
265
266
267
    def __getstate__(self):
        # Serialize only the underlying dict and the fast-tokenizer encodings.
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        # Restore only the entries serialized by __getstate__; missing keys leave the
        # corresponding attribute untouched, mirroring the original conditional logic.
        for attr, key in (("data", "data"), ("_encodings", "encodings")):
            if key in state:
                setattr(self, attr, state[key])

268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
    def keys(self):
        # Delegate to the underlying data dict ('input_ids', 'attention_mask', etc.).
        return self.data.keys()

    def values(self):
        # Delegate to the underlying data dict.
        return self.data.values()

    def items(self):
        # Delegate to the underlying data dict.
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if
        the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

289
    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
        to integer indices) at a given batch index. Only available for the output of a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[str]`: The list of tokens at that index.

        Raises:
            ValueError: If this encoding was produced by a slow (pure Python) tokenizer.
        """
        if self._encodings:
            return self._encodings[batch_index].tokens
        raise ValueError(
            "tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
            " class)."
        )

307
308
309
310
    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping each token to the id of its original sentence:

            - `None` for special tokens added around or between sequences,
            - `0` for tokens corresponding to words in the first sequence,
            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
            by the tokenizer are mapped to `None`; other tokens are mapped to the index of their sequence.
        """
        if self._encodings:
            return self._encodings[batch_index].sequence_ids
        raise ValueError(
            "sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
            " class)."
        )

331
    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError(
                "words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        # Deprecated alias: warn, then delegate to the identically-behaved word_ids().
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        return self.word_ids(batch_index)

    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if self._encodings:
            return self._encodings[batch_index].word_ids
        raise ValueError(
            "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
            " class)."
        )

    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token: in the general use case this is `0` for a single
        sequence or the first sequence of a pair, and `1` for the second sequence of a pair.

        Can be called as:

        - `self.token_to_sequence(token_index)` if batch size is 1
        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the sequence the token belongs to.
        """
        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        if token_index is None:
            # Single-sample call: the sole argument is actually the token index.
            batch_index, token_index = 0, batch_or_token_index
        else:
            batch_index = batch_or_token_index
        # NOTE(review): `_batch_size` / `_seq_len` are not assigned anywhere visible in this
        # class, so negative indices presumably raise AttributeError — confirm upstream.
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_sequence(token_index)
412
413
414

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

        Can be called as:

        - `self.token_to_word(token_index)` if batch size is 1
        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the word in the input sequence.
        """
        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is None:
            # Single-sample call: the sole argument is actually the token index.
            batch_index, token_index = 0, batch_or_token_index
        else:
            batch_index = batch_or_token_index
        # NOTE(review): `_batch_size` / `_seq_len` are not assigned anywhere visible in this
        # class, so negative indices presumably raise AttributeError — confirm upstream.
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

451
452
453
    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal
          to 1

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            ([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns
            `None` if no tokens correspond to the word (e.g. for special tokens inserted by the tokenizer, such as a
            class token at the very beginning of the tokenization).
        """
        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is None:
            # Single-sample call: the sole positional argument is actually the word index.
            batch_index, word_index = 0, batch_or_word_index
        else:
            batch_index = batch_or_word_index
        # NOTE(review): `_batch_size` / `_seq_len` are not assigned anywhere visible in this
        # class, so negative indices presumably raise AttributeError — confirm upstream.
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return None if span is None else TokenSpan(*span)
503
504
505
506
507

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - `self.token_to_chars(token_index)` if batch size is 1
        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens
                in the sequence.

        Returns:
            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token
            (e.g. <s>, </s>) doesn't correspond to any chars in the origin string.
        """
        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is None:
            # Single-sample call: the sole argument is actually the token index.
            batch_index, token_index = 0, batch_or_token_index
        else:
            batch_index = batch_or_token_index
        span_indices = self._encodings[batch_index].token_to_chars(token_index)
        return None if span_indices is None else CharSpan(*span_indices)
542

543
544
545
    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - `self.char_to_token(char_index)` if batch size is 1
        - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the word in the sequence
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.

        Returns:
            `int`: Index of the token.
        """
        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is None:
            # Single-sample call: the sole positional argument is actually the character index.
            batch_index, char_index = 0, batch_or_char_index
        else:
            batch_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)
583

584
585
586
    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - `self.word_to_chars(word_index)` if batch size is 1
        - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the word in the sequence
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_word_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
            are NamedTuple with:

                - start: index of the first character associated to the token in the original string
                - end: index of the character following the last character associated to the token in the original
                  string
        """
        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is None:
            # Single-sample call: the sole positional argument is actually the word index.
            batch_index, word_index = 0, batch_or_word_index
        else:
            batch_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
628

629
    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.

        Can be called as:

        - `self.char_to_word(char_index)` if batch size is 1
        - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the character in the original string.
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_char_index*, this can be the index of the character in the
                original string.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.

        Returns:
            `int` or `List[int]`: Index or indices of the associated encoded token(s).
        """
        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is None:
            # Single-sample call: the sole positional argument is actually the character index.
            batch_index, char_index = 0, batch_or_char_index
        else:
            batch_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index, sequence_index)
667

Sylvain Gugger's avatar
Sylvain Gugger committed
668
669
670
671
672
673
674
    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
            prepend_batch_axis (`int`, *optional*, defaults to `False`):
                Whether or not to add the batch dimension during the conversion.
        """
        if tensor_type is None:
            return self

        # Convert to TensorType (accepts the enum's string values as well)
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework. Backends are imported lazily so
        # that only the requested framework needs to be installed.
        if tensor_type == TensorType.TENSORFLOW:
            if not is_tf_available():
                raise ImportError(
                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
                )
            import tensorflow as tf

            as_tensor = tf.constant
            is_tensor = tf.is_tensor
        elif tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        elif tensor_type == TensorType.JAX:
            if not is_flax_available():
                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
            import jax.numpy as jnp  # noqa: F811

            as_tensor = jnp.array
            is_tensor = is_jax_tensor
        else:
            # NumPy fallback: also handles ragged (unequal-length) nested lists explicitly,
            # producing an object-dtype array instead of relying on deprecated implicit behavior.
            def as_tensor(value, dtype=None):
                if isinstance(value, (list, tuple)) and isinstance(value[0], (list, tuple, np.ndarray)):
                    value_lens = [len(val) for val in value]
                    if len(set(value_lens)) > 1 and dtype is None:
                        # we have a ragged list so handle explicitly
                        value = as_tensor([np.asarray(val) for val in value], dtype=object)
                return np.asarray(value, dtype=dtype)

            is_tensor = is_numpy_array

        # Do the tensor conversion in batch; values that are already tensors are left untouched.
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)

                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
                    # # at-least2d
                    # if tensor.ndim > 2:
                    #     tensor = tensor.squeeze(0)
                    # elif tensor.ndim < 2:
                    #     tensor = tensor[None, :]

                    self[key] = tensor
            except Exception as e:
                # Conversion fails when sequences have different lengths; surface an actionable
                # message instead of the raw framework error.
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    ) from e
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding with"
                    " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
                    f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
                    " expected)."
                ) from e

        return self

756
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling `v.to(device)` (PyTorch only).

        Args:
            device (`str` or `torch.device`): The device to put the tensors on.

        Returns:
            [`BatchEncoding`]: The same instance after modification.
        """
        requires_backends(self, ["torch"])

        # Only move the data for genuine device specifiers. This check catches things like APEX
        # blindly calling "to" on all inputs to a module; otherwise it passes the casts down and
        # casts the LongTensor containing the token idxs into a HalfTensor.
        is_device_spec = isinstance(device, (str, int)) or is_torch_device(device)
        if is_device_spec:
            moved = {}
            for key, tensor in self.data.items():
                moved[key] = tensor.to(device=device)
            self.data = moved
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class SpecialTokensMixin:
Sylvain Gugger's avatar
Sylvain Gugger committed
779
    """
Sylvain Gugger's avatar
Sylvain Gugger committed
780
781
782
    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
    special tokens. In particular, this class hold the attributes which can be used to directly access these special
    tokens in a model-independent manner and allow to set and update the special tokens.
Sylvain Gugger's avatar
Sylvain Gugger committed
783
784

    Args:
785
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
786
            A special token representing the beginning of a sentence.
787
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
788
            A special token representing the end of a sentence.
789
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
790
            A special token representing an out-of-vocabulary token.
791
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
792
            A special token separating two different sentences in the same input (used by BERT for instance).
793
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
794
795
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
796
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
797
            A special token representing the class of the input (used by BERT for instance).
798
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
799
800
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
801
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
802
            A tuple or a list of additional special tokens.
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
    """

    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]

    def __init__(self, verbose=True, **kwargs):
        """
        Initialize the special-token slots and register any tokens passed as keyword arguments.

        Args:
            verbose (`bool`, *optional*, defaults to `True`):
                Whether to log warnings/errors when unset special tokens are accessed.
            kwargs:
                Special tokens keyed by the names in `SPECIAL_TOKENS_ATTRIBUTES` (e.g. `bos_token="<s>"`).
                `None` values are ignored; unknown keys are silently dropped.

        Raises:
            TypeError: If a special-token value is not a `str`/`AddedToken` (or, for
                `additional_special_tokens`, not a list/tuple of them).
        """
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose

        # We directly set the hidden value to allow initialization with special tokens
        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
        # TODO clean this up at some point (probably by switching to fast tokenizers)
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    # Explicit raises instead of `assert` so validation survives `python -O`.
                    if not isinstance(value, (list, tuple)):
                        raise TypeError(f"Value {value} is not a list or tuple")
                    if not all(isinstance(t, (str, AddedToken)) for t in value):
                        raise TypeError("One of the tokens is not a string or an AddedToken")
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")
845

846
    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
        `tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            `int`: The number of tokens added in the vocabulary during the operation.
        """
        # `all_special_tokens_extended` keeps AddedToken instances intact so their
        # stripping/normalization options are preserved when registering them.
        special_tokens = self.all_special_tokens_extended
        return self.add_tokens(special_tokens, special_tokens=True)
857

858
859
860
    def add_special_tokens(
        self, special_tokens_dict: Dict[str, Union[str, AddedToken]], replace_additional_special_tokens=True
    ) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        Note: When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
        matrix of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Using `add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
        `'</s>'`).

        Args:
            special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assign the index of the `unk_token` to them).
            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
                If `True`, the existing list of additional special tokens will be replaced by the one specified in
                `special_tokens_dict`. Otherwise, `self._additional_special_tokens` is updated. In the former case, the
                tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged as
                non-special tokens.

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Raises:
            ValueError: If a key is not one of the predefined special-token attributes.
            TypeError: If a value is not a `str`/`AddedToken` (or a list/tuple of them for
                `additional_special_tokens`).

        Examples:

        ```python
        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2Model.from_pretrained("gpt2")

        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

        assert tokenizer.cls_token == "<CLS>"
        ```"""
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            # Explicit raises instead of `assert` so validation survives `python -O`.
            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
                raise ValueError(f"Key {key} is not a special token")

            if self.verbose:
                logger.info(f"Assigning {value} to the {key} key of the tokenizer")

            if key == "additional_special_tokens":
                if not isinstance(value, (list, tuple)) or not all(
                    isinstance(t, (str, AddedToken)) for t in value
                ):
                    raise TypeError(f"Tokens {value} for key {key} should all be str or AddedToken instances")

                if replace_additional_special_tokens:
                    setattr(self, key, value)
                else:
                    # This is a copy of `self._additional_special_tokens`
                    additional_special_tokens = getattr(self, key)
                    additional_special_tokens_set = set(additional_special_tokens)
                    to_add = []
                    for token in value:
                        if str(token) not in additional_special_tokens_set and str(token) not in to_add:
                            to_add.append(token)
                    # update the property
                    additional_special_tokens.extend(to_add)
                    self.additional_special_tokens = additional_special_tokens

                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                if not isinstance(value, (str, AddedToken)):
                    raise TypeError(f"Token {value} for key {key} should be a str or an AddedToken instance")
                setattr(self, key, value)
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens

Sylvain Gugger's avatar
Sylvain Gugger committed
952
953
954
    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary and and will be isolated before the tokenization
        algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
        not treated in the same way.

        Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
        of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Args:
            new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
                token to let you personalize its behavior: whether this token should only match against a single word,
                whether this token should strip all potential whitespaces on the left side, whether this token should
                strip all potential whitespaces on the right side, etc.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Can be used to specify if the token is a special token. This mostly change the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```"""
        # Nothing to do for an empty/None request.
        if not new_tokens:
            return 0

        # A bare str/AddedToken is normalized to a one-element list; sequences pass through as-is.
        wrapped = new_tokens if isinstance(new_tokens, (list, tuple)) else [new_tokens]
        # The actual vocabulary update is implemented by the slow/fast subclasses.
        return self._add_tokens(wrapped, special_tokens=special_tokens)
1000

1001
1002
1003
    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        # Framework hook: the actual vocabulary update is implemented by the slow
        # (`PreTrainedTokenizer`) and fast (`PreTrainedTokenizerFast`) subclasses.
        raise NotImplementedError

1004
    # --- Special-token string accessors -------------------------------------------------------
    # Each getter logs an error (when `self.verbose`) and returns None if the token is unset;
    # otherwise it returns the `str` form of the stored value (which may be an `AddedToken`).
    @property
    def bos_token(self) -> str:
        """
        `str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None:
            if self.verbose:
                logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        `str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None:
            if self.verbose:
                logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        `str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None:
            if self.verbose:
                logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
        having been set.
        """
        if self._sep_token is None:
            if self.verbose:
                logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        `str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None:
            if self.verbose:
                logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
        depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None:
            if self.verbose:
                logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.
        """
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
        set.
        """
        if self._additional_special_tokens is None:
            if self.verbose:
                logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128

    # --- Special-token setters ----------------------------------------------------------------
    # Setters let callers override special tokens after init (e.g. `tokenizer.pad_token = "<pad>"`).
    # The raw value (str or AddedToken) is stored; the matching getters normalize to `str`.
    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    # --- Special-token id accessors -----------------------------------------------------------
    # Each getter resolves the corresponding token string to its vocabulary id via
    # `convert_tokens_to_ids`, returning None when the token is unset.
    @property
    def bos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not
        been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns `None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        `int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
        leveraging self-attention along the full depth of the model.

        Returns `None` if the token has not been set.
        """
        if self._cls_token is None:
            return None
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns `None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
        been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

1213
1214
    @bos_token_id.setter
    def bos_token_id(self, value):
1215
        self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None
1216
1217
1218

    @eos_token_id.setter
    def eos_token_id(self, value):
1219
        self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None
1220
1221
1222

    @unk_token_id.setter
    def unk_token_id(self, value):
1223
        self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None
1224
1225
1226

    @sep_token_id.setter
    def sep_token_id(self, value):
1227
        self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None
1228
1229
1230

    @pad_token_id.setter
    def pad_token_id(self, value):
1231
        self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None
1232
1233
1234

    @cls_token_id.setter
    def cls_token_id(self, value):
        # Store the token string for the given id; `None` clears the token.
        if value is None:
            self._cls_token = None
        else:
            self._cls_token = self.convert_ids_to_tokens(value)
1236
1237
1238

    @mask_token_id.setter
    def mask_token_id(self, value):
        # Store the token string for the given id; `None` clears the token.
        if value is None:
            self._mask_token = None
        else:
            self._mask_token = self.convert_ids_to_tokens(value)
1240
1241
1242

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        # Convert every id back to its token string before storing.
        self._additional_special_tokens = [self.convert_ids_to_tokens(token_id) for token_id in values]
1244

1245
    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
        `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Convert potential tokens of `tokenizers.AddedToken` type to string.
        """
        mapping = {}
        for name in self.SPECIAL_TOKENS_ATTRIBUTES:
            stored = getattr(self, "_" + name)
            # Unset (None) or empty values are simply omitted from the map.
            if not stored:
                continue
            if isinstance(stored, (list, tuple)):
                # Preserve the container type while stringifying each entry.
                mapping[name] = type(stored)(str(item) for item in stored)
            else:
                mapping[name] = str(stored)
        return mapping

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping
        special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        # Unset (None) or empty values are omitted; everything else is kept as-is.
        candidates = ((name, getattr(self, "_" + name)) for name in self.SPECIAL_TOKENS_ATTRIBUTES)
        return {name: value for name, value in candidates if value}

    @property
    def all_special_tokens(self) -> List[str]:
        """
        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of `tokenizers.AddedToken` type to string.
        """
        return [str(token) for token in self.all_special_tokens_extended]

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
        attributes.

        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
        special tokens are tokenized.
        """
        collected = []
        for value in self.special_tokens_map_extended.values():
            # `additional_special_tokens` holds a list/tuple; all other attributes hold a single token.
            if isinstance(value, (list, tuple)):
                collected.extend(value)
            else:
                collected.append(value)
        # Deduplicate while keeping first-seen order.
        return list(OrderedDict.fromkeys(collected))

    @property
    def all_special_ids(self) -> List[int]:
        """
        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        """
        # Convert the (stringified) special tokens into their vocabulary ids.
        return self.convert_tokens_to_ids(self.all_special_tokens)


# Shared docstring fragment describing the encoding kwargs, injected into the encode/encode_plus methods.
# Fix: the raw-string literal was corrupted by interleaved revision-metadata lines; restored clean content.
ENCODE_KWARGS_DOCSTRING = r"""
            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to encode the sequences with the special tokens relative to their model.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            max_length (`int`, *optional*):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet) truncation/padding to a maximum length will be deactivated.
            stride (`int`, *optional*, defaults to 0):
                If set to a number along with `max_length`, the overflowing tokens returned when
                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                returned to provide some overlap between truncated and overflowing sequences. The value of this
                argument defines the number of overlapping tokens.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
"""

# Shared docstring fragment for the extra encode_plus kwargs and the BatchEncoding return contract.
# Fix: the raw-string literal was corrupted by interleaved revision-metadata lines; restored clean content.
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (`bool`, *optional*):
                Whether to return token type IDs. If left to the default, will return the token type IDs according to
                the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are token type IDs?](../glossary#token-type-ids)
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
                of returning overflowing tokens.
            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                Whether or not to return special tokens mask information.
            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
                Whether or not to return `(char_start, char_end)` for each token.

                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
                Python's tokenizer, this method will raise `NotImplementedError`.
            return_length  (`bool`, *optional*, defaults to `False`):
                Whether or not to return the lengths of the encoded inputs.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
            **kwargs: passed to the `self.tokenize()` method

        Return:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.

              [What are input IDs?](../glossary#input-ids)

            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
              if *"token_type_ids"* is in `self.model_input_names`).

              [What are token type IDs?](../glossary#token-type-ids)

            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

              [What are attention masks?](../glossary#attention-mask)

            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
            - **length** -- The length of the inputs (when `return_length=True`)
"""
1423

Sylvain Gugger's avatar
Sylvain Gugger committed
1424
1425
# Docstring appended to PreTrainedTokenizerBase via @add_end_docstrings, describing class attributes and
# __init__ kwargs. Fix: the literal was corrupted by interleaved revision-metadata lines; restored clean content.
INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)

        - **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
          vocabulary file required by the model, and as associated values, the filename for saving the associated file
          (string).
        - **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
          low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
          associated pretrained vocabulary file.
        - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
          of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
          or `None` if the model has no maximum input size.
        - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
          pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
          with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] method.
        - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
          Should be `'right'` or `'left'`.
        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
          applied. Should be `'right'` or `'left'`.

    Args:
        model_max_length (`int`, *optional*):
            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
            loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
            value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
            default to VERY_LARGE_INTEGER (`int(1e30)`).
        padding_side (`str`, *optional*):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        truncation_side (`str`, *optional*):
            The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        model_input_names (`List[string]`, *optional*):
            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
            `"attention_mask"`). Default value is picked from the class attribute of the same name.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
            `self.bos_token_id`.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the end of a sentence. Will be associated to `self.eos_token` and
            `self.eos_token_id`.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
            `self.unk_token_id`.
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance). Will be
            associated to `self.sep_token` and `self.sep_token_id`.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`.
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the class of the input (used by BERT for instance). Will be associated to
            `self.cls_token` and `self.cls_token_id`.
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
            tokenization process. Will be associated to `self.additional_special_tokens` and
            `self.additional_special_tokens_ids`.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process.
"""


Sylvain Gugger's avatar
Sylvain Gugger committed
1493
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
Sylvain Gugger's avatar
Sylvain Gugger committed
1494
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
Sylvain Gugger's avatar
Sylvain Gugger committed
1495
    """
1496
    Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].
1497

Sylvain Gugger's avatar
Sylvain Gugger committed
1498
    Handles shared (mostly boiler plate) methods for those two classes.
1499
1500
1501
1502
1503
    """

    # Default class-level configuration; derived tokenizer classes override these.
    vocab_files_names: Dict[str, str] = {}
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    max_model_input_sizes: Dict[str, Optional[int]] = {}
    _auto_class: Optional[str] = None

    # first name has to correspond to main model input name
    # to make sure `tokenizer.pad(...)` works correctly
    model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
    padding_side: str = "right"
    truncation_side: str = "right"
    slow_tokenizer_class = None
1513

1514
1515
1516
    def __init__(self, **kwargs):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
        self.name_or_path = kwargs.pop("name_or_path", "")
        self._processor_class = kwargs.pop("processor_class", None)

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

        # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
        # is changed.
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        if self.padding_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
            )

        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
        if self.truncation_side not in ["right", "left"]:
            # Bug fix: this error message previously said "Padding side" even though it reports an
            # invalid *truncation* side.
            raise ValueError(
                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
            )

        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

        # By default, cleaning tokenization spaces for both fast and slow tokenizers
        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)

        # Use to store when we have already noticed a deprecation warning (avoid overlogging).
        self.deprecation_warnings = {}
        self._in_target_context_manager = False
        super().__init__(**kwargs)
1549
1550
1551

    @property
    def max_len_single_sentence(self) -> int:
        """
        `int`: The maximum length of a sentence that can be fed to the model.
        """
        # Room left after reserving space for the special tokens added around a single sequence.
        special = self.num_special_tokens_to_add(pair=False)
        return self.model_max_length - special

    @property
    def max_len_sentences_pair(self) -> int:
        """
        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
        """
        # Room left after reserving space for the special tokens added around a pair of sequences.
        special = self.num_special_tokens_to_add(pair=True)
        return self.model_max_length - special

    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
        expected = self.model_max_length - self.num_special_tokens_to_add(pair=False)
        if not (value == expected and self.verbose):
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )
        # Only log the deprecation once per tokenizer instance.
        if not self.deprecation_warnings.get("max_len_single_sentence", False):
            logger.warning(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )
        self.deprecation_warnings["max_len_single_sentence"] = True

    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
        expected = self.model_max_length - self.num_special_tokens_to_add(pair=True)
        if not (value == expected and self.verbose):
            raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
        # Only log the deprecation once per tokenizer instance.
        if not self.deprecation_warnings.get("max_len_sentences_pair", False):
            logger.warning(
                "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
            )
        self.deprecation_warnings["max_len_sentences_pair"] = True
1589

1590
1591
1592
1593
    def _set_processor_class(self, processor_class: str):
        """Sets processor class as an attribute.

        Args:
            processor_class (`str`):
                The processor class name to record on this tokenizer instance.
        """
        self._processor_class = processor_class

1594
1595
    def __repr__(self) -> str:
        """Human-readable summary of the tokenizer's key configuration."""
        pieces = [
            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',",
            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},",
            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',",
            f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})",
        ]
        return "".join(pieces)

1602
1603
1604
    def __len__(self) -> int:
        # Abstract here: subclasses must implement (presumably returning the vocabulary size
        # including added tokens -- confirm against the concrete tokenizer classes).
        raise NotImplementedError()

1605
1606
1607
1608
    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
        vocab.

        Returns:
            `Dict[str, int]`: The vocabulary.
        """
        # Abstract in the base class; concrete tokenizers provide the actual mapping.
        raise NotImplementedError()

1617
    @classmethod
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
    ):
1629
        r"""
Sylvain Gugger's avatar
Sylvain Gugger committed
1630
1631
        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
        tokenizer.
1632
1633

        Args:
1634
            pretrained_model_name_or_path (`str` or `os.PathLike`):
Sylvain Gugger's avatar
Sylvain Gugger committed
1635
1636
                Can be either:

1637
1638
1639
1640
                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
Sylvain Gugger's avatar
Sylvain Gugger committed
1641
1642
                  using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
Sylvain Gugger's avatar
Sylvain Gugger committed
1643
1644
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
1645
1646
                  `./my_model_directory/vocab.txt`.
            cache_dir (`str` or `os.PathLike`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
1647
1648
                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
1649
            force_download (`bool`, *optional*, defaults to `False`):
Sylvain Gugger's avatar
Sylvain Gugger committed
1650
1651
                Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
                exist.
1652
            resume_download (`bool`, *optional*, defaults to `False`):
Sylvain Gugger's avatar
Sylvain Gugger committed
1653
1654
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                exists.
1655
            proxies (`Dict[str, str]`, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
1656
1657
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1658
            token (`str` or *bool*, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
1659
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
1660
                when running `huggingface-cli login` (stored in `~/.huggingface`).
1661
            local_files_only (`bool`, *optional*, defaults to `False`):
1662
                Whether or not to only rely on local files and not to attempt to download any files.
1663
            revision (`str`, *optional*, defaults to `"main"`):
Julien Chaumond's avatar
Julien Chaumond committed
1664
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
1665
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
Julien Chaumond's avatar
Julien Chaumond committed
1666
                identifier allowed by git.
1667
            subfolder (`str`, *optional*):
1668
1669
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
1670
1671
1672
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__` method.
            kwargs (additional keyword arguments, *optional*):
Sylvain Gugger's avatar
Sylvain Gugger committed
1673
1674
1675
                Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
                `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__` for more details.
1676

1677
        <Tip>
1678

1679
        Passing `use_auth_token=True` is required when you want to use a private model.
1680

1681
        </Tip>
1682

1683
        Examples:
1684

1685
1686
1687
        ```python
        # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
        # Download vocabulary from huggingface.co and cache.
Sylvain Gugger's avatar
Sylvain Gugger committed
1688
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
1689

1690
        # Download vocabulary from huggingface.co (user-uploaded) and cache.
Sylvain Gugger's avatar
Sylvain Gugger committed
1691
        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
1692

1693
        # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
Sylvain Gugger's avatar
Sylvain Gugger committed
1694
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")
1695

1696
        # If the tokenizer uses a single vocabulary file, you can point directly to this file
Sylvain Gugger's avatar
Sylvain Gugger committed
1697
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")
1698

1699
        # You can link tokens to special vocabulary when instantiating
Sylvain Gugger's avatar
Sylvain Gugger committed
1700
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")
1701
1702
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
Sylvain Gugger's avatar
Sylvain Gugger committed
1703
        assert tokenizer.unk_token == "<unk>"
1704
        ```"""
1705
1706
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
1707
        use_auth_token = kwargs.pop("use_auth_token", None)
1708
        subfolder = kwargs.pop("subfolder", None)
1709
1710
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)
1711
        commit_hash = kwargs.pop("_commit_hash", None)
1712

1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
            )
            if token is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            token = use_auth_token

        if token is not None:
            # change to `token` in a follow-up PR
            kwargs["use_auth_token"] = token

1728
1729
1730
        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline
1731

1732
1733
1734
1735
        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True

1736
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
1737
1738
1739
        vocab_files = {}
        init_configuration = {}

1740
        is_local = os.path.isdir(pretrained_model_name_or_path)
1741
        single_file_id = None
1742
        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
1743
1744
1745
1746
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
1747
                )
1748
1749
1750
1751
1752
1753
            warnings.warn(
                f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
                "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
                FutureWarning,
            )
            file_id = list(cls.vocab_files_names.keys())[0]
1754

1755
            vocab_files[file_id] = pretrained_model_name_or_path
1756
            single_file_id = file_id
1757
1758
1759
1760
1761
1762
1763
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
            }
1764
            vocab_files = {**cls.vocab_files_names, **additional_files_names}
1765

1766
            if "tokenizer_file" in vocab_files:
1767
1768
                # Try to get the tokenizer config to see if there are versioned tokenizer files.
                fast_tokenizer_file = FULL_TOKENIZER_FILE
1769
                resolved_config_file = cached_file(
1770
1771
1772
1773
1774
1775
                    pretrained_model_name_or_path,
                    TOKENIZER_CONFIG_FILE,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=resume_download,
                    proxies=proxies,
1776
                    use_auth_token=token,
1777
1778
                    revision=revision,
                    local_files_only=local_files_only,
1779
                    subfolder=subfolder,
1780
1781
1782
1783
                    user_agent=user_agent,
                    _raise_exceptions_for_missing_entries=False,
                    _raise_exceptions_for_connection_errors=False,
                    _commit_hash=commit_hash,
1784
                )
1785
                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
1786
1787
1788
1789
1790
                if resolved_config_file is not None:
                    with open(resolved_config_file, encoding="utf-8") as reader:
                        tokenizer_config = json.load(reader)
                        if "fast_tokenizer_files" in tokenizer_config:
                            fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
1791
                vocab_files["tokenizer_file"] = fast_tokenizer_file
1792
1793

        # Get files from url, cache, or disk depending on the case
Julien Chaumond's avatar
Julien Chaumond committed
1794
        resolved_vocab_files = {}
1795
        unresolved_files = []
Julien Chaumond's avatar
Julien Chaumond committed
1796
1797
1798
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
1799
1800
1801
1802
1803
            elif single_file_id == file_id:
                if os.path.isfile(file_path):
                    resolved_vocab_files[file_id] = file_path
                elif is_remote_url(file_path):
                    resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
Julien Chaumond's avatar
Julien Chaumond committed
1804
            else:
1805
1806
1807
1808
1809
1810
1811
1812
                resolved_vocab_files[file_id] = cached_file(
                    pretrained_model_name_or_path,
                    file_path,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
1813
                    use_auth_token=token,
1814
1815
1816
1817
1818
                    user_agent=user_agent,
                    revision=revision,
                    subfolder=subfolder,
                    _raise_exceptions_for_missing_entries=False,
                    _raise_exceptions_for_connection_errors=False,
1819
                    _commit_hash=commit_hash,
1820
                )
1821
                commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash)
1822

1823
1824
1825
1826
1827
1828
        if len(unresolved_files) > 0:
            logger.info(
                f"Can't load following files from cache: {unresolved_files} and cannot check if these "
                "files are necessary for the tokenizer to operate."
            )

1829
        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
1830
1831
1832
1833
            raise EnvironmentError(
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
1834
                f"containing all relevant files for a {cls.__name__} tokenizer."
1835
            )
1836

1837
        for file_id, file_path in vocab_files.items():
1838
1839
1840
            if file_id not in resolved_vocab_files:
                continue

1841
            if is_local:
1842
                logger.info(f"loading file {file_path}")
1843
            else:
1844
                logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
1845

1846
        return cls._from_pretrained(
1847
1848
1849
1850
            resolved_vocab_files,
            pretrained_model_name_or_path,
            init_configuration,
            *init_inputs,
1851
            use_auth_token=token,
1852
            cache_dir=cache_dir,
1853
            local_files_only=local_files_only,
1854
            _commit_hash=commit_hash,
1855
            _is_local=is_local,
1856
            **kwargs,
1857
1858
1859
1860
        )

    @classmethod
    def _from_pretrained(
        cls,
        resolved_vocab_files,
        pretrained_model_name_or_path,
        init_configuration,
        *init_inputs,
        use_auth_token=None,
        cache_dir=None,
        local_files_only=False,
        _commit_hash=None,
        _is_local=False,
        **kwargs,
    ):
        """
        Instantiate a tokenizer from already-resolved vocabulary/config file paths.

        Called by `from_pretrained` after it has located (and possibly downloaded) the
        vocabulary files, tokenizer config, special tokens map and added tokens file.

        Args:
            resolved_vocab_files (`dict`):
                Mapping of file ids (e.g. `"tokenizer_file"`, `"tokenizer_config_file"`,
                `"special_tokens_map_file"`, `"added_tokens_file"`) to local file paths
                (or `None` when the file was not found).
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The original identifier or path, stored on the tokenizer as `name_or_path`
                and used for class-mismatch checks and max-length lookup.
            init_configuration (`dict`):
                Fallback init kwargs, used only when no tokenizer config file was found.
            init_inputs:
                Positional arguments forwarded to the tokenizer `__init__` (overridden by
                the saved `init_inputs` from the config when not provided).
            use_auth_token, cache_dir, local_files_only, _commit_hash:
                Forwarded to `AutoConfig.from_pretrained` (and to the slow-tokenizer
                recursion) when further resolution is needed.
            _is_local (`bool`, *optional*, defaults to `False`):
                Whether the files come from a local directory; if so, `auto_map` entries
                are left untouched.
            kwargs:
                Extra init kwargs; they override values loaded from the config and the
                special tokens map.

        Returns:
            The instantiated tokenizer, with special tokens and added tokens restored.

        Raises:
            OSError: if the tokenizer class fails to load its vocabulary files.
            ValueError: if the saved added-tokens indices are inconsistent with the vocabulary.
        """
        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
        # file or if `from_slow` is set to True.
        from_slow = kwargs.get("from_slow", False)
        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
            # Recurse with the slow class; deep-copies protect the caller's dicts/kwargs
            # from mutation by the nested call.
            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
                copy.deepcopy(init_configuration),
                *init_inputs,
                use_auth_token=use_auth_token,
                cache_dir=cache_dir,
                local_files_only=local_files_only,
                _commit_hash=_commit_hash,
                **(copy.deepcopy(kwargs)),
            )
        else:
            slow_tokenizer = None

        # Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
            config_tokenizer_class = init_kwargs.get("tokenizer_class")
            init_kwargs.pop("tokenizer_class", None)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
        else:
            config_tokenizer_class = None
            init_kwargs = init_configuration

        if "auto_map" in init_kwargs and not _is_local:
            # For backward compatibility with old format.
            if isinstance(init_kwargs["auto_map"], (tuple, list)):
                init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
            init_kwargs["auto_map"] = add_model_info_to_auto_map(
                init_kwargs["auto_map"], pretrained_model_name_or_path
            )

        if config_tokenizer_class is None:
            from .models.auto.configuration_auto import AutoConfig  # tests_ignore

            # Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
            try:
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    use_auth_token=use_auth_token,
                    cache_dir=cache_dir,
                    local_files_only=local_files_only,
                    _commit_hash=_commit_hash,
                )
                config_tokenizer_class = config.tokenizer_class
            except (OSError, ValueError, KeyError):
                # skip if an error occurred.
                config = None
            if config_tokenizer_class is None:
                # Third attempt. If we have not yet found the original type of the tokenizer
                # we are loading, see if we can infer it from the type of the configuration file.
                from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES  # tests_ignore

                if hasattr(config, "model_type"):
                    model_type = config.model_type
                else:
                    # Fallback: use pattern matching on the string.
                    model_type = None
                    for pattern in TOKENIZER_MAPPING_NAMES.keys():
                        if pattern in str(pretrained_model_name_or_path):
                            model_type = pattern
                            break

                if model_type is not None:
                    # Mapping values are (slow_class_name, fast_class_name) pairs.
                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
                        model_type, (None, None)
                    )
                    if config_tokenizer_class is None:
                        config_tokenizer_class = config_tokenizer_class_fast

        if config_tokenizer_class is not None:
            # Compare with "Fast" suffix stripped so slow/fast variants of the same
            # tokenizer are considered equivalent; warn (not fail) on mismatch.
            if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
                logger.warning(
                    "The tokenizer class you load from this checkpoint is not the same type as the class this"
                    " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
                    f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
                    f" from is '{cls.__name__}'."
                )

        # Update with newly provided kwargs
        init_kwargs.update(kwargs)

        # Convert AddedTokens serialized as dict to class instances
        def convert_added_tokens(obj: Union[AddedToken, Any]):
            # Recursively rebuild AddedToken instances from their {"__type": "AddedToken", ...} form.
            if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
                obj.pop("__type")
                return AddedToken(**obj)
            elif isinstance(obj, (list, tuple)):
                return [convert_added_tokens(o) for o in obj]
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v) for k, v in obj.items()}
            return obj

        init_kwargs = convert_added_tokens(init_kwargs)

        # Set max length if needed
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
            # if we're using a pretrained model, ensure the tokenizer
            # won't index sequences longer than the number of positional embeddings
            model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
            if model_max_length is not None and isinstance(model_max_length, (int, float)):
                model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)
                # TODO(PVP) - uncomment following line in Transformers v5
                # init_kwargs["model_max_length"] = model_max_length
                # TODO(PVP) - remove in Transformers v5
                # ---
                init_kwargs["model_max_length"] = cls._eventually_correct_t5_max_length(
                    pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")
                )
                # ---

        # Merge resolved_vocab_files arguments in init_kwargs.
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer

        init_kwargs["name_or_path"] = pretrained_model_name_or_path

        # Instantiate tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
        except OSError:
            raise OSError(
                "Unable to load vocabulary from file. "
                "Please check that the provided vocabulary is accessible and not corrupted."
            )

        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
        # Removed: Now done at the base class level
        # tokenizer.init_inputs = init_inputs
        # tokenizer.init_kwargs = init_kwargs

        # If there is a complementary special token map, load it
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
            for key, value in special_tokens_map.items():
                if key in kwargs and kwargs[key]:
                    # This value has already been redefined by the kwargs
                    # We keep this new value and ignore the one stored in the special_tokens_map_file
                    continue
                if isinstance(value, dict):
                    value = AddedToken(**value)
                elif isinstance(value, list):
                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                setattr(tokenizer, key, value)

        # Add supplementary tokens.
        special_tokens = tokenizer.all_special_tokens
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                added_tok_encoder = json.load(added_tokens_handle)

            # Sort added tokens by index
            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
            # individual tokens would repeatedly rebuild a trie, which can be slow.
            is_last_special = None
            tokens = []

            for token, index in added_tok_encoder_sorted:
                # Pending batch `tokens` is not yet in the vocab, so the effective length
                # must account for it.
                current_index = len(tokenizer) + len(tokens)
                if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids(token) != index:
                    # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
                    # index is the current length of the tokenizer (not in vocabulary)
                    raise ValueError(
                        f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
                        f"{index}."
                    )
                elif not has_tokenizer_file and index != current_index:
                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
                    # current length of the tokenizer.
                    raise ValueError(
                        f"Non-consecutive added token '{token}' found. "
                        f"Should have index {current_index} but has index {index} in saved vocabulary."
                    )

                is_special = bool(token in special_tokens)
                if is_last_special is None or is_last_special == is_special:
                    tokens.append(token)
                else:
                    # Special-ness changed: flush the accumulated batch before starting a new one.
                    tokenizer.add_tokens(tokens, special_tokens=is_last_special)
                    tokens = [token]
                is_last_special = is_special

            if tokens:
                tokenizer.add_tokens(tokens, special_tokens=is_last_special)

        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
        added_tokens = tokenizer.sanitize_special_tokens()
        if added_tokens:
            logger.warning_advice(
                "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                " fine-tuned or trained."
            )

        return tokenizer

2091
2092
2093
2094
2095
2096
2097
2098
    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        # This method should be deleted in Transformers v5
        # Its only purpose is to potentially throw a warning
        # that incorrectly defined max lengths of T5's tokenizer are used
        # which we will correct in Transformers v5.
        return max_model_length

2099
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs,
    ) -> Tuple[str]:
        """
        Save the full tokenizer state.


        This method makes sure the full tokenizer can then be re-loaded using the
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.

        Warning: This won't save modifications you may have applied to the tokenizer after the instantiation (for
        instance, modifying `tokenizer.do_lower_case` after creation).

        Args:
            save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
            legacy_format (`bool`, *optional*):
                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
                format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
                added_tokens files.

                If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
                "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
                loaded in the corresponding "slow" tokenizer.

                If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
                error is raised.
            filename_prefix (`str`, *optional*):
                A prefix to add to the names of the files saved by the tokenizer.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            A tuple of `str`: The files saved.
        """
        # Refuse to overwrite an existing file; callers must provide a directory.
        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            # Snapshot file timestamps now so only files modified below are uploaded later.
            commit_message = kwargs.pop("commit_message", None)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        special_tokens_map_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )

        # Deep copy so mutations below never leak back into the live tokenizer.
        tokenizer_config = copy.deepcopy(self.init_kwargs)

        # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
        # target_keys = self.init_kwargs.keys()
        target_keys = ["model_max_length", "clean_up_tokenization_spaces"]
        for k in target_keys:
            if hasattr(self, k):
                tokenizer_config[k] = getattr(self, k)

        if len(self.init_inputs) > 0:
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
        # Vocabulary file paths are machine-specific; drop them from the portable config.
        for file_id in self.vocab_files_names.keys():
            tokenizer_config.pop(file_id, None)

        # Sanitize AddedTokens
        def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
            # Recursively turn AddedToken instances into JSON-serializable dicts;
            # `add_type_field` tags them with "__type" so they can be rebuilt on load.
            if isinstance(obj, AddedToken):
                out = obj.__getstate__()
                if add_type_field:
                    out["__type"] = "AddedToken"
                return out
            elif isinstance(obj, (list, tuple)):
                return [convert_added_tokens(o, add_type_field=add_type_field) for o in obj]
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
            return obj

        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
        tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

        # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
        tokenizer_class = self.__class__.__name__
        # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
        if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
            tokenizer_class = tokenizer_class[:-4]
        tokenizer_config["tokenizer_class"] = tokenizer_class
        if getattr(self, "_auto_map", None) is not None:
            tokenizer_config["auto_map"] = self._auto_map
        if getattr(self, "_processor_class", None) is not None:
            tokenizer_config["processor_class"] = self._processor_class

        # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=tokenizer_config)

        # remove private information
        if "name_or_path" in tokenizer_config:
            tokenizer_config.pop("name_or_path")
            tokenizer_config.pop("special_tokens_map_file", None)

        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

        # Sanitize AddedTokens in special_tokens_map
        write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"Special tokens file saved in {special_tokens_map_file}")

        file_names = (tokenizer_config_file, special_tokens_map_file)

        # Delegate vocabulary/added-tokens (and, for fast tokenizers, tokenizer.json)
        # saving to the subclass hook.
        save_files = self._save_pretrained(
            save_directory=save_directory,
            file_names=file_names,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
        )

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("use_auth_token"),
            )

        return save_files

2244
2245
    def _save_pretrained(
        self,
2246
        save_directory: Union[str, os.PathLike],
2247
        file_names: Tuple[str],
Sylvain Gugger's avatar
Sylvain Gugger committed
2248
        legacy_format: Optional[bool] = None,
2249
2250
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
Sylvain Gugger's avatar
Sylvain Gugger committed
2251
2252
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
2253

Sylvain Gugger's avatar
Sylvain Gugger committed
2254
        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
2255
        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
2256
        """
Sylvain Gugger's avatar
Sylvain Gugger committed
2257
        if legacy_format is False:
2258
            raise ValueError(
2259
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
2260
2261
            )

2262
2263
        save_directory = str(save_directory)

2264
2265
2266
        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
2267
2268
        added_vocab = self.get_added_vocab()
        if added_vocab:
2269
            with open(added_tokens_file, "w", encoding="utf-8") as f:
2270
                out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
2271
                f.write(out_str)
2272
                logger.info(f"added tokens file saved in {added_tokens_file}")
2273

2274
        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
2275

2276
2277
2278
2279
2280
2281
        return file_names + vocab_files + (added_tokens_file,)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        # Abstract: the vocabulary file format is specific to each concrete tokenizer implementation.
        raise NotImplementedError
2295

2296
2297
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.

        Args:
            text (`str`):
                The sequence to be encoded.
            pair (`str`, *optional*):
                A second sequence to be encoded with the first.
            add_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to add the special tokens associated with the corresponding model.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method. See details in
                [`~PreTrainedTokenizerBase.__call__`]

        Returns:
            `List[str]`: The list of tokens.
        """
        # Abstract: concrete (slow/fast) tokenizers implement the actual tokenization.
        raise NotImplementedError

2316
2317
2318
    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
            **kwargs: Passed along to the `.tokenize()` method.
        """,
        """
        Returns:
            `List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the text.
        """,
    )
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

        Same as doing `self.convert_tokens_to_ids(self.tokenize(text))`.

        Args:
            text (`str`, `List[str]` or `List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """
        # Delegate the full preparation to `encode_plus` and keep only the token ids.
        batch = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        return batch["input_ids"]

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        # Abstract: expected to return how many special tokens the tokenizer would add for a
        # single sequence (or for a pair of sequences when `pair=True`); implemented by subclasses.
        raise NotImplementedError

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.

        Args:
            padding (`bool`, `str` or `PaddingStrategy`, *optional*, defaults to `False`):
                User-facing padding argument; coerced to a `PaddingStrategy` member.
            truncation (`bool`, `str` or `TruncationStrategy`, *optional*):
                User-facing truncation argument; coerced to a `TruncationStrategy` member.
            max_length (`int`, *optional*):
                Requested maximum length; falls back to `self.model_max_length` when a strategy needs one.
            pad_to_multiple_of (`int`, *optional*):
                Only used for the final compatibility check against the truncation length.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether to emit deprecation and usage warnings (one-shot warnings are tracked in
                `self.deprecation_warnings`).
            kwargs:
                May carry the deprecated `truncation_strategy` and `pad_to_max_length` arguments, which are
                popped here; the remaining kwargs are returned untouched.

        Returns:
            `Tuple[PaddingStrategy, TruncationStrategy, Optional[int], dict]`: The resolved padding strategy,
            truncation strategy, effective `max_length`, and the leftover keyword arguments.

        Raises:
            ValueError: If padding is requested but no usable pad token is set, or if both padding and
                truncation are active while `max_length` is not a multiple of `pad_to_multiple_of`.
        """
        # Pop the deprecated arguments first so they are not forwarded in the returned kwargs.
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is None:
            if verbose:
                # Warn only once per process for this condition.
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                        " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
                        " tokenizer you can select this strategy more precisely by providing a specific strategy to"
                        " `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            # Deprecated `pad_to_max_length=True` path.
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                if verbose:
                    # `padding=True` means "longest", so a bare `max_length` has no effect here.
                    if max_length is not None and (
                        truncation is None or truncation is False or truncation == "do_not_truncate"
                    ):
                        warnings.warn(
                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                            "To pad to max length, use `padding='max_length'`."
                        )
                    if old_pad_to_max_length is not False:
                        warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.")
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                # String value such as 'max_length' — let the enum constructor validate it.
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is None and old_truncation_strategy != "do_not_truncate":
            # Deprecated `truncation_strategy=...` path.
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
                    " `truncation=True` to truncate examples to a max length. You can give a specific length with"
                    " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
                    " size of the model (e.g. 512 for Bert).  If you have pairs of inputs, you can give a specific"
                    " truncation strategy selected among `truncation='only_first'` (will only truncate the first"
                    " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
                    " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
                    " in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False and truncation is not None:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                # String value such as 'only_first' — let the enum constructor validate it.
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                # model_max_length > LARGE_INTEGER is the sentinel for "no predefined maximum".
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no"
                                " predefined maximum length. Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has"
                                " no predefined maximum length. Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
        sequences.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        """
        # Collect the arguments shared by the input pass and the target pass once, so the two
        # `_call_one` invocations below cannot drift apart.
        shared_kwargs = dict(
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
        )
        shared_kwargs.update(kwargs)

        if text is None and text_target is None:
            raise ValueError("You need to specify either `text` or `text_target`.")

        encodings = target_encodings = None
        if text is not None:
            # Inside the target context manager the inputs arrive as normal texts, but the
            # tokenizer mode must not be switched back in that case.
            if not self._in_target_context_manager:
                self._switch_to_input_mode()
            encodings = self._call_one(text=text, text_pair=text_pair, **shared_kwargs)
        if text_target is not None:
            self._switch_to_target_mode()
            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **shared_kwargs)
        # Always leave the tokenizer back in input mode.
        self._switch_to_input_mode()

        if text_target is None:
            return encodings
        if text is None:
            return target_encodings
        # Both inputs and targets were provided: expose the target ids as labels.
        encodings["labels"] = target_encodings["input_ids"]
        return encodings

    def _call_one(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Input type checking for clearer error
        def _is_valid_text_input(t):
            # Accepted shapes: a string, a (possibly empty) list/tuple of strings, or a
            # (possibly empty) list/tuple of lists/tuples of strings.
            if isinstance(t, str):
                return True
            if not isinstance(t, (list, tuple)):
                return False
            if len(t) == 0:
                return True
            head = t[0]
            if isinstance(head, str):
                return True
            if isinstance(head, (list, tuple)):
                return len(head) == 0 or isinstance(head[0], str)
            return False

        if not _is_valid_text_input(text):
            raise ValueError(
                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )

        if text_pair is not None and not _is_valid_text_input(text_pair):
            raise ValueError(
                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
            )

        # With pre-split words a batch is a list of lists; otherwise any list/tuple is a batch.
        if is_split_into_words:
            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
        else:
            is_batched = isinstance(text, (list, tuple))

        if not is_batched:
            # Single example: delegate directly.
            return self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )

        # Batched path: validate the pair argument before zipping.
        if isinstance(text_pair, str):
            raise TypeError(
                "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
                " `text`."
            )
        if text_pair is not None and len(text) != len(text_pair):
            raise ValueError(
                f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
                f" {len(text_pair)}."
            )
        batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
        return self.batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # Resolve the user-facing `padding`/`truncation` arguments into explicit strategy enums;
        # the returned kwargs have the deprecated arguments popped off.
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate the actual encoding to the slow/fast-specific implementation.
        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Implementation hook behind [`~PreTrainedTokenizerBase.encode_plus`]: receives already-resolved
        `PaddingStrategy`/`TruncationStrategy` members instead of the user-facing `padding`/`truncation`
        arguments. Must be implemented by subclasses.
        """
        raise NotImplementedError

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in `encode_plus`).
        """
        # Resolve the user-facing `padding`/`truncation` arguments — including the legacy
        # 'truncation_strategy' and 'pad_to_max_length' kwargs — into canonical strategy enums.
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Gather the resolved options once, then delegate the real work to the
        # slow/fast implementation of `_batch_encode_plus`.
        dispatch_kwargs = dict(
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
        )
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs, **dispatch_kwargs, **kwargs
        )
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Abstract hook for encoding a batch of sequences (or sequence pairs). The public
        # `batch_encode_plus` resolves the user-facing `padding`/`truncation` arguments into
        # strategy enums before delegating here; slow and fast tokenizer subclasses provide
        # the concrete implementation.
        raise NotImplementedError
    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
        `self.pad_token_id` and `self.pad_token_type_id`).

        Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
        text followed by a call to the `pad` method to get a padded encoding.

        <Tip>

        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
                the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
        """
        # Advise (once per tokenizer) against using `pad` with a fast tokenizer, where
        # `__call__` with padding enabled is more efficient.
        if self.__class__.__name__.endswith("Fast"):
            if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
                logger.warning_advice(
                    f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
                    " using the `__call__` method is faster than using a method to encode the text followed by a call"
                    " to the `pad` method to get a padded encoding."
                )
                self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method "
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        # Empty input: nothing to pad, return as-is (with an empty attention mask if requested).
        if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            for item in required_input:
                if len(item) != 0:
                    first_element = item[0]
                    break
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_tensor(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_tensor(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    "Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

        # Convert padding_strategy in PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        # Single (non-batched) input: pad it directly and return.
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        # Explicit check instead of `assert`: an `assert` would be silently stripped when
        # Python runs with the -O flag, letting inconsistent batches through.
        if not all(len(v) == batch_size for v in encoded_inputs.values()):
            raise ValueError("Some items in the output dictionary have a different batch size than others.")

        # `LONGEST` is resolved to a concrete `MAX_LENGTH` equal to the longest sequence so
        # every example in the batch is padded to the same length below.
        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = {k: v[i] for k, v in encoded_inputs.items()}
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
Sylvain Gugger's avatar
Sylvain Gugger committed
3081
3082
3083
3084
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Sylvain Gugger's avatar
Sylvain Gugger committed
3085
3086
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)
Sylvain Gugger's avatar
Sylvain Gugger committed
3087

3088
        Should be overridden in a subclass if the model has a special way of building those.
Sylvain Gugger's avatar
Sylvain Gugger committed
3089
3090

        Args:
3091
3092
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Sylvain Gugger's avatar
Sylvain Gugger committed
3093
3094

        Returns:
3095
            `List[int]`: The token type ids.
Sylvain Gugger's avatar
Sylvain Gugger committed
3096
        """
3097
3098
3099
3100
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

Sylvain Gugger's avatar
Sylvain Gugger committed
3101
3102
3103
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
3104
        """
Sylvain Gugger's avatar
Sylvain Gugger committed
3105
3106
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.
Sylvain Gugger's avatar
Sylvain Gugger committed
3107

3108
        This implementation does not add special tokens and this method should be overridden in a subclass.
Sylvain Gugger's avatar
Sylvain Gugger committed
3109
3110

        Args:
3111
3112
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Sylvain Gugger's avatar
Sylvain Gugger committed
3113
3114

        Returns:
3115
            `List[int]`: The model input with special tokens.
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids*
        different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return
        overflowing tokens. Such a combination of arguments will raise an error.

        Args:
            ids (`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
                `convert_tokens_to_ids` methods.
            pair_ids (`List[int]`, *optional*):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
                and `convert_tokens_to_ids` methods.
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0

        # Without special tokens there is no model-specific rule for segment ids, so
        # token_type_ids would be meaningless — reject the combination up front.
        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        # `longest_first` interleaves removals from both sequences, so a single coherent
        # list of overflowing tokens cannot be produced for pairs (see `truncate_sequences`).
        if (
            return_overflowing_tokens
            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
            and pair_ids is not None
        ):
            raise ValueError(
                "Not possible to return overflowing tokens for pair of sequences with the "
                "`longest_first`. Please select another truncation strategy than `longest_first`, "
                "for instance `only_second` or `only_first`."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation_strategy,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            # Without special tokens, all segment ids default to 0 for both sequences.
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

        # Build output dictionary
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Check lengths
        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

        # Padding
        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding_strategy.value,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )

        return batch_outputs
    def truncate_sequences(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ) -> Tuple[List[int], List[int], List[int]]:
        """
        Truncates a sequence pair in-place following the strategy.

        Args:
            ids (`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
                `convert_tokens_to_ids` methods.
            pair_ids (`List[int]`, *optional*):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
                and `convert_tokens_to_ids` methods.
            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                The strategy to follow for truncation. Can be:

                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will truncate
                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
                  batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
                  than the model maximum admissible input size).
            stride (`int`, *optional*, defaults to 0):
                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
            of sequences (or a batch of pairs) is provided.
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []

        # Accept the strategy as either a string or the enum itself.
        if not isinstance(truncation_strategy, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation_strategy)

        overflowing_tokens = []
        # `longest_first` with a single sequence degenerates to `only_first`.
        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
        ):
            if len(ids) > num_tokens_to_remove:
                # The overflow window also keeps `stride` tokens from the retained part so
                # downstream windows can overlap with the main sequence.
                window_len = min(len(ids), stride + num_tokens_to_remove)
                if self.truncation_side == "left":
                    overflowing_tokens = ids[:window_len]
                    ids = ids[num_tokens_to_remove:]
                elif self.truncation_side == "right":
                    overflowing_tokens = ids[-window_len:]
                    ids = ids[:-num_tokens_to_remove]
                else:
                    raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")

            else:
                # Not enough tokens in the first sequence: log (do not raise) and leave the
                # inputs untouched so the caller still gets a usable result.
                error_msg = (
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the first sequence has a length {len(ids)}. "
                )
                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
                    error_msg = (
                        error_msg + "Please select another truncation strategy than "
                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
                    )
                logger.error(error_msg)
        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
            logger.warning(
                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
                "truncation strategy. So the returned list will always be empty even if some "
                "tokens have been removed."
            )
            # Remove one token at a time, always from whichever sequence is currently longer
            # (ties go to the second sequence since `len(ids) > len(pair_ids)` is strict).
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    if self.truncation_side == "right":
                        ids = ids[:-1]
                    elif self.truncation_side == "left":
                        ids = ids[1:]
                    else:
                        raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
                else:
                    if self.truncation_side == "right":
                        pair_ids = pair_ids[:-1]
                    elif self.truncation_side == "left":
                        pair_ids = pair_ids[1:]
                    else:
                        raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
            if len(pair_ids) > num_tokens_to_remove:
                # Same windowing as `only_first`, applied to the second sequence.
                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
                if self.truncation_side == "right":
                    overflowing_tokens = pair_ids[-window_len:]
                    pair_ids = pair_ids[:-num_tokens_to_remove]
                elif self.truncation_side == "left":
                    overflowing_tokens = pair_ids[:window_len]
                    pair_ids = pair_ids[num_tokens_to_remove:]
                else:
                    raise ValueError("invalid truncation strategy:" + str(self.truncation_side))
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the second sequence has a length {len(pair_ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    "for instance 'longest_first' or 'only_first'."
                )

        return (ids, pair_ids, overflowing_tokens)

3376
3377
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad a single set of encoded inputs (on the left or right, up to `max_length` or the batch maximum).

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Fall back to the model default when the caller did not say whether to return the mask.
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        # The first model input (e.g. "input_ids") drives the padding length.
        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round the target length up to the next multiple when requested (e.g. for Tensor Cores).
        if max_length is not None and pad_to_multiple_of is not None and max_length % pad_to_multiple_of != 0:
            max_length = (max_length // pad_to_multiple_of + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize the attention mask if absent (all ones: every real token is attended to).
        if return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        if not needs_to_be_padded:
            return encoded_inputs

        pad_len = max_length - len(required_input)

        if self.padding_side == "right":
            if return_attention_mask:
                encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * pad_len
            if "token_type_ids" in encoded_inputs:
                encoded_inputs["token_type_ids"] = (
                    encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * pad_len
                )
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * pad_len
            encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * pad_len
        elif self.padding_side == "left":
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [0] * pad_len + encoded_inputs["attention_mask"]
            if "token_type_ids" in encoded_inputs:
                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * pad_len + encoded_inputs[
                    "token_type_ids"
                ]
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = [1] * pad_len + encoded_inputs["special_tokens_mask"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * pad_len + required_input
        else:
            raise ValueError("Invalid padding strategy:" + str(self.padding_side))

        return encoded_inputs

3453
3454
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
        often want to remove sub-word tokenization artifacts at the same time.

        Abstract in the base class: the slow and fast tokenizer subclasses provide the actual implementation.

        Args:
            tokens (`List[str]`): The tokens to join in a string.

        Returns:
            `str`: The joined tokens.
        """
        raise NotImplementedError

3466
    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        # Fix: the annotation was `bool = None`; `Optional[bool]` matches the `None` default.
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]
3499

3500
    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        # Fix: the annotation was `bool = None`; `Optional[bool]` matches the `None` default.
        # Convert inputs to python lists (handles plain ints, numpy arrays and framework tensors).
        token_ids = to_py_obj(token_ids)

        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        """
        Abstract decoding backend called by `decode` after tensor inputs have been converted to python lists.
        The slow and fast tokenizer subclasses provide the actual implementation.
        """
        raise NotImplementedError

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If `already_has_special_tokens` is `False` or `token_ids_1` is provided, which this base
                implementation does not support.
        """
        # Validate explicitly rather than with `assert`, which is stripped when Python runs with `-O`.
        if not already_has_special_tokens or token_ids_1 is not None:
            raise ValueError(
                "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
                "Please use a slow (full python) tokenizer to activate this argument. "
                "Or set `return_special_tokens_mask=True` when calling the encoding method "
                "to get the special tokens mask in any tokenizer. "
            )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

3577
3578
    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
Sylvain Gugger's avatar
Sylvain Gugger committed
3579
        """
3580
        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
Sylvain Gugger's avatar
Sylvain Gugger committed
3581
3582

        Args:
3583
            out_string (`str`): The text to clean up.
Sylvain Gugger's avatar
Sylvain Gugger committed
3584
3585

        Returns:
3586
            `str`: The cleaned-up string.
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
        """
        out_string = (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )
        return out_string
3601
3602
3603

    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        """
3604
        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
3605
3606
3607
        corresponding model

        Args:
3608
3609
3610
            ids (`List[str]`): The ids produced by the tokenization
            max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
            verbose (`bool`): Whether or not to print more information and warnings.
3611
3612
3613
3614
3615
3616

        """
        if max_length is None and len(ids) > self.model_max_length and verbose:
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
3617
3618
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors"
3619
3620
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
3621

3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
    def _switch_to_input_mode(self):
        """
        Private method to put the tokenizer in input mode (when it has different modes for input/outputs).
        No-op in the base class; subclasses with distinct input/target processing override it.
        """
        pass

    def _switch_to_target_mode(self):
        """
        Private method to put the tokenizer in target mode (when it has different modes for input/outputs).
        No-op in the base class; subclasses with distinct input/target processing override it.
        """
        pass

3634
3635
3636
3637
3638
3639
    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
        sequence-to-sequence models that need a slightly different processing for the labels.
        """
        warnings.warn(
            "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
            "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
            "your input texts if you use the same keyword arguments, or in a separate call."
        )
        self._switch_to_target_mode()
        self._in_target_context_manager = True
        # Fix: restore input mode even if the caller's `with` body raises; otherwise the tokenizer
        # would be left stuck in target mode with `_in_target_context_manager=True`.
        try:
            yield
        finally:
            self._in_target_context_manager = False
            self._switch_to_input_mode()
3650

3651
3652
3653
3654
3655
3656
    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """
        Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
        library are already mapped with `AutoTokenizer`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
                The auto class to register this new tokenizer with.
        """
        # Accept either the auto class itself or its name.
        auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

        import transformers.models.auto as auto_module

        if not hasattr(auto_module, auto_class_name):
            raise ValueError(f"{auto_class_name} is not a valid auto class.")

        cls._auto_class = auto_class_name

3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: str = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepare model inputs for translation. For best performance, translate one sentence at a time.

        Arguments:
3692
            src_texts (`List[str]`):
3693
                List of documents to summarize or source language texts.
3694
            tgt_texts (`list`, *optional*):
3695
                List of summaries or target language texts.
3696
            max_length (`int`, *optional*):
3697
                Controls the maximum length for encoder inputs (documents to summarize or source language texts) If
Sylvain Gugger's avatar
Sylvain Gugger committed
3698
3699
3700
                left unset or set to `None`, this will use the predefined model maximum length if a maximum length is
                required by one of the truncation/padding parameters. If the model has no specific maximum input length
                (like XLNet) truncation/padding to a maximum length will be deactivated.
3701
            max_target_length (`int`, *optional*):
3702
                Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
3703
                to `None`, this will use the max_length value.
3704
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
3705
3706
                Activates and controls padding. Accepts the following values:

Sylvain Gugger's avatar
Sylvain Gugger committed
3707
3708
3709
3710
3711
3712
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
3713
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
3714
3715
                If set, will return tensors instead of list of python integers. Acceptable values are:

3716
3717
3718
3719
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `True`):
3720
3721
                Activates and controls truncation. Accepts the following values:

Sylvain Gugger's avatar
Sylvain Gugger committed
3722
3723
3724
3725
3726
3727
                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
3728
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
Sylvain Gugger's avatar
Sylvain Gugger committed
3729
3730
                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided. This will only
3731
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
Sylvain Gugger's avatar
Sylvain Gugger committed
3732
3733
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
3734
            **kwargs:
3735
                Additional keyword arguments passed along to `self.__call__`.
3736
3737

        Return:
3738
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
3739
3740
3741
3742
3743

            - **input_ids** -- List of token ids to be fed to the encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

3744
            The full set of keys `[input_ids, attention_mask, labels]`, will only be returned if tgt_texts is passed.
3745
3746
            Otherwise, input_ids, attention_mask will be the only keys.
        """
3747
3748
3749
        # docstyle-ignore
        formatted_warning = """
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
3750
`__call__` method to prepare your inputs and targets.
3751
3752
3753

Here is a short example:

3754
3755
3756
3757
3758
model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

3759
model_inputs = tokenizer(src_texts, ...)
3760
labels = tokenizer(text_target=tgt_texts, ...)
3761
3762
3763
3764
3765
3766
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
"""
        warnings.warn(formatted_warning, FutureWarning)
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
        # mBART-specific kwargs that should be ignored by other models.
        kwargs.pop("src_lang", None)
        kwargs.pop("tgt_lang", None)
        if max_length is None:
            max_length = self.model_max_length
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs
        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        with self.as_target_tokenizer():
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
3798
3799


3800
def get_fast_tokenizer_file(tokenization_files: List[str]) -> str:
    """
    Get the tokenization file to use for this version of transformers.

    Args:
        tokenization_files (`List[str]`): The list of available configuration files.

    Returns:
        `str`: The tokenization file to use.
    """
    # Map each version-tagged tokenizer file name to the version string it carries.
    candidates = {}
    for candidate in tokenization_files:
        match = _re_tokenizer_file.search(candidate)
        if match is not None:
            candidates[match.groups()[0]] = candidate

    # Defaults to FULL_TOKENIZER_FILE, then walk the sorted versions and keep the newest
    # one that is not ahead of the running transformers version.
    best_file = FULL_TOKENIZER_FILE
    current_version = version.parse(__version__)
    for candidate_version in sorted(candidates):
        if version.parse(candidate_version) > current_version:
            # Versions are sorted, so everything after this point is too new.
            break
        best_file = candidates[candidate_version]

    return best_file


3831
3832
# To update the docstring, we need to copy the method, otherwise we change the original docstring.
PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
3833
3834
3835
3836
if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None:
    PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
        object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
    )