"vscode:/vscode.git/clone" did not exist on "c1780ce7a487162f44d74cb705b46ff42e7dfe0c"
tokenization_utils_base.py 157 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the user
facing encoding methods), SpecialTokensMixin (hosts the special tokens logic) and BatchEncoding (wraps the dictionary
of output with special methods for the fast tokenizers).
"""

import copy
import json
import os
import warnings
from collections import OrderedDict, UserDict
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np

import requests

from .file_utils import (
    ExplicitEnum,
    PaddingStrategy,
    PushToHubMixin,
    TensorType,
    _is_jax,
    _is_numpy,
    _is_tensorflow,
    _is_torch,
    _is_torch_device,
    add_end_docstrings,
    cached_path,
    hf_bucket_url,
    is_flax_available,
    is_offline_mode,
    is_remote_url,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    to_py_obj,
    torch_required,
)
from .utils import logging


if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf
    if is_flax_available():
        import jax.numpy as jnp  # noqa: F401


if is_tokenizers_available():
    from tokenizers import AddedToken
    from tokenizers import Encoding as EncodingFast
else:

    @dataclass(frozen=True, eq=True)
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
        way it should behave.
        """

        content: str = field(default_factory=str)
        single_word: bool = False
        lstrip: bool = False
        rstrip: bool = False
        normalized: bool = True

        def __getstate__(self):
            return self.__dict__

    @dataclass
    class EncodingFast:
        """This is a dummy class because without the `tokenizers` library we don't have these objects anyway"""

        pass


logger = logging.get_logger(__name__)

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]


# Slow tokenizers used to be saved in three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"


class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (:obj:`int`): Index of the first character in the original string.
        end (:obj:`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (:obj:`int`): Index of the first token in the span.
        end (:obj:`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int


class BatchEncoding(UserDict):
    """
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
    :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus` methods (tokens,
    attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
            'attention_mask', etc.).
        encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
            space to token space, the :obj:`tokenizers.Encoding` instance or list of instances (for batches) holds this
            information.
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
        n_sequences (:obj:`Optional[int]`, `optional`):
            The number of sequences used to generate each sample from the batch encoded in this
            :class:`~transformers.BatchEncoding` (:obj:`1` for a single sentence, :obj:`2` for a pair of sentences).
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        self._encodings = encoding

        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        self._n_sequences = n_sequences

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def n_sequences(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
        sentence) or :obj:`2` (a pair of sentences)
        """
        return self._n_sequences

    @property
    def is_fast(self) -> bool:
        """
        :obj:`bool`: Indicates whether this :class:`~transformers.BatchEncoding` was generated from the result of a
        :class:`~transformers.PreTrainedTokenizerFast` or not.
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError

    def __getstate__(self):
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        :obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns
        :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
        integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[str]`: The list of tokens at that index.
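
        Example (a minimal sketch, assuming a fast tokenizer; the ``bert-base-uncased`` checkpoint is just an
        illustrative choice)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world!")
            # Word pieces plus the special tokens added by the model, e.g.
            # ['[CLS]', 'hello', 'world', '!', '[SEP]']
            print(encoding.tokens())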
        """
        if not self._encodings:
            raise ValueError("tokens() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].tokens

    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to the id of their original sentences:

            - :obj:`None` for special tokens added around or between sequences,
            - :obj:`0` for tokens corresponding to words in the first sequence,
            - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
            added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
            corresponding sequence.
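
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world", "How are you?")
            # Special tokens map to None, tokens of the first sentence to 0, tokens of the second one to 1, e.g.
            # [None, 0, 0, None, 1, 1, 1, 1, None]
            print(encoding.sequence_ids())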
        """
        if not self._encodings:
            raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].sequence_ids

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
            word (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python-based tokenizers")
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        return self.word_ids(batch_index)

    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
            word (several tokens will be mapped to the same word index if they are parts of that word).
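
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("hyperparameters matter")
            # Special tokens map to None; the word pieces of "hyperparameters" all share the word index 0
            print(encoding.word_ids())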
        """
        if not self._encodings:
            raise ValueError("word_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].word_ids

    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token. In the general use case, this method returns
        :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Can be called as:

        - ``self.token_to_sequence(token_index)`` if batch size is 1
        - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
                sequence.

        Returns:
            :obj:`int`: Index of the sequence the given token belongs to.
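
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world", "How are you?")
            print(encoding.token_to_sequence(1))  # e.g. 0, the token comes from the first sequence
            print(encoding.token_to_sequence(5))  # e.g. 1, the token comes from the second sequence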
        """

        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_sequence(token_index)

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding to (i.e. comprising) an encoded token in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
                sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
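
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world!")
            # Token 1 is the first token after [CLS]; it comes from the first word ("Hello")
            print(encoding.token_to_word(1))  # e.g. 0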
        """

        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
          to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
            Returns :obj:`None` if no tokens correspond to the word.
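
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("hyperparameters matter")
            span = encoding.word_to_tokens(0)
            # The slice of tokens covering the word pieces of the first word, "hyperparameters"
            print(encoding.tokens()[span.start : span.end])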
        """

        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return TokenSpan(*span) if span is not None else None

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - ``self.token_to_chars(token_index)`` if batch size is 1
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
                the sequence.

        Returns:
            :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
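
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            text = "Hello world!"
            encoding = tokenizer(text)
            span = encoding.token_to_chars(1)
            print(text[span.start : span.end])  # e.g. 'Hello'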
        """

        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.


        Returns:
            :obj:`int`: Index of the token.
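
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            text = "Hello world!"
            encoding = tokenizer(text)
            # Character 6 is the 'w' of "world"; it falls inside token 2 ([CLS] and 'hello' come before it)
            print(encoding.char_to_token(6))  # e.g. 2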
        """

        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)

    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - ``self.word_to_chars(word_index)`` if batch size is 1
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided word index belongs to.

        Returns:
            :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
            CharSpan is a NamedTuple with:

                - start: index of the first character associated to the token in the original string
                - end: index of the character following the last character associated to the token in the original
                  string
        """

        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
                original string.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the pair
                (0 or 1) the provided character index belongs to.


        Returns:
            :obj:`int` or :obj:`List[int]`: Index or indices of the associated word(s) in the input sequence.
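
        Example (a minimal sketch, assuming a fast tokenizer such as the ``bert-base-uncased`` one)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            text = "Hello world!"
            encoding = tokenizer(text)
            print(encoding.char_to_word(8))  # e.g. 1, the character belongs to the second word, "world"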
        """

        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index, sequence_index)

    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
                :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
            prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the batch dimension during the conversion.
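
        Example (a minimal sketch, assuming a fast tokenizer and that PyTorch is installed)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer(["Hello world!", "Hi"], padding=True)
            encoding.convert_to_tensors("pt")  # conversion happens in place and returns the same instance
            print(type(encoding["input_ids"]))  # e.g. <class 'torch.Tensor'>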
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW:
            if not is_tf_available():
                raise ImportError(
                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
                )
            import tensorflow as tf

            as_tensor = tf.constant
            is_tensor = tf.is_tensor
        elif tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        elif tensor_type == TensorType.JAX:
            if not is_flax_available():
                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
            import jax.numpy as jnp  # noqa: F811

            as_tensor = jnp.array
            is_tensor = _is_jax
        else:
            as_tensor = np.asarray
            is_tensor = _is_numpy
        # (mfuntowicz: This code is unreachable)
        # else:
        #     raise ImportError(
        #         f"Unable to convert output to tensors format {tensor_type}"
        #     )

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)

                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
                    # # at-least2d
                    # if tensor.ndim > 2:
                    #     tensor = tensor.squeeze(0)
                    # elif tensor.ndim < 2:
                    #     tensor = tensor[None, :]

                    self[key] = tensor
            except:  # noqa E722
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    )
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                )

        return self

    @torch_required
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).

        Args:
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.

        Returns:
            :class:`~transformers.BatchEncoding`: The same instance after modification.
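
        Example (a minimal sketch, assuming PyTorch is installed and a CUDA device is available)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world!", return_tensors="pt")
            encoding = encoding.to("cuda:0")
            print(encoding["input_ids"].device)  # e.g. cuda:0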
        """

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class SpecialTokensMixin:
    """
    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
    handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
    used to directly access these special tokens in a model-independent manner and allow setting and updating the special
    tokens.

    Args:
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens.
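
    Example (a minimal sketch showing how the mixin attributes are typically used; the ``gpt2`` checkpoint is just an
    illustrative choice)::

        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # gpt2 ships without a padding token; reuse the end-of-sentence token as padding token
        tokenizer.pad_token = tokenizer.eos_token
        print(tokenizer.pad_token, tokenizer.pad_token_id)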
    """

    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]

    def __init__(self, verbose=True, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose

        # We directly set the hidden value to allow initialization with special tokens
        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
        # TODO clean this up at some point (probably by switching to fast tokenizers)
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(
                        isinstance(t, (str, AddedToken)) for t in value
                    ), "One of the tokens is not a string or an AddedToken"
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")

    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
        :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            :obj:`int`: The number of tokens added in the vocabulary during the operation.
        """
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        .. Note::
            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
            the model so that its embedding matrix matches the tokenizer.

            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.

        Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj:`'[CLS]'` and XLM's one
        is also registered to be :obj:`'</s>'`).

        Args:
            special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
                ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
                ``additional_special_tokens``].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assigns the index of the ``unk_token`` to them).

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to add a new classification token to GPT-2
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            model = GPT2Model.from_pretrained('gpt2')

            special_tokens_dict = {'cls_token': '<CLS>'}

            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))

            assert tokenizer.cls_token == '<CLS>'
        """
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

            if self.verbose:
                logger.info(f"Assigning {value} to the {key} key of the tokenizer")
            setattr(self, key, value)

            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, (str, AddedToken)) for t in value
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                assert isinstance(
                    value, (str, AddedToken)
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens

    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary.

        .. Note::
            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
            the model so that its embedding matrix matches the tokenizer.

            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.

        Args:
            new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
                string token to let you personalize its behavior: whether this token should only match against a single
                word, whether this token should strip all potential whitespaces on the left side, whether this token
                should strip all potential whitespaces on the right side, etc.
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library.

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        if not new_tokens:
            return 0

        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]

        return self._add_tokens(new_tokens, special_tokens=special_tokens)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        raise NotImplementedError

    @property
    def bos_token(self) -> str:
        """
        :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None and self.verbose:
            logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        :obj:`str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None and self.verbose:
            logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        :obj:`str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None and self.verbose:
            logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
        not having been set.
        """
        if self._sep_token is None and self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        :obj:`str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None and self.verbose:
            logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
        full depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None and self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
        not having been set.
        """
        if self._mask_token is None and self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
        been set.
        """
        if self._additional_special_tokens is None and self.verbose:
            logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]

    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    @property
    def bos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token
        has not been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been
        set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns :obj:`None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
        set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        :obj:`int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
        sequence leveraging self-attention along the full depth of the model.

        Returns :obj:`None` if the token has not been set.
        """
        if self._cls_token is None:
            return None
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns :obj:`None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
        having been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

    @bos_token_id.setter
    def bos_token_id(self, value):
        self._bos_token = self.convert_tokens_to_ids(value)

    @eos_token_id.setter
    def eos_token_id(self, value):
        self._eos_token = self.convert_tokens_to_ids(value)

    @unk_token_id.setter
    def unk_token_id(self, value):
        self._unk_token = self.convert_tokens_to_ids(value)

    @sep_token_id.setter
    def sep_token_id(self, value):
        self._sep_token = self.convert_tokens_to_ids(value)

    @pad_token_id.setter
    def pad_token_id(self, value):
        self._pad_token = self.convert_tokens_to_ids(value)

    @cls_token_id.setter
    def cls_token_id(self, value):
        self._cls_token = self.convert_tokens_to_ids(value)

    @mask_token_id.setter
    def mask_token_id(self, value):
        self._mask_token = self.convert_tokens_to_ids(value)

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        self._additional_special_tokens = [self.convert_tokens_to_ids(value) for value in values]
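
    # Usage sketch for the setters above (illustrative only, not part of the library;
    # assumes a BERT-like tokenizer whose vocabulary already contains "[PAD]"):
    #
    #     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #     tokenizer.pad_token = "[PAD]"   # set the token string...
    #     tokenizer.pad_token_id          # ...and the id is resolved via convert_tokens_to_ids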

    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
        :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = str(attr_value)
        return set_attr
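
    # Illustrative sketch of `special_tokens_map` (values depend on the checkpoint;
    # shown here for a BERT-like tokenizer):
    #
    #     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #     tokenizer.special_tokens_map
    #     # {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
    #     #  'cls_token': '[CLS]', 'mask_token': '[MASK]'}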

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
        mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
        how special tokens are tokenized.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of :obj:`tokenizers.AddedToken` type to string.
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.)
        mapped to class attributes.

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
        how special tokens are tokenized.
        """
        all_toks = []
        set_attr = self.special_tokens_map_extended
        for attr_value in set_attr.values():
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(OrderedDict.fromkeys(all_toks))
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        """
        :obj:`List[int]`: List the ids of the special tokens(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
        attributes.
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids
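
    # Illustrative sketch relating `all_special_tokens` and `all_special_ids`
    # (not part of the library; exact tokens and ids depend on the checkpoint):
    #
    #     toks = tokenizer.all_special_tokens   # e.g. ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
    #     ids = tokenizer.all_special_ids       # same order as `toks`
    #     assert ids == tokenizer.convert_tokens_to_ids(toks)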


ENCODE_KWARGS_DOCSTRING = r"""
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to encode the sequences with the special tokens relative to their model.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
                  if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
                  sequence lengths greater than the model maximum admissible input size).
            max_length (:obj:`int`, `optional`):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            stride (:obj:`int`, `optional`, defaults to 0):
                If set to a number along with :obj:`max_length`, the overflowing tokens returned when
                :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                returned to provide some overlap between truncated and overflowing sequences. The value of this
                argument defines the number of overlapping tokens.
            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
"""

ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (:obj:`bool`, `optional`):
                Whether to return token type IDs. If left to the default, will return the token type IDs according to
                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are token type IDs? <../glossary.html#token-type-ids>`__
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return overflowing token sequences.
            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return special tokens mask information.
            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return :obj:`(char_start, char_end)` for each token.

                This is only available on fast tokenizers inheriting from
                :class:`~transformers.PreTrainedTokenizerFast`. If using Python's tokenizer, this method will raise
                :obj:`NotImplementedError`.
            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return the lengths of the encoded inputs.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
            **kwargs: passed to the :obj:`self.tokenize()` method

        Return:
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.

              `What are input IDs? <../glossary.html#input-ids>`__

            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
              or if `"token_type_ids"` is in :obj:`self.model_input_names`).

              `What are token type IDs? <../glossary.html#token-type-ids>`__

            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).

              `What are attention masks? <../glossary.html#attention-mask>`__

            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
              :obj:`return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
              :obj:`return_overflowing_tokens=True`).
            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
            - **length** -- The length of the inputs (when :obj:`return_length=True`)
"""

INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)

        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
          each vocabulary file required by the model, and as associated values, the filename for saving the associated
          file (string).
        - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
          low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
          :obj:`url` to the associated pretrained vocabulary file.
        - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
          inputs of this model, or :obj:`None` if the model has no maximum input size.
        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
          to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
          tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
          method.
        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
          applied. Should be :obj:`'right'` or :obj:`'left'`.

    Args:
        model_max_length (:obj:`int`, `optional`):
            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
            loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
            will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
            value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
        padding_side (:obj:`str`, `optional`):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        model_input_names (:obj:`List[string]`, `optional`):
            The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
            :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
            ``self.bos_token_id``.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
            ``self.eos_token_id``.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
            ``self.unk_token_id``.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance). Will be
            associated to ``self.sep_token`` and ``self.sep_token_id``.
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
            ``self.pad_token_id``.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance). Will be associated to
            ``self.cls_token`` and ``self.cls_token_id``.
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
            tokenization process. Will be associated to ``self.additional_special_tokens`` and
            ``self.additional_special_tokens_ids``.
"""


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
    """
    Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.

    Handles shared (mostly boilerplate) methods for those two classes.
    """

    vocab_files_names: Dict[str, str] = {}
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    max_model_input_sizes: Dict[str, Optional[int]] = {}

    # first name has to correspond to main model input name
    # to make sure `tokenizer.pad(...)` works correctly
    model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
    padding_side: str = "right"
    slow_tokenizer_class = None

    def __init__(self, **kwargs):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
        self.name_or_path = kwargs.pop("name_or_path", "")

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        assert self.padding_side in [
            "right",
            "left",
        ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

        self.deprecation_warnings = (
            {}
        )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).

        super().__init__(**kwargs)

    @property
    def max_len_single_sentence(self) -> int:
        """
        :obj:`int`: The maximum length of a sentence that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)

    @property
    def max_len_sentences_pair(self) -> int:
        """
        :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)

    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # For backward compatibility, allow trying to set 'max_len_single_sentence'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
            if not self.deprecation_warnings.get("max_len_single_sentence", False):
                logger.warning(
                    "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
                )
            self.deprecation_warnings["max_len_single_sentence"] = True
        else:
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
            )

    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # For backward compatibility, allow trying to set 'max_len_sentences_pair'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
                logger.warning(
                    "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
                )
            self.deprecation_warnings["max_len_sentences_pair"] = True
        else:
            raise ValueError(
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
            )

    def __repr__(self) -> str:
        return (
            f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
            f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
            f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
        )

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
        """
        raise NotImplementedError()
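
    # Usage sketch (illustrative; `get_vocab` is implemented by the derived slow/fast
    # tokenizer classes, and assumes the unk token is set and in the vocabulary):
    #
    #     vocab = tokenizer.get_vocab()                 # {token_string: token_id}
    #     vocab[tokenizer.unk_token] == tokenizer.unk_token_id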

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        r"""
        Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
        a predefined tokenizer.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                Can be either:

                - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a
                  user or organization name, like ``dbmdz/bert-base-german-cased``.
                - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
                  using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
                  method, e.g., ``./my_model_directory/``.
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                  ``./my_model_directory/vocab.txt``.
            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they
                exist.
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            use_auth_token (:obj:`str` or `bool`, `optional`):
                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
            revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
                identifier allowed by git.
            subfolder (:obj:`str`, `optional`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__`` method.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.

        .. note::

            Passing :obj:`use_auth_token=True` is required when you want to use a private model.

        Examples::

            # We can't directly instantiate the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer
            # Download vocabulary from huggingface.co and cache.
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

            # Download vocabulary from huggingface.co (user-uploaded) and cache.
            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

            # If the tokenizer uses a single vocabulary file, you can point directly to this file
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

            # You can link tokens to special vocabulary when instantiating
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
            # You should be sure '<unk>' is in the vocabulary when doing that.
            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead
            assert tokenizer.unk_token == '<unk>'

        """
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", None)
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)

        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        vocab_files = {}
        init_configuration = {}

        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
                )
            warnings.warn(
                f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
                "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
                FutureWarning,
            )
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                "tokenizer_file": FULL_TOKENIZER_FILE,
            }
            # Look for the tokenizer files
            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    if subfolder is not None:
                        full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
                    else:
                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
                        logger.info(f"Didn't find file {full_file_name}. We won't load it.")
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
                        pretrained_model_name_or_path,
                        filename=file_name,
                        subfolder=subfolder,
                        revision=revision,
                        mirror=None,
                    )

                vocab_files[file_id] = full_file_name

        # Get files from url, cache, or disk depending on the case
        resolved_vocab_files = {}
        unresolved_files = []
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                try:
                    resolved_vocab_files[file_id] = cached_path(
                        file_path,
                        cache_dir=cache_dir,
                        force_download=force_download,
                        proxies=proxies,
                        resume_download=resume_download,
                        local_files_only=local_files_only,
                        use_auth_token=use_auth_token,
                        user_agent=user_agent,
                    )

                except FileNotFoundError as error:
                    if local_files_only:
                        unresolved_files.append(file_id)
                    else:
                        raise error

                except requests.exceptions.HTTPError as err:
                    if "404 Client Error" in str(err):
                        logger.debug(err)
                        resolved_vocab_files[file_id] = None
                    else:
                        raise err

        if len(unresolved_files) > 0:
            logger.info(
                f"Can't load following files from cache: {unresolved_files} and cannot check if these "
                "files are necessary for the tokenizer to operate."
            )

        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
            msg = (
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
            )
            raise EnvironmentError(msg)

        for file_id, file_path in vocab_files.items():
            if file_id not in resolved_vocab_files:
                continue

            if file_path == resolved_vocab_files[file_id]:
                logger.info(f"loading file {file_path}")
            else:
                logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")

        return cls._from_pretrained(
            resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
        )

    @classmethod
    def _from_pretrained(
        cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
    ):
        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
        # file or if `from_slow` is set to True.
        from_slow = kwargs.get("from_slow", False)
        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
                copy.deepcopy(init_configuration),
                *init_inputs,
                **(copy.deepcopy(kwargs)),
            )
        else:
            slow_tokenizer = None

        # Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
            init_kwargs.pop("tokenizer_class", None)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
        else:
            init_kwargs = init_configuration

        # Update with newly provided kwargs
        init_kwargs.update(kwargs)

        # Convert AddedTokens serialized as dict to class instances
        def convert_added_tokens(obj: Union[AddedToken, Any]):
            if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
                obj.pop("__type")
                return AddedToken(**obj)
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v) for k, v in obj.items()}
            return obj

        init_kwargs = convert_added_tokens(init_kwargs)

        # Set max length if needed
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
            # if we're using a pretrained model, ensure the tokenizer
            # won't index sequences longer than the number of positional embeddings
            model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
            if model_max_length is not None and isinstance(model_max_length, (int, float)):
                init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

        # Merge resolved_vocab_files arguments in init_kwargs.
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer

        init_kwargs["name_or_path"] = pretrained_model_name_or_path

        # Instantiate tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
        except OSError:
            raise OSError(
                "Unable to load vocabulary from file. "
                "Please check that the provided vocabulary is accessible and not corrupted."
            )

        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
        # Removed: Now done at the base class level
        # tokenizer.init_inputs = init_inputs
        # tokenizer.init_kwargs = init_kwargs

        # If there is a complementary special token map, load it
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
            for key, value in special_tokens_map.items():
                if isinstance(value, dict):
                    value = AddedToken(**value)
                elif isinstance(value, list):
                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                setattr(tokenizer, key, value)

        # Add supplementary tokens.
        special_tokens = tokenizer.all_special_tokens
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                added_tok_encoder = json.load(added_tokens_handle)

            # Sort added tokens by index
            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))

            for token, index in added_tok_encoder_sorted:
                if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index:
                    # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
                    # index is the current length of the tokenizer (not in vocabulary)
                    raise ValueError(
                        f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
                        f"{index}."
                    )
                elif not has_tokenizer_file and index != len(tokenizer):
                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
                    # current length of the tokenizer.
                    raise ValueError(
                        f"Non-consecutive added token '{token}' found. "
                        f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
                    )

                # Safe to call on a tokenizer fast even if token already there.
                tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))

        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
        added_tokens = tokenizer.sanitize_special_tokens()
        if added_tokens:
            logger.warning(
                "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained."
            )

        return tokenizer

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs,
    ) -> Tuple[str]:
        """
        Save the full tokenizer state.


        This method makes sure the full tokenizer can then be re-loaded using the
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.

        .. Warning::
           This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
           modifying :obj:`tokenizer.do_lower_case` after creation).

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
            legacy_format (:obj:`bool`, `optional`):
                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
                format as well as in legacy format if it exists, i.e. with tokenizer-specific vocabulary and a separate
                added_tokens file.

                If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
                with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
                be loaded in the corresponding "slow" tokenizer.

                If :obj:`True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a
                value error is raised.
            filename_prefix (:obj:`str`, `optional`):
                A prefix to add to the names of the files saved by the tokenizer.
            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to push your model to the Hugging Face model hub after saving it.

                .. warning::

                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
                    instead.

        Returns:
            A tuple of :obj:`str`: The files saved.
        """
        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            repo = self._create_or_get_repo(save_directory, **kwargs)

        os.makedirs(save_directory, exist_ok=True)

        special_tokens_map_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )

        tokenizer_config = copy.deepcopy(self.init_kwargs)
        if len(self.init_inputs) > 0:
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
        for file_id in self.vocab_files_names.keys():
            tokenizer_config.pop(file_id, None)

        # Sanitize AddedTokens
        def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
            if isinstance(obj, AddedToken):
                out = obj.__getstate__()
                if add_type_field:
                    out["__type"] = "AddedToken"
                return out
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
            return obj

        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
        tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

        # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
        tokenizer_class = self.__class__.__name__
        # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
        if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
            tokenizer_class = tokenizer_class[:-4]
        tokenizer_config["tokenizer_class"] = tokenizer_class

        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(tokenizer_config, ensure_ascii=False))
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

        # Sanitize AddedTokens in special_tokens_map
        write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(write_dict, ensure_ascii=False))
        logger.info(f"Special tokens file saved in {special_tokens_map_file}")

        file_names = (tokenizer_config_file, special_tokens_map_file)

        save_files = self._save_pretrained(
            save_directory=save_directory,
            file_names=file_names,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
        )

        if push_to_hub:
            url = self._push_to_hub(repo, commit_message=commit_message)
            logger.info(f"Tokenizer pushed to the hub in this commit: {url}")

        return save_files

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

        Fast tokenizers can also be saved in a single JSON file containing {config + vocab + added-tokens} using the
        specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
        """
        if legacy_format is False:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        save_directory = str(save_directory)

        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
        added_vocab = self.get_added_vocab()
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, ensure_ascii=False)
                f.write(out_str)
                logger.info(f"added tokens file saved in {added_tokens_file}")

        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

        return file_names + vocab_files + (added_tokens_file,)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.
            filename_prefix (:obj:`str`, `optional`):
                An optional prefix to add to the names of the saved files.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        raise NotImplementedError
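        # Hedged sketch of how a word-level subclass might implement this method; the
        # `VOCAB_FILE_NAME` constant and `self.vocab` attribute below are illustrative
        # assumptions, not part of this base class:
        #
        #   def save_vocabulary(self, save_directory, filename_prefix=None):
        #       vocab_file = os.path.join(
        #           save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILE_NAME
        #       )
        #       with open(vocab_file, "w", encoding="utf-8") as f:
        #           f.write("\n".join(self.vocab.keys()))
        #       return (vocab_file,)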
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            pair (:obj:`str`, `optional`):
                A second sequence to be encoded with the first.
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the special tokens associated with the corresponding model.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific encode method. See details in
                :meth:`~transformers.PreTrainedTokenizerBase.__call__`

        Returns:
            :obj:`List[str]`: The list of tokens.
        """
        raise NotImplementedError
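        # Usage sketch (not executed at import time; "bert-base-uncased" is just an assumed
        # example checkpoint, and the exact pieces depend on its vocabulary):
        #
        #   >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        #   >>> tokenizer.tokenize("Hello, tokenization!")
        #   # e.g. ['hello', ',', 'token', '##ization', '!']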

    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
            **kwargs: Passed along to the `.tokenize()` method.
        """,
        """
        Returns:
            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
            text.
        """,
    )
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.

        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """
        encoded_inputs = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoded_inputs["input_ids"]

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        raise NotImplementedError

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, "
                        "please use `truncation=True` to explicitly truncate examples to max length. "
                        "Defaulting to 'longest_first' truncation strategy. "
                        "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                        "more precisely by providing a specific strategy to `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                    "use `truncation=True` to truncate examples to a max length. You can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                    "maximal input size of the model (e.g. 512 for Bert). "
                    " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                    "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                    "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                    "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                f"Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs
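        # Behaviour sketch (hedged; the mappings follow the code above):
        #
        #   padding=True             -> PaddingStrategy.LONGEST
        #   padding="max_length"     -> PaddingStrategy.MAX_LENGTH (max_length falls back to model_max_length)
        #   truncation=True          -> TruncationStrategy.LONGEST_FIRST
        #   truncation="only_second" -> TruncationStrategy.ONLY_SECOND
        #   padding=False, truncation=False, max_length=512
        #                            -> truncation activated as 'longest_first' for backward compatibility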

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
        sequences.

        Args:
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        """
        # Input type checking for clearer error
        def _is_valid_text_input(t):
            if isinstance(t, str):
                # Strings are fine
                return True
            elif isinstance(t, (list, tuple)):
                # List are fine as long as they are...
                if len(t) == 0:
                    # ... empty
                    return True
                elif isinstance(t[0], str):
                    # ... list of strings
                    return True
                elif isinstance(t[0], (list, tuple)):
                    # ... list with an empty list or with a list of strings
                    return len(t[0]) == 0 or isinstance(t[0][0], str)
                else:
                    return False
            else:
                return False
        if not _is_valid_text_input(text):
            raise ValueError(
                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
2287
2288
            )

        if text_pair is not None and not _is_valid_text_input(text_pair):
            raise ValueError(
                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                "or `List[List[str]]` (batch of pretokenized examples)."
2293
            )
2294
2295
2296
2297
2298

        if is_split_into_words:
            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
        else:
            is_batched = isinstance(text, (list, tuple))

        if is_batched:
            if isinstance(text_pair, str):
                raise TypeError(
                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
                )
            if text_pair is not None and len(text) != len(text_pair):
                raise ValueError(
                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
                )
            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
            return self.batch_encode_plus(
                batch_text_or_text_pairs=batch_text_or_text_pairs,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )
        else:
            return self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )
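        # Usage sketch (hedged; checkpoint name and returned tensors are illustrative):
        #
        #   >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        #   >>> tokenizer("Hello world")                          # single sequence
        #   >>> tokenizer(["Hello world", "How are you?"],        # batch, padded to the longest
        #   ...           padding=True, return_tensors="pt")
        #   >>> tokenizer("A premise", "A hypothesis")            # pair of sequences (e.g. NLI)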

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        .. warning::
            This method is deprecated, ``__call__`` should be used instead.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        .. warning::
            This method is deprecated, ``__call__`` should be used instead.

        Args:
            batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in ``encode_plus``).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
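        # Usage sketch for sequence pairs (hedged; prefer calling the tokenizer directly via __call__):
        #
        #   >>> pairs = [("A premise", "A hypothesis"), ("Another premise", "Another hypothesis")]
        #   >>> tokenizer.batch_encode_plus(pairs, padding=True, truncation=True)
        #   # -> BatchEncoding with "input_ids" (plus "token_type_ids"/"attention_mask" per model defaults)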

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
        ``self.pad_token_id`` and ``self.pad_token_type_id``).

        .. note::

            If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
            result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
            case of PyTorch tensors, you will lose the specific device of your tensors however.

        Args:
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]]` or :obj:`List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                well as in a PyTorch Dataloader collate function.

                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
        """
        # If we have a list of dicts, let's convert it to a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method "
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if not required_input:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_available() and _is_tensorflow(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_available() and _is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

        # Convert padding_strategy to PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
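        # Usage sketch as a DataLoader collate function (hedged; `features` is assumed to be a
        # list of dicts such as {"input_ids": [...]} produced by the tokenizer):
        #
        #   >>> def collate_fn(features):
        #   ...     return tokenizer.pad(features, padding=True, return_tensors="pt")
        #   >>> loader = torch.utils.data.DataLoader(dataset, batch_size=8, collate_fn=collate_fn)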
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
        <../glossary.html#token-type-ids>`__
        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.

        Returns:
            :obj:`List[int]`: The token type ids.
        """
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
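        # Worked example (default behaviour of this base implementation):
        #
        #   create_token_type_ids_from_sequences([5, 6, 7])          -> [0, 0, 0]
        #   create_token_type_ids_from_sequences([5, 6, 7], [8, 9])  -> [0, 0, 0, 1, 1]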

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

        Args:
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.

        Returns:
            :obj:`List[int]`: The model input with special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1
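        # Hedged sketch of a typical override (BERT-style; the cls/sep token ids are assumed to
        # be defined on the subclass, this base class does not add them):
        #
        #   def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        #       cls, sep = [self.cls_token_id], [self.sep_token_id]
        #       if token_ids_1 is None:
        #           return cls + token_ids_0 + sep
        #       return cls + token_ids_0 + sep + token_ids_1 + sep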

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user-defined stride) for overflowing tokens.

        Args:
            ids (:obj:`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0

        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation_strategy,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

        # Build output dictionary
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Check lengths
        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

        # Padding
        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding_strategy.value,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )

        return batch_outputs
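        # Usage sketch (hedged; the text and returned keys are illustrative):
        #
        #   >>> ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
        #   >>> tokenizer.prepare_for_model(ids, add_special_tokens=True, return_tensors="pt")
        #   # -> BatchEncoding with "input_ids" (and "token_type_ids"/"attention_mask" per model defaults)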

    def truncate_sequences(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ) -> Tuple[List[int], List[int], List[int]]:
        """
        Truncates a sequence pair in-place following the strategy.

        Args:
            ids (:obj:`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`'longest_first'`):
                The strategy to follow for truncation. Can be:

                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            stride (:obj:`int`, `optional`, defaults to 0):
                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
            list of overflowing tokens.
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []

        if not isinstance(truncation_strategy, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation_strategy)

        overflowing_tokens = []
        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    if not overflowing_tokens:
                        window_len = min(len(ids), stride + 1)
                    else:
                        window_len = 1
                    overflowing_tokens.extend(ids[-window_len:])
                    ids = ids[:-1]
                else:
                    if not overflowing_tokens:
                        window_len = min(len(pair_ids), stride + 1)
                    else:
                        window_len = 1
                    overflowing_tokens.extend(pair_ids[-window_len:])
                    pair_ids = pair_ids[:-1]
        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
            if len(ids) > num_tokens_to_remove:
                window_len = min(len(ids), stride + num_tokens_to_remove)
                overflowing_tokens = ids[-window_len:]
                ids = ids[:-num_tokens_to_remove]
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input"
                    f"but the first sequence has a length {len(ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    f"for instance 'longest_first' or 'only_second'."
                )
        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
            if len(pair_ids) > num_tokens_to_remove:
                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
                overflowing_tokens = pair_ids[-window_len:]
                pair_ids = pair_ids[:-num_tokens_to_remove]
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input"
                    f"but the second sequence has a length {len(pair_ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    f"for instance 'longest_first' or 'only_first'."
                )

        return (ids, pair_ids, overflowing_tokens)
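        # Worked example (default 'longest_first', stride=0): tokens are removed one at a time
        # from the end of whichever sequence is currently longer.
        #
        #   truncate_sequences([1, 2, 3, 4], pair_ids=[7, 8], num_tokens_to_remove=2)
        #   -> ([1, 2], [7, 8], [4, 3])   # truncated ids, truncated pair_ids, overflowing tokens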

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.
                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
2987
2988
2989
2990
2991
2992
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
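
        Example (an illustrative sketch, assuming ``self.padding_side == "right"``, ``pad_token_id == 0`` and
        ``"input_ids"`` as the first model input name; the ids themselves are placeholders)::

            # before: {"input_ids": [101, 7592, 102]}
            # after _pad(..., max_length=5, padding_strategy=PaddingStrategy.MAX_LENGTH):
            #   {"input_ids": [101, 7592, 102, 0, 0], "attention_mask": [1, 1, 1, 0, 0]}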
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
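        # `max_length` is rounded up to the next multiple of `pad_to_multiple_of` (e.g. 13 -> 16 for a multiple of 8)
        # so that the padded length stays compatible with hardware such as Tensor Cores.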

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens into a single string. The simplest way to do it is ``" ".join(tokens)``, but we
        often want to remove sub-word tokenization artifacts at the same time.

        Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.

        Returns:
            :obj:`str`: The joined tokens.
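
        Example (a sketch assuming a WordPiece-style tokenizer such as BERT's; the exact joining rules depend on the
        subclass)::

            tokenizer.convert_tokens_to_string(["my", "dog", "##gy"])
            # -> "my doggy"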
        """
        raise NotImplementedError

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
3056
            sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
Sylvain Gugger's avatar
Sylvain Gugger committed
3057
3058
3059
3060
3061
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
3062
3063
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific decode method.
Sylvain Gugger's avatar
Sylvain Gugger committed
3064
3065
3066

        Returns:
            :obj:`List[str]`: The list of decoded sentences.
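
        Example (an illustrative sketch; the checkpoint name is a placeholder and the exact strings depend on the
        tokenizer)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            batch = tokenizer(["Hello world", "How are you?"])
            tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
            # -> ["hello world", "how are you?"]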
        """
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]

    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        """
Sylvain Gugger's avatar
Sylvain Gugger committed
3086
3087
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.
Sylvain Gugger's avatar
Sylvain Gugger committed
3088

3089
3090
3091
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
3092
            token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
Sylvain Gugger's avatar
Sylvain Gugger committed
3093
3094
3095
3096
3097
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
3098
3099
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific decode method.
Sylvain Gugger's avatar
Sylvain Gugger committed
3100
3101
3102

        Returns:
            :obj:`str`: The decoded sentence.
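
        Example (an illustrative sketch; the checkpoint name is a placeholder and the exact output depends on the
        tokenizer)::

            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            ids = tokenizer("Hello world")["input_ids"]
            tokenizer.decode(ids)                            # e.g. "[CLS] hello world [SEP]"
            tokenizer.decode(ids, skip_special_tokens=True)  # e.g. "hello world"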
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)

        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        raise NotImplementedError

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
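
        Example (illustrative, for a BERT-like tokenizer where ``[CLS]`` and ``[SEP]`` are special tokens)::

            ids = tokenizer("Hello", add_special_tokens=True)["input_ids"]  # e.g. [CLS] hello [SEP]
            tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
            # -> [1, 0, 1]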
        """
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument."
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """
        Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.

        Args:
            out_string (:obj:`str`): The text to clean up.

        Returns:
            :obj:`str`: The cleaned-up string.
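
        Example (the replacements applied are exactly the ones listed in the method body)::

            tokenizer.clean_up_tokenization("Do n't worry , it 's fine .")
            # -> "Don't worry, it's fine."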
        """
        out_string = (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )
        return out_string

    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        """
        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
        corresponding model.

        Args:
            ids (:obj:`List[int]`): The ids produced by the tokenization
            max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set)
            verbose (:obj:`bool`): Whether or not to print more information and warnings.

        """
        if max_length is None and len(ids) > self.model_max_length and verbose:
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors"
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
        sequence-to-sequence models that need a slightly different processing for the labels.
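
        Example (a sketch for a sequence-to-sequence tokenizer; ``src_texts`` and ``tgt_texts`` are lists of strings)::

            model_inputs = tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(tgt_texts, padding=True, truncation=True, return_tensors="pt")
            model_inputs["labels"] = labels["input_ids"]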
        """
        yield

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: str = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepare model inputs for translation. For best performance, translate one sentence at a time.

        Arguments:
            src_texts (:obj:`List[str]`):
                List of documents to summarize or source language texts.
            tgt_texts (:obj:`list`, `optional`):
                List of summaries or target language texts.
            max_length (:obj:`int`, `optional`):
                Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet), truncation/padding to a maximum length will be deactivated.
            max_target_length (:obj:`int`, `optional`):
                Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
                set to :obj:`None`, this will use the ``max_length`` value.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`'longest'`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
                  only a single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
                  if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`False` or :obj:`'do_not_truncate'`: No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            **kwargs:
                Additional keyword arguments passed along to :obj:`self.__call__`.

        Return:
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

            - **input_ids** -- List of token ids to be fed to the encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

            The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is passed.
            Otherwise, ``input_ids`` and ``attention_mask`` will be the only keys.
        """
        # docstyle-ignore
        formatted_warning = """
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
"""
        warnings.warn(formatted_warning, FutureWarning)
        # mBART-specific kwargs that should be ignored by other models.
        kwargs.pop("src_lang", None)
        kwargs.pop("tgt_lang", None)
        if max_length is None:
            max_length = self.model_max_length
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs
        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        with self.as_target_tokenizer():
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs