# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
17
fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
Sylvain Gugger's avatar
Sylvain Gugger committed
18
of output with special method for the Fast tokenizers)
19
20
21
22
23
24
"""

import copy
import json
import os
import warnings
from collections import OrderedDict, UserDict
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np

import requests

from .file_utils import (
    ExplicitEnum,
    PaddingStrategy,
    TensorType,
    _is_jax,
    _is_numpy,
    _is_tensorflow,
    _is_torch,
    _is_torch_device,
    add_end_docstrings,
    cached_path,
    hf_bucket_url,
    is_flax_available,
    is_offline_mode,
    is_remote_url,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    to_py_obj,
    torch_required,
)
from .utils import logging


if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf
    if is_flax_available():
        import jax.numpy as jnp  # noqa: F401


if is_tokenizers_available():
    from tokenizers import AddedToken
    from tokenizers import Encoding as EncodingFast
else:

    @dataclass(frozen=True, eq=True)
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
        way it should behave.
        """

        content: str = field(default_factory=str)
        single_word: bool = False
        lstrip: bool = False
        rstrip: bool = False
        normalized: bool = True

        def __getstate__(self):
            return self.__dict__

    @dataclass
    class EncodingFast:
        """ This is dummy class because without the `tokenizers` library we don't have these objects anyway """

        pass


logger = logging.get_logger(__name__)

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]


# Slow tokenizers used to be saved in three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"


class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"


class CharSpan(NamedTuple):
    """
    Character span in the original string.

    Args:
        start (:obj:`int`): Index of the first character in the original string.
        end (:obj:`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int


class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    Args:
        start (:obj:`int`): Index of the first token in the span.
        end (:obj:`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int


class BatchEncoding(UserDict):
    """
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
    :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus` methods (tokens,
    attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
    utility methods to map from word/character space to token space.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
            'attention_mask', etc.).
        encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
            space to token space, the :obj:`tokenizers.Encoding` instance or list of instances (for batches) holds this
            information.
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
        n_sequences (:obj:`Optional[int]`, `optional`):
            The number of sequences used to generate each sample from the batch encoded in this
            :class:`~transformers.BatchEncoding`. Currently can be :obj:`None` (unknown), :obj:`1` (a single sentence)
            or :obj:`2` (a pair of sentences).
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        super().__init__(data)

        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        self._encodings = encoding

        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        self._n_sequences = n_sequences

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def n_sequences(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
        sentence) or :obj:`2` (a pair of sentences)
        """
        return self._n_sequences

    @property
    def is_fast(self) -> bool:
        """
        :obj:`bool`: Indicates whether this :class:`~transformers.BatchEncoding` was generated from the result of a
        :class:`~transformers.PreTrainedTokenizerFast` or not.
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError

    def __getstate__(self):
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns
        :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
        integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[str]`: The list of tokens at that index.
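
        Example (a minimal sketch of typical usage, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world!")
            # Sub-word tokens at batch index 0, including the special tokens added by the model
            encoding.tokens()  # e.g. ['[CLS]', 'hello', 'world', '!', '[SEP]']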
        """
        if not self._encodings:
            raise ValueError("tokens() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].tokens

    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to the id of their original sentences:

            - :obj:`None` for special tokens added around or between sequences,
            - :obj:`0` for tokens corresponding to words in the first sequence,
            - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
            added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
            corresponding sequence.
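
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("How are you?", "I am fine.")
            # Special tokens map to None, the other tokens to 0 or 1 depending on the sequence they come from,
            # e.g. [None, 0, 0, 0, 0, None, 1, 1, 1, 1, None] (the exact length depends on the tokenization)
            encoding.sequence_ids()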
        """
        if not self._encodings:
            raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].sequence_ids

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
            word (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python-based tokenizers")
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        return self.word_ids(batch_index)

    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

        Returns:
            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
            word (several tokens will be mapped to the same word index if they are parts of that word).
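
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Tokenization splits words")
            # Special tokens map to None; sub-word pieces of the same word share the same index,
            # e.g. [None, 0, 0, 0, 1, 2, None] (the exact split depends on the vocabulary)
            encoding.word_ids()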
        """
        if not self._encodings:
            raise ValueError("word_ids() is not available when using Python-based tokenizers")
        return self._encodings[batch_index].word_ids

    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token. In the general use case, this method returns
        :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Can be called as:

        - ``self.token_to_sequence(token_index)`` if batch size is 1
        - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
                sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
        """

        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_sequence(token_index)

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding to (i.e. comprising) an encoded token in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
Sylvain Gugger's avatar
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
407
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
                sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
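
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Tokenizers are fast")
            # Index of the word comprising the second token of the only sequence in the batch
            encoding.token_to_word(1)  # e.g. 0 when token 1 is a piece of the first word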
        """

        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
          to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the
                pair (0 or 1) the provided word index belongs to.

        Returns:
            Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
            Returns :obj:`None` if no tokens correspond to the word.
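
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Tokenizers are fast")
            # TokenSpan covering all the sub-word tokens of the first word
            span = encoding.word_to_tokens(0)
            if span is not None:
                tokens_of_first_word = encoding.tokens()[span.start : span.end]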
        """

        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if word_index < 0:
            word_index = self._seq_len + word_index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
        return TokenSpan(*span) if span is not None else None

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - ``self.token_to_chars(token_index)`` if batch size is 1
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
                the sequence.

        Returns:
            :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
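
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            text = "Tokenizers are fast"
            encoding = tokenizer(text)
            # Characters of the original string covered by the second token (index 0 is usually a special token)
            span = encoding.token_to_chars(1)
            text[span.start : span.end]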
        """

        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the
                pair (0 or 1) the provided character index belongs to.


        Returns:
            :obj:`int`: Index of the token.
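
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Tokenizers are fast")
            # Index of the token covering the character at position 3 of the original string
            encoding.char_to_token(3)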
        """

        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)

    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - ``self.word_to_chars(word_index)`` if batch size is 1
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
                sequence.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the
                pair (0 or 1) the provided word index belongs to.

        Returns:
            :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
            CharSpan are NamedTuple with:

                - start: index of the first character associated to the token in the original string
                - end: index of the character following the last character associated to the token in the original
                  string
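
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            text = "Tokenizers are fast"
            encoding = tokenizer(text)
            # Characters of the original string covered by the second word
            span = encoding.word_to_chars(1)
            text[span.start : span.end]  # e.g. 'are'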
        """

        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of a sequence of the
        batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
                original string.
            sequence_index (:obj:`int`, `optional`, defaults to 0):
                If a pair of sequences is encoded in the batch, this can be used to specify which sequence in the
                pair (0 or 1) the provided character index belongs to.


        Returns:
            :obj:`int` or :obj:`List[int]`: Index or indices of the corresponding word(s) in the original string.
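
        Example (a minimal sketch, assuming the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Tokenizers are fast")
            # Index of the word containing the character at position 0 of the original string
            encoding.char_to_word(0)  # e.g. 0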
        """

        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index
        return self._encodings[batch_index].char_to_word(char_index, sequence_index)

    def convert_to_tensors(
        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
                :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
            prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the batch dimension during the conversion.
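
        Example (a minimal sketch, assuming PyTorch is installed and the ``bert-base-uncased`` checkpoint can be
        downloaded)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world")
            # Convert the Python lists to PyTorch tensors, adding a leading batch dimension
            encoding.convert_to_tensors(tensor_type="pt", prepend_batch_axis=True)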
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW:
            if not is_tf_available():
                raise ImportError(
                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
                )
            import tensorflow as tf

            as_tensor = tf.constant
            is_tensor = tf.is_tensor
        elif tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        elif tensor_type == TensorType.JAX:
            if not is_flax_available():
                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
            import jax.numpy as jnp  # noqa: F811

            as_tensor = jnp.array
            is_tensor = _is_jax
        else:
            as_tensor = np.asarray
            is_tensor = _is_numpy
        # (mfuntowicz: This code is unreachable)
        # else:
        #     raise ImportError(
        #         f"Unable to convert output to tensors format {tensor_type}"
        #     )

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)

                    # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
                    # # at-least2d
                    # if tensor.ndim > 2:
                    #     tensor = tensor.squeeze(0)
                    # elif tensor.ndim < 2:
                    #     tensor = tensor[None, :]

                    self[key] = tensor
            except:  # noqa E722
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    )
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                )

        return self

    @torch_required
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).

        Args:
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.

        Returns:
            :class:`~transformers.BatchEncoding`: The same instance after modification.
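
        Example (a minimal sketch, assuming PyTorch, a CUDA device and the ``bert-base-uncased`` checkpoint are
        available)::

            from transformers import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            encoding = tokenizer("Hello world", return_tensors="pt")
            # Move every tensor in the batch to the GPU in one call
            encoding = encoding.to("cuda")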
        """

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class SpecialTokensMixin:
    """
    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
    handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
    used to directly access these special tokens in a model-independent manner and allows setting and updating the special
    tokens.

    Args:
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens.
    """

    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]

    def __init__(self, verbose=True, **kwargs):
        self._bos_token = None
        self._eos_token = None
        self._unk_token = None
        self._sep_token = None
        self._pad_token = None
        self._cls_token = None
        self._mask_token = None
        self._pad_token_type_id = 0
        self._additional_special_tokens = []
        self.verbose = verbose

        # We directly set the hidden value to allow initialization with special tokens
        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
        # TODO clean this up at some point (probably by switching to fast tokenizers)
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(isinstance(t, str) for t in value), "One of the tokens is not a string"
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}")

    def sanitize_special_tokens(self) -> int:
        """
        Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
        :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.

        Add the missing ones to the vocabulary if needed.

        Return:
            :obj:`int`: The number of tokens added in the vocabulary during the operation.
        """
        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
        """
        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
        current vocabulary).

        .. Note::
            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
            the model so that its embedding matrix matches the tokenizer.

            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.

        Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:

        - Special tokens are carefully handled by the tokenizer (they are never split).
        - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This
          makes it easy to develop model-agnostic training and fine-tuning scripts.

        When possible, special tokens are already registered for provided pretrained models (for instance
        :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj:`'[CLS]'` and XLM's one
        is also registered to be :obj:`'</s>'`).

        Args:
            special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
                Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
                ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
                ``additional_special_tokens``].

                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                assigns the index of the ``unk_token`` to them).

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to add a new classification token to GPT-2
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            model = GPT2Model.from_pretrained('gpt2')

            special_tokens_dict = {'cls_token': '<CLS>'}

            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))

            assert tokenizer.cls_token == '<CLS>'
        """
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

            if self.verbose:
                logger.info(f"Assigning {value} to the {key} key of the tokenizer")
            setattr(self, key, value)

            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, (str, AddedToken)) for t in value
                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
                added_tokens += self.add_tokens(value, special_tokens=True)
            else:
                assert isinstance(
                    value, (str, AddedToken)
                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
                added_tokens += self.add_tokens([value], special_tokens=True)

        return added_tokens

    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from the length of the current vocabulary.

        .. Note::
            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
            the model so that its embedding matrix matches the tokenizer.

            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.

        Args:
            new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
                string token to let you personalize its behavior: whether this token should only match against a single
                word, whether this token should strip all potential whitespaces on the left side, whether this token
                should strip all potential whitespaces on the right side, etc.
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library.

        Returns:
            :obj:`int`: Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        if not new_tokens:
            return 0

        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]

        return self._add_tokens(new_tokens, special_tokens=special_tokens)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        raise NotImplementedError

    @property
    def bos_token(self) -> str:
        """
        :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        if self._bos_token is None and self.verbose:
            logger.error("Using bos_token, but it is not set yet.")
            return None
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        :obj:`str`: End of sentence token. Log an error if used while not having been set.
        """
        if self._eos_token is None and self.verbose:
            logger.error("Using eos_token, but it is not set yet.")
            return None
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        :obj:`str`: Unknown token. Log an error if used while not having been set.
        """
        if self._unk_token is None and self.verbose:
            logger.error("Using unk_token, but it is not set yet.")
            return None
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
        not having been set.
        """
        if self._sep_token is None and self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
            return None
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        :obj:`str`: Padding token. Log an error if used while not having been set.
        """
        if self._pad_token is None and self.verbose:
            logger.error("Using pad_token, but it is not set yet.")
            return None
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
        full depth of the model. Log an error if used while not having been set.
        """
        if self._cls_token is None and self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
            return None
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
        not having been set.
        """
        if self._mask_token is None and self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
        been set.
        """
        if self._additional_special_tokens is None and self.verbose:
            logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        return [str(tok) for tok in self._additional_special_tokens]

    @bos_token.setter
    def bos_token(self, value):
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value

    @property
    def bos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token
        has not been set.
        """
        if self._bos_token is None:
            return None
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._eos_token is None:
            return None
        return self.convert_tokens_to_ids(self.eos_token)

    @property
    def unk_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been
        set.
        """
        if self._unk_token is None:
            return None
        return self.convert_tokens_to_ids(self.unk_token)

    @property
    def sep_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns :obj:`None` if the token has not been set.
        """
        if self._sep_token is None:
            return None
        return self.convert_tokens_to_ids(self.sep_token)

    @property
    def pad_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
        set.
        """
        if self._pad_token is None:
            return None
        return self.convert_tokens_to_ids(self.pad_token)

    @property
    def pad_token_type_id(self) -> int:
        """
        :obj:`int`: Id of the padding token type in the vocabulary.
        """
        return self._pad_token_type_id

    @property
    def cls_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
        sequence leveraging self-attention along the full depth of the model.

        Returns :obj:`None` if the token has not been set.
        """
1126
1127
        if self._cls_token is None:
            return None
1128
1129
1130
        return self.convert_tokens_to_ids(self.cls_token)

    @property
    def mask_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns :obj:`None` if the token has not been set.
        """
        if self._mask_token is None:
            return None
        return self.convert_tokens_to_ids(self.mask_token)

    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
        having been set.
        """
        return self.convert_tokens_to_ids(self.additional_special_tokens)

    # Setting a `*_token_id` stores the corresponding token: the provided id is converted back to its token so that
    # the matching `*_token` and `*_token_id` properties stay consistent.
    @bos_token_id.setter
    def bos_token_id(self, value):
        self._bos_token = self.convert_ids_to_tokens(value)

    @eos_token_id.setter
    def eos_token_id(self, value):
        self._eos_token = self.convert_ids_to_tokens(value)

    @unk_token_id.setter
    def unk_token_id(self, value):
        self._unk_token = self.convert_ids_to_tokens(value)

    @sep_token_id.setter
    def sep_token_id(self, value):
        self._sep_token = self.convert_ids_to_tokens(value)

    @pad_token_id.setter
    def pad_token_id(self, value):
        self._pad_token = self.convert_ids_to_tokens(value)

    @cls_token_id.setter
    def cls_token_id(self, value):
        self._cls_token = self.convert_ids_to_tokens(value)

    @mask_token_id.setter
    def mask_token_id(self, value):
        self._mask_token = self.convert_ids_to_tokens(value)

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]

    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
        :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
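
        Example (an illustrative sketch; the exact tokens depend on the loaded checkpoint)::

            # A derived class is used because the base class cannot be instantiated directly
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            special_map = tokenizer.special_tokens_map
            # e.g. special_map['unk_token'] is the string form of the unknown token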
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = str(attr_value)
        return set_attr

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
        mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
        how special tokens are tokenized.
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self) -> List[str]:
        """
        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of :obj:`tokenizers.AddedToken` type to string.
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.)
        mapped to class attributes.

        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
        how special tokens are tokenized.
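
        Example (an illustrative sketch; assumes the `tokenizers` library is installed so :obj:`AddedToken` can be
        imported)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            # Registering a token as an AddedToken keeps its stripping/normalization options
            tokenizer.add_special_tokens({'mask_token': AddedToken('<mask>', lstrip=True)})
            assert '<mask>' in tokenizer.all_special_tokens
            # all_special_tokens_extended still exposes the AddedToken object itself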
        """
        all_toks = []
        set_attr = self.special_tokens_map_extended
        for attr_value in set_attr.values():
            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(OrderedDict.fromkeys(all_toks))
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        """
        :obj:`List[int]`: List the ids of the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
        attributes.
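
        Example (an illustrative sketch)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            # The ids line up with ``all_special_tokens``
            assert tokenizer.all_special_ids == tokenizer.convert_tokens_to_ids(tokenizer.all_special_tokens)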
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids


ENCODE_KWARGS_DOCSTRING = r"""
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to encode the sequences with the special tokens relative to their model.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
                  if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
                  sequence lengths greater than the model maximum admissible input size).
            max_length (:obj:`int`, `optional`):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            stride (:obj:`int`, `optional`, defaults to 0):
                If set to a number along with :obj:`max_length`, the overflowing tokens returned when
                :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                returned to provide some overlap between truncated and overflowing sequences. The value of this
                argument defines the number of overlapping tokens.
            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the input is already pre-tokenized (e.g., split into words), in which case the tokenizer
                will skip the pre-tokenization step. This is useful for NER or token classification.
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
"""

ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (:obj:`bool`, `optional`):
                Whether to return token type IDs. If left to the default, will return the token type IDs according to
                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are token type IDs? <../glossary.html#token-type-ids>`__
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return overflowing token sequences.
            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return special tokens mask information.
            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return :obj:`(char_start, char_end)` for each token.

                This is only available on fast tokenizers inheriting from
                :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
                :obj:`NotImplementedError`.
            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return the lengths of the encoded inputs.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
            **kwargs: passed to the :obj:`self.tokenize()` method

        Return:
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.

              `What are input IDs? <../glossary.html#input-ids>`__

            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
              or if `"token_type_ids"` is in :obj:`self.model_input_names`).

              `What are token type IDs? <../glossary.html#token-type-ids>`__

            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).

              `What are attention masks? <../glossary.html#attention-mask>`__

            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
              :obj:`return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
              :obj:`return_overflowing_tokens=True`).
            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
            - **length** -- The length of the inputs (when :obj:`return_length=True`)
"""

INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)

        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
          each vocabulary file required by the model, and as associated values, the filename for saving the associated
          file (string).
        - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
          low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
          :obj:`url` to the associated pretrained vocabulary file.
        - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
          inputs of this model, or :obj:`None` if the model has no maximum input size.
        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
          to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
          tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
          method.
        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
          applied. Should be :obj:`'right'` or :obj:`'left'`.

    Args:
        model_max_length (:obj:`int`, `optional`):
            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
            loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
            will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
            value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
        padding_side (:obj:`str`, `optional`):
            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
            Default value is picked from the class attribute of the same name.
        model_input_names (:obj:`List[string]`, `optional`):
            The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
            :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
            ``self.bos_token_id``.
        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
            ``self.eos_token_id``.
        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
            ``self.unk_token_id``.
        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token separating two different sentences in the same input (used by BERT for instance). Will be
            associated to ``self.sep_token`` and ``self.sep_token_id``.
        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
            ``self.pad_token_id``.
        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing the class of the input (used by BERT for instance). Will be associated to
            ``self.cls_token`` and ``self.cls_token_id``.
        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
            A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
            tokenization process. Will be associated to ``self.additional_special_tokens`` and
            ``self.additional_special_tokens_ids``.
"""


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin):
    """
    Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.

    Handles shared (mostly boilerplate) methods for those two classes.
    """

    vocab_files_names: Dict[str, str] = {}
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    max_model_input_sizes: Dict[str, Optional[int]] = {}

    # first name has to correspond to main model input name
    # to make sure `tokenizer.pad(...)` works correctly
    model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
    padding_side: str = "right"
    slow_tokenizer_class = None

    def __init__(self, **kwargs):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
        self.name_or_path = kwargs.pop("name_or_path", "")

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        assert self.padding_side in [
            "right",
            "left",
        ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

        self.deprecation_warnings = (
            {}
        )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).

        super().__init__(**kwargs)

    @property
    def max_len_single_sentence(self) -> int:
        """
        :obj:`int`: The maximum length of a sentence that can be fed to the model.
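
        For instance, a tokenizer with a :obj:`model_max_length` of 512 that adds two special tokens to a single
        sequence has a :obj:`max_len_single_sentence` of 510.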
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)

    @property
    def max_len_sentences_pair(self) -> int:
        """
        :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)

    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
            if not self.deprecation_warnings.get("max_len_single_sentence", False):
                logger.warning(
                    "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
                )
            self.deprecation_warnings["max_len_single_sentence"] = True
        else:
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
            )

    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
                logger.warning(
                    "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
                )
            self.deprecation_warnings["max_len_sentences_pair"] = True
        else:
            raise ValueError(
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
            )

    def __repr__(self) -> str:
        return (
            f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
            f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
            f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
        )

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
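
        Example (an illustrative sketch; a derived class is used because this base method is abstract)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            vocab = tokenizer.get_vocab()
            assert vocab[tokenizer.unk_token] == tokenizer.unk_token_id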
        """
        raise NotImplementedError()

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        r"""
        Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
        a predefined tokenizer.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                Can be either:

                - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a
                  user or organization name, like ``dbmdz/bert-base-german-cased``.
                - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
                  using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
                  method, e.g., ``./my_model_directory/``.
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                  ``./my_model_directory/vocab.txt``.
            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
                Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to force the (re-)download of the vocabulary files and override the cached versions if
                they exist.
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            use_auth_token (:obj:`str` or `bool`, `optional`):
                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
            revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
                identifier allowed by git.
            subfolder (:obj:`str`, `optional`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__`` method.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.

        .. note::

            Passing :obj:`use_auth_token=True` is required when you want to use a private model.

        Examples::

            # We can't directly instantiate the base class `PreTrainedTokenizerBase`, so the examples below use a derived class: BertTokenizer
            # Download vocabulary from huggingface.co and cache.
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

            # Download vocabulary from huggingface.co (user-uploaded) and cache.
            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

            # If the tokenizer uses a single vocabulary file, you can point directly to this file
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

            # You can link tokens to special vocabulary when instantiating
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
            # You should be sure '<unk>' is in the vocabulary when doing that.
            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
            assert tokenizer.unk_token == '<unk>'

        """
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        resume_download = kwargs.pop("resume_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", None)
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)

        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__}
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline

        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        vocab_files = {}
        init_configuration = {}

        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
                )
            warnings.warn(
                f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
                "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
                FutureWarning,
            )
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                "tokenizer_file": FULL_TOKENIZER_FILE,
            }
            # Look for the tokenizer files
            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    if subfolder is not None:
                        full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
                    else:
                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
                        logger.info(f"Didn't find file {full_file_name}. We won't load it.")
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
                        pretrained_model_name_or_path,
                        filename=file_name,
                        subfolder=subfolder,
                        revision=revision,
                        mirror=None,
                    )

                vocab_files[file_id] = full_file_name

        # Get files from url, cache, or disk depending on the case
        resolved_vocab_files = {}
        unresolved_files = []
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                try:
                    resolved_vocab_files[file_id] = cached_path(
                        file_path,
                        cache_dir=cache_dir,
                        force_download=force_download,
                        proxies=proxies,
                        resume_download=resume_download,
                        local_files_only=local_files_only,
                        use_auth_token=use_auth_token,
                        user_agent=user_agent,
                    )

                except FileNotFoundError as error:
                    if local_files_only:
                        unresolved_files.append(file_id)
                    else:
                        raise error

                except requests.exceptions.HTTPError as err:
                    if "404 Client Error" in str(err):
                        logger.debug(err)
                        resolved_vocab_files[file_id] = None
                    else:
                        raise err

        if len(unresolved_files) > 0:
            logger.info(
                f"Can't load following files from cache: {unresolved_files} and cannot check if these "
                "files are necessary for the tokenizer to operate."
            )

        if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
            msg = (
                f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
                f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
            )
            raise EnvironmentError(msg)

        for file_id, file_path in vocab_files.items():
            if file_id not in resolved_vocab_files:
                continue

            if file_path == resolved_vocab_files[file_id]:
                logger.info(f"loading file {file_path}")
            else:
                logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")

        return cls._from_pretrained(
            resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
        )

    @classmethod
    def _from_pretrained(
        cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
    ):
        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
        # file or if `from_slow` is set to True.
        from_slow = kwargs.get("from_slow", False)
        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
                copy.deepcopy(init_configuration),
                *init_inputs,
                **(copy.deepcopy(kwargs)),
            )
        else:
            slow_tokenizer = None

        # Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
        else:
            init_kwargs = init_configuration

        # Update with newly provided kwargs
        init_kwargs.update(kwargs)

        # Convert AddedTokens serialized as dict to class instances
        def convert_added_tokens(obj: Union[AddedToken, Any]):
            if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
                obj.pop("__type")
                return AddedToken(**obj)
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v) for k, v in obj.items()}
            return obj

        init_kwargs = convert_added_tokens(init_kwargs)

        # Set max length if needed
        if pretrained_model_name_or_path in cls.max_model_input_sizes:
            # if we're using a pretrained model, ensure the tokenizer
            # won't index sequences longer than the number of positional embeddings
            model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
            if model_max_length is not None and isinstance(model_max_length, (int, float)):
                init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

        # Merge resolved_vocab_files arguments in init_kwargs.
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer

        init_kwargs["name_or_path"] = pretrained_model_name_or_path

        # Instantiate tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
        except OSError:
            raise OSError(
                "Unable to load vocabulary from file. "
                "Please check that the provided vocabulary is accessible and not corrupted."
            )

        # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
        # Removed: Now done at the base class level
        # tokenizer.init_inputs = init_inputs
        # tokenizer.init_kwargs = init_kwargs

        # If there is a complementary special token map, load it
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
            for key, value in special_tokens_map.items():
                if isinstance(value, dict):
                    value = AddedToken(**value)
                elif isinstance(value, list):
                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                setattr(tokenizer, key, value)

        # Add supplementary tokens.
        special_tokens = tokenizer.all_special_tokens
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                added_tok_encoder = json.load(added_tokens_handle)

            # Sort added tokens by index
            added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))

            for token, index in added_tok_encoder_sorted:
                assert index == len(tokenizer), (
                    f"Non-consecutive added token '{token}' found. "
                    f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
                )
                tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))

        # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
        added_tokens = tokenizer.sanitize_special_tokens()
        if added_tokens:
            logger.warning(
                "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained."
            )

        return tokenizer

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save the full tokenizer state.

        This method makes sure the full tokenizer can then be re-loaded using the
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.

        .. Note::
            A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
            not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
            instance. It can only be loaded in a "fast" tokenizer, i.e. in a
            :class:`transformers.PreTrainedTokenizerFast` instance.

        .. Warning::
           This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
           modifying :obj:`tokenizer.do_lower_case` after creation).

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
            legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
                separate added_tokens file or in the unified JSON file format for the `tokenizers` library. It's only
                possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
                "slow" tokenizers (not powered by the `tokenizers` library).
            filename_prefix: (:obj:`str`, `optional`):
                A prefix to add to the names of the files saved by the tokenizer.

        Returns:
            A tuple of :obj:`str`: The files saved.
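
        Example (an illustrative save/reload round trip; the target directory is arbitrary)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenizer.save_pretrained('./my_tokenizer/')
            reloaded_tokenizer = BertTokenizer.from_pretrained('./my_tokenizer/')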
        """
        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return
        os.makedirs(save_directory, exist_ok=True)

        special_tokens_map_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )

        tokenizer_config = copy.deepcopy(self.init_kwargs)
        if len(self.init_inputs) > 0:
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
        for file_id in self.vocab_files_names.keys():
            tokenizer_config.pop(file_id, None)

        # Sanitize AddedTokens
        def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
            if isinstance(obj, AddedToken):
                out = obj.__getstate__()
                if add_type_field:
                    out["__type"] = "AddedToken"
                return out
            elif isinstance(obj, (list, tuple)):
                return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
            elif isinstance(obj, dict):
                return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
            return obj

        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
        tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)
        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(tokenizer_config, ensure_ascii=False))
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

        # Sanitize AddedTokens in special_tokens_map
        write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(write_dict, ensure_ascii=False))
        logger.info(f"Special tokens file saved in {special_tokens_map_file}")

        file_names = (tokenizer_config_file, special_tokens_map_file)

        return self._save_pretrained(
            save_directory=save_directory,
            file_names=file_names,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
        )

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
        specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` method.
        """
        if not legacy_format:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        save_directory = str(save_directory)

        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
        added_vocab = self.get_added_vocab()
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, ensure_ascii=False)
                f.write(out_str)
                logger.info(f"added tokens file saved in {added_tokens_file}")

        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

        return file_names + vocab_files + (added_tokens_file,)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.
            filename_prefix (:obj:`str`, `optional`):
                An optional prefix to add to the names of the saved files.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        raise NotImplementedError

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            pair (:obj:`str`, `optional`):
                A second sequence to be encoded with the first.
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to add the special tokens associated with the corresponding model.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific encode method. See details in
                :meth:`~transformers.PreTrainedTokenizerBase.__call__`

        Returns:
            :obj:`List[str]`: The list of tokens.
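
        Example (an illustrative sketch; the exact sub-tokens depend on the vocabulary of the derived tokenizer)::

            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokens = tokenizer.tokenize("Hello world!")
            # for an uncased WordPiece vocabulary this is something like ['hello', 'world', '!']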
        """
        raise NotImplementedError

    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
            **kwargs: Passed along to the `.tokenize()` method.
        """,
        """
        Returns:
            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
            text.
        """,
    )
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.

        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
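
        Example (a minimal sketch on a concrete subclass; assumes the ``bert-base-uncased`` checkpoint can be
        downloaded -- the returned ids depend on that checkpoint's vocabulary)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            ids = tokenizer.encode("Hello world")
            # `ids` is a plain Python list of ints; for BERT it starts/ends with the [CLS]/[SEP] ids
            # because `add_special_tokens` defaults to `True`
            text = tokenizer.decode(ids)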
        """
        encoded_inputs = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoded_inputs["input_ids"]

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        raise NotImplementedError

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, "
                        "please use `truncation=True` to explicitly truncate examples to max length. "
                        "Defaulting to 'longest_first' truncation strategy. "
                        "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                        "more precisely by providing a specific strategy to `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                    "use `truncation=True` to truncate examples to a max length. You can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                    "maximal input size of the model (e.g. 512 for Bert). "
                    " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                    "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                    "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                    "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                f"Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
        sequences.

        Args:
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
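
        Example (a minimal sketch; assumes the ``bert-base-uncased`` checkpoint can be downloaded and that PyTorch is
        installed for the ``return_tensors="pt"`` call)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

            # single sequence -> dict-like BatchEncoding with `input_ids`, `token_type_ids` and `attention_mask`
            encoding = tokenizer("Hello world")

            # batch of sequence pairs, padded/truncated to a common length and returned as PyTorch tensors
            batch = tokenizer(
                ["How old are you?", "What is your name?"],
                ["I'm 6 years old", "My name is Clara"],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )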
        """
        # Input type checking for clearer error
        assert isinstance(text, str) or (
            isinstance(text, (list, tuple))
            and (
                len(text) == 0
                or (
                    isinstance(text[0], str)
                    or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
                )
            )
        ), (
            "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
            "or `List[List[str]]` (batch of pretokenized examples)."
        )

        assert (
            text_pair is None
            or isinstance(text_pair, str)
            or (
                isinstance(text_pair, (list, tuple))
                and (
                    len(text_pair) == 0
                    or (
                        isinstance(text_pair[0], str)
                        or (
                            isinstance(text_pair[0], (list, tuple))
                            and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
                        )
                    )
                )
            )
        ), (
            "text_pair input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
            "or `List[List[str]]` (batch of pretokenized examples)."
        )

        is_batched = bool(
            (not is_split_into_words and isinstance(text, (list, tuple)))
            or (
                is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
            )
        )

        if is_batched:
            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
            return self.batch_encode_plus(
                batch_text_or_text_pairs=batch_text_or_text_pairs,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )
        else:
            return self.encode_plus(
                text=text,
                text_pair=text_pair,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                is_split_into_words=is_split_into_words,
                pad_to_multiple_of=pad_to_multiple_of,
                return_tensors=return_tensors,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
                **kwargs,
            )

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        .. warning::
            This method is deprecated, ``__call__`` should be used instead.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        .. warning::
            This method is deprecated, ``__call__`` should be used instead.

        Args:
            batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in ``encode_plus``).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        raise NotImplementedError

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        The padding side (left/right) and the padding token ids are defined at the tokenizer level (with
        ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``).

        .. note::

            If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
            result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
            case of PyTorch tensors, you will lose the specific device of your tensors however.

        Args:
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                well as in a PyTorch Dataloader collate function.

                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
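
        Example (a minimal sketch of using :meth:`pad` as a collate function; assumes the ``bert-base-uncased``
        checkpoint can be downloaded and that PyTorch is installed)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

            # a "batch" of two encodings of different lengths, e.g. coming out of a Dataset
            features = [tokenizer("Hello world"), tokenizer("A slightly longer sentence to pad against")]

            batch = tokenizer.pad(features, padding=True, return_tensors="pt")
            # batch["input_ids"] and batch["attention_mask"] are now rectangular PyTorch tensors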
        """
        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method "
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if not required_input:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_available() and _is_tensorflow(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_available() and _is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

        # Convert padding_strategy in PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
        <../glossary.html#token-type-ids>`__

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.

        Returns:
            :obj:`List[int]`: The token type ids.
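
        Example (a sketch of the base behavior implemented below, given any instantiated tokenizer; model specific
        subclasses such as the BERT tokenizer override this method to also account for their special tokens)::

            tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8, 9])
            # base implementation -> [0, 0, 1, 1, 1]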
        """
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

        Args:
            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.

        Returns:
            :obj:`List[int]`: The model input with special tokens.
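
        Example (a sketch of the base behavior implemented below, given any instantiated tokenizer; the base
        implementation simply concatenates the sequences, while model specific subclasses add their own special
        tokens, e.g. [CLS]/[SEP] for BERT)::

            tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8, 9])
            # base implementation -> [5, 6, 7, 8, 9]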
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user-defined stride) for overflowing tokens.

        Args:
            ids (:obj:`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
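
        Example (a minimal sketch; assumes the ``bert-base-uncased`` checkpoint can be downloaded)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
            encoded = tokenizer.prepare_for_model(ids, add_special_tokens=True)
            # `encoded` is a BatchEncoding whose "input_ids" now include the [CLS]/[SEP] ids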
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0

        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation_strategy,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

        # Build output dictionary
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Check lengths
        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

        # Padding
        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding_strategy.value,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )

        return batch_outputs

    def truncate_sequences(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ) -> Tuple[List[int], List[int], List[int]]:
        """
        Truncates a sequence pair following the strategy and returns the truncated sequences along with the
        overflowing tokens.

        Args:
            ids (:obj:`List[int]`):
                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            pair_ids (:obj:`List[int]`, `optional`):
                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
                and ``convert_tokens_to_ids`` methods.
            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
                Number of tokens to remove using the truncation strategy.
            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`'longest_first'`):
                The strategy to follow for truncation. Can be:

                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
                  sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).
            stride (:obj:`int`, `optional`, defaults to 0):
                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                sequence returned. The value of this argument defines the number of additional tokens.

        Returns:
            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
            list of overflowing tokens.
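
        Example (a sketch with plain integer ids, given any instantiated tokenizer and the default ``stride=0``; with
        the default ``'longest_first'`` strategy implemented below, tokens are removed one by one from the end of the
        longer sequence)::

            ids, pair_ids, overflowing = tokenizer.truncate_sequences(
                [1, 2, 3, 4, 5], pair_ids=[6, 7, 8], num_tokens_to_remove=2
            )
            # ids -> [1, 2, 3], pair_ids -> [6, 7, 8], overflowing -> [5, 4]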
        """
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []

        if not isinstance(truncation_strategy, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation_strategy)

        overflowing_tokens = []
        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    if not overflowing_tokens:
                        window_len = min(len(ids), stride + 1)
                    else:
                        window_len = 1
                    overflowing_tokens.extend(ids[-window_len:])
                    ids = ids[:-1]
                else:
                    if not overflowing_tokens:
                        window_len = min(len(pair_ids), stride + 1)
                    else:
                        window_len = 1
                    overflowing_tokens.extend(pair_ids[-window_len:])
                    pair_ids = pair_ids[:-1]
        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
            if len(ids) > num_tokens_to_remove:
                window_len = min(len(ids), stride + num_tokens_to_remove)
                overflowing_tokens = ids[-window_len:]
                ids = ids[:-num_tokens_to_remove]
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the first sequence has a length {len(ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    f"for instance 'longest_first' or 'only_second'."
                )
        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
            if len(pair_ids) > num_tokens_to_remove:
                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
                overflowing_tokens = pair_ids[-window_len:]
                pair_ids = pair_ids[:-num_tokens_to_remove]
            else:
                logger.error(
                    f"We need to remove {num_tokens_to_remove} to truncate the input "
                    f"but the second sequence has a length {len(pair_ids)}. "
                    f"Please select another truncation strategy than {truncation_strategy}, "
                    f"for instance 'longest_first' or 'only_first'."
                )

        return (ids, pair_ids, overflowing_tokens)

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens into a single string. The simplest way to do it is ``" ".join(tokens)`` but we
        often want to remove sub-word tokenization artifacts at the same time.

        Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.

        Returns:
            :obj:`str`: The joined tokens.
        """
        raise NotImplementedError

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific decode method.

        Returns:
            :obj:`List[str]`: The list of decoded sentences.
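
        Example (illustrative; it assumes a concrete subclass such as :class:`~transformers.BertTokenizer`
        and placeholder ids whose mapping depends on the loaded vocabulary)::

            from transformers import BertTokenizer

            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            tokenizer.batch_decode([[101, 7592, 102], [101, 2088, 102]], skip_special_tokens=True)
            # -> ["hello", "world"] for this checkpoint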
        """
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences
        ]

    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the underlying model specific decode method.

        Returns:
            :obj:`str`: The decoded sentence.
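
        Example (illustrative; the ids below are placeholders and their mapping depends on the loaded
        vocabulary)::

            tokenizer.decode([101, 7592, 2088, 102], skip_special_tokens=True)
            # -> "hello world" for a BERT-like uncased checkpoint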
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)

        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ) -> str:
        raise NotImplementedError

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
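
        Example (a sketch; it assumes ids 101 and 102 are special tokens such as ``[CLS]``/``[SEP]``
        in the loaded vocabulary)::

            tokenizer.get_special_tokens_mask([101, 7592, 2088, 102], already_has_special_tokens=True)
            # -> [1, 0, 0, 1]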
        """
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument."
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """
        Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.

        Args:
            out_string (:obj:`str`): The text to clean up.

        Returns:
            :obj:`str`: The cleaned-up string.
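
        Example (follows directly from the replacements performed below)::

            PreTrainedTokenizerBase.clean_up_tokenization("do n't worry , it 's fine .")
            # -> "don't worry, it's fine."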
        """
        out_string = (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )
        return out_string

    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        """
        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
        corresponding model.

        Args:
            ids (:obj:`List[int]`): The ids produced by the tokenization
            max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set)
            verbose (:obj:`bool`): Whether or not to print more information and warnings.

        """
        if max_length is None and len(ids) > self.model_max_length and verbose:
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors"
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
        sequence-to-sequence models that need slightly different processing for the labels.
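
        Example (a sketch; ``tokenizer``, ``src_texts`` and ``tgt_texts`` are placeholders, and the
        pattern assumes a sequence-to-sequence tokenizer such as an mBART one)::

            model_inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(tgt_texts, return_tensors="pt", padding=True, truncation=True)
            model_inputs["labels"] = labels["input_ids"]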
        """
        yield

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: str = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepare model inputs for translation. For best performance, translate one sentence at a time.

        Arguments:
            src_texts (:obj:`List[str]`):
                List of documents to summarize or source language texts.
            tgt_texts (:obj:`list`, `optional`):
                List of summaries or target language texts.
            max_length (:obj:`int`, `optional`):
                Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet), truncation/padding to a maximum length will be deactivated.
            max_target_length (:obj:`int`, `optional`):
                Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
                set to :obj:`None`, this will use the max_length value.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
                  if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
                  the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will only
                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
                  sequence lengths greater than the model maximum admissible input size).
            **kwargs:
                Additional keyword arguments passed along to :obj:`self.__call__`.

        Return:
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

            - **input_ids** -- List of token ids to be fed to the encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

            The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is passed.
            Otherwise, input_ids and attention_mask will be the only keys.
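
        Example (a sketch; ``tokenizer`` stands for any sequence-to-sequence tokenizer, e.g. an mBART
        or T5 one, and the sentences are placeholders)::

            batch = tokenizer.prepare_seq2seq_batch(
                src_texts=["Studies have shown that owning a dog is good for you."],
                tgt_texts=["Studies show that owning a dog is good for you."],
                return_tensors="pt",
            )
            # batch contains input_ids, attention_mask and labels

        Note that this method is deprecated (see the warning below); new code should prefer calling the
        tokenizer directly and using the ``as_target_tokenizer`` context manager for the targets.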
        """
        warnings.warn(
            "`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of 🤗 Transformers. Use the "
            "regular `__call__` method to prepare your inputs and the tokenizer under the `with_target_tokenizer` "
            "context manager to prepare your targets. See the documentation of your specific tokenizer for more "
            "details",
            FutureWarning,
        )
        # mBART-specific kwargs that should be ignored by other models.
        kwargs.pop("src_lang", None)
        kwargs.pop("tgt_lang", None)
        if max_length is None:
            max_length = self.model_max_length
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs
        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        with self.as_target_tokenizer():
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs