"...git@developer.sourcefind.cn:ci-demos/wan2.1.git" did not exist on "75f4505072f76a65f459bec4bf74466f2e5bf114"
tokenization_auto.py 30.2 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Sylvain Gugger's avatar
Sylvain Gugger committed
15
""" Auto Tokenizer class."""
thomwolf's avatar
thomwolf committed
16

17
import importlib
18
19
import json
import os
20
from collections import OrderedDict
21
from pathlib import Path
22
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
thomwolf's avatar
thomwolf committed
23

Sylvain Gugger's avatar
Sylvain Gugger committed
24
from ...configuration_utils import PretrainedConfig
25
26
from ...file_utils import (
    cached_path,
27
    get_list_of_files,
28
29
30
31
32
    hf_bucket_url,
    is_offline_mode,
    is_sentencepiece_available,
    is_tokenizers_available,
)
33
from ...tokenization_utils import PreTrainedTokenizer
34
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
35
from ...tokenization_utils_fast import PreTrainedTokenizerFast
Sylvain Gugger's avatar
Sylvain Gugger committed
36
from ...utils import logging
37
38
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
39
from .configuration_auto import (
40
    CONFIG_MAPPING_NAMES,
41
    AutoConfig,
42
    config_class_to_model_type,
43
    model_type_to_module_name,
44
    replace_list_option_in_docstrings,
45
)
46
from .dynamic import get_class_from_dynamic_module
Aymeric Augustin's avatar
Aymeric Augustin committed
47

thomwolf's avatar
thomwolf committed
48

Lysandre Debut's avatar
Lysandre Debut committed
49
# Module-level logger, namespaced to this module per the library convention.
logger = logging.get_logger(__name__)
thomwolf's avatar
thomwolf committed
50

51
52
53
54
55
56
57
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    # Maps a model type string (the config's `model_type`) to a pair of tokenizer
    # class *names*: (slow tokenizer, fast tokenizer). An entry is None when the
    # optional dependency it needs (sentencepiece for slow, tokenizers for fast)
    # is not installed, or when no such implementation exists for that model.
    # Classes are resolved lazily by name in `tokenizer_class_from_name`.
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("xlm", ("XLMTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
            ("rag", ("RagTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            # ibert/qdqbert/gpt_neo/hubert reuse tokenizers from other model families.
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
            ("bartpho", ("BartphoTokenizer", None)),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
        ]
    )
238

239
240
241
# Lazy mapping from config classes to their (slow, fast) tokenizer classes;
# the underlying modules are only imported on first access.
TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

# Reverse lookup: config class name -> model type string.
CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
242

243

244
def tokenizer_class_from_name(class_name: str):
    """Resolve a tokenizer class name to the actual class.

    Looks the name up in the static `TOKENIZER_MAPPING_NAMES` table first
    (importing the owning model sub-module lazily), then among tokenizers
    registered at runtime in `TOKENIZER_MAPPING`. Returns `None` when the
    name is unknown.
    """
    # The generic fast base class is not listed in the mapping; handle it first.
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    # Static table: find which model type declares this class name, then import
    # the corresponding `transformers.models.<submodule>` and pull the class.
    for model_type, class_names in TOKENIZER_MAPPING_NAMES.items():
        if class_name not in class_names:
            continue
        submodule = model_type_to_module_name(model_type)
        models_module = importlib.import_module(f".{submodule}", "transformers.models")
        return getattr(models_module, class_name)

    # Runtime registrations (added via `AutoTokenizer.register`): match on the
    # class's own __name__ attribute.
    for registered_pair in TOKENIZER_MAPPING._extra_content.values():
        for candidate in registered_pair:
            if getattr(candidate, "__name__", None) == class_name:
                return candidate

    return None
261
262


263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force (re-)downloading the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `transformers-cli login` (stored in `~/.huggingface`).
        revision(`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer, or an empty dict when no tokenizer config is found.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    # In offline mode, never hit the network: behave as if local_files_only was set.
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    # Will raise a ValueError if `pretrained_model_name_or_path` is not a valid path or model identifier.
    # Checking the file list first lets us return {} (a soft miss) instead of
    # attempting a download that is known to fail.
    repo_files = get_list_of_files(
        pretrained_model_name_or_path,
        revision=revision,
        use_auth_token=use_auth_token,
        local_files_only=local_files_only,
    )
    if TOKENIZER_CONFIG_FILE not in [Path(f).name for f in repo_files]:
        return {}

    # Resolve to either a local file path or a hub URL for the tokenizer config.
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE)
    else:
        config_file = hf_bucket_url(
            pretrained_model_name_or_path, filename=TOKENIZER_CONFIG_FILE, revision=revision, mirror=None
        )

    try:
        # Load from URL or cache if already cached.
        resolved_config_file = cached_path(
            config_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
        )

    except EnvironmentError:
        # Missing/inaccessible tokenizer config is not fatal: callers fall back
        # to the model config to pick a tokenizer class.
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}

    with open(resolved_config_file, encoding="utf-8") as reader:
        return json.load(reader)


Julien Chaumond's avatar
Julien Chaumond committed
374
class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        # This class is a pure factory/namespace; direct instantiation is always an error.
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files and override the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision(`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Whether or not to try to load the fast version of the tokenizer.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast and tokenizer_fast_class_name is not None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)

            # Fall back to the slow class when the fast one is unavailable or not requested.
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = tokenizer_config.get("auto_map")

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
            tokenizer_class = None
            if tokenizer_auto_map is not None:
                # The tokenizer code lives in the model repo itself: loading it
                # executes arbitrary code, so an explicit opt-in is required.
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that repo "
                        "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
                        "the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    # Fix: `logger.warn` is a deprecated alias of `logger.warning`.
                    logger.warning(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure "
                        "no malicious code has been contributed in a newer revision."
                    )

                # `auto_map` entries are (slow_class_ref, fast_class_ref) pairs.
                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
                # Try the fast variant of the configured (slow) class first.
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        # Finally, fall back on the config-class -> tokenizer-class mapping.
        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    # Fix: `register` takes neither `self` nor `cls` and is called on the class,
    # so mark it as a static method explicitly.
    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        # When both classes are given, make sure they agree with each other.
        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}). Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))