# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class. """

import importlib
import json
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...file_utils import (
    cached_path,
    hf_bucket_url,
    is_offline_mode,
    is_sentencepiece_available,
    is_tokenizers_available,
)
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)
from .dynamic import get_class_from_dynamic_module


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("xlm", ("XLMTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
            ("rag", ("RagTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
            ("bartpho", ("BartphoTokenizer", None)),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
        ]
    )

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
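
# Illustrative (hedged) lookups against the mappings defined above; `BertConfig` is
# one example config class from CONFIG_MAPPING_NAMES:
#
#     TOKENIZER_MAPPING[BertConfig]  # -> (BertTokenizer, BertTokenizerFast), both resolved lazily
#     CONFIG_TO_TYPE["BertConfig"]   # -> "bert"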


def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            return getattr(module, class_name)

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    return None
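
# Illustrative (hedged) examples of the lookup above; the second name is deliberately
# made up to show the None fallback:
#
#     tokenizer_class_from_name("BertTokenizerFast")  # -> transformers.models.bert.BertTokenizerFast
#     tokenizer_class_from_name("NotARealTokenizer")  # -> None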


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
              namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force a (re-)download of the configuration files and override the cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete an incompletely received file. Attempts to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        use_auth_token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
            generated when running `transformers-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE)
    else:
        config_file = hf_bucket_url(
            pretrained_model_name_or_path, filename=TOKENIZER_CONFIG_FILE, revision=revision, mirror=None
        )

    try:
        # Load from URL or cache if already cached
        resolved_config_file = cached_path(
            config_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
        )

    except (EnvironmentError, ValueError):
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}

    with open(resolved_config_file, encoding="utf-8") as reader:
        return json.load(reader)


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object
        (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
        missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
                      a user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g.,
                      `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force a (re-)download of the model weights and configuration files and override the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Whether or not to try to load the fast version of the tokenizer.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it
                will execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`,
                `mask_token`, `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
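
        >>> # Illustrative: pin the tokenizer type explicitly via `tokenizer_type` ("bert" is a valid mapping key)
        >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/', tokenizer_type='bert')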
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast and tokenizer_fast_class_name is not None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)

            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = tokenizer_config.get("auto_map")

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
            tokenizer_class = None
            if tokenizer_auto_map is not None:
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that repo "
                        "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
                        "the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    logger.warning(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure "
                        "no malicious code has been contributed in a newer revision."
                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
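
        Example (a minimal sketch; `CustomConfig` and `CustomTokenizer` are hypothetical user-defined classes):

        ```python
        >>> AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        >>> # Checkpoints whose config resolves to CustomConfig will now load CustomTokenizer.
        ```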
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))