tokenization_auto.py 30 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Julien Chaumond's avatar
Julien Chaumond committed
15
""" Auto Tokenizer class. """
thomwolf's avatar
thomwolf committed
16

17
import importlib
18
19
import json
import os
20
from collections import OrderedDict
21
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
thomwolf's avatar
thomwolf committed
22

Sylvain Gugger's avatar
Sylvain Gugger committed
23
from ...configuration_utils import PretrainedConfig
24
25
26
27
28
29
30
from ...file_utils import (
    cached_path,
    hf_bucket_url,
    is_offline_mode,
    is_sentencepiece_available,
    is_tokenizers_available,
)
31
from ...tokenization_utils import PreTrainedTokenizer
32
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
33
from ...tokenization_utils_fast import PreTrainedTokenizerFast
Sylvain Gugger's avatar
Sylvain Gugger committed
34
from ...utils import logging
35
36
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
37
from .configuration_auto import (
38
    CONFIG_MAPPING_NAMES,
39
    AutoConfig,
40
    config_class_to_model_type,
41
    model_type_to_module_name,
42
    replace_list_option_in_docstrings,
43
)
44
from .dynamic import get_class_from_dynamic_module
Aymeric Augustin's avatar
Aymeric Augustin committed
45

thomwolf's avatar
thomwolf committed
46

Lysandre Debut's avatar
Lysandre Debut committed
47
logger = logging.get_logger(__name__)
thomwolf's avatar
thomwolf committed
48

49
50
51
52
53
54
55
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
Gunjan Chhablani's avatar
Gunjan Chhablani committed
56
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
57
58
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
59
            (
60
61
62
63
64
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
65
66
            ),
            (
67
68
69
70
71
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
72
            ),
73
74
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
75
            (
76
77
78
79
80
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
81
82
            ),
            (
83
84
85
86
87
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
88
89
            ),
            (
90
91
92
93
94
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
95
96
            ),
            (
97
98
99
100
101
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
102
103
            ),
            (
104
105
106
107
108
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
109
            ),
110
111
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
112
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
113
114
115
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
116
            (
117
118
119
120
121
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
122
            ),
123
124
125
126
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
127
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
128
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
129
            (
130
131
132
133
134
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
135
136
            ),
            (
137
138
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
139
            ),
140
141
142
143
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
144
            (
145
146
147
148
149
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
150
            ),
151
152
153
154
155
156
157
158
159
160
            ("flaubert", ("FlaubertTokenizer", None)),
            ("xlm", ("XLMTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
            ("rag", ("RagTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
161
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
162
163
164
165
166
167
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
168
            (
169
170
171
172
173
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
174
            ),
175
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
176
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
177
178
179
180
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
Ryokan RI's avatar
Ryokan RI committed
181
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
182
183
184
185
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
Ori Ram's avatar
Ori Ram committed
186
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
187
            ("byt5", ("ByT5Tokenizer", None)),
188
            (
189
190
191
192
193
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
194
            ),
195
196
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
197
            ("bartpho", ("BartphoTokenizer", None)),
198
199
200
201
202
203
204
205
206
207
208
209
210
211
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
212
213
214
215
216
217
218
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
219
220
221
222
223
224
225
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
226
227
        ]
    )
228

229
230
231
TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
232

233

234
def tokenizer_class_from_name(class_name: str):
235
236
237
238
239
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
240
            module_name = model_type_to_module_name(module_name)
241

242
243
            module = importlib.import_module(f".{module_name}", "transformers.models")
            return getattr(module, class_name)
244

245
246
247
248
249
    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

250
    return None
251
252


253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            This can be either:

            - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing a configuration file saved using the
              :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``.

        cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (:obj:`str` or `bool`, `optional`):
            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, will only try to load the tokenizer configuration from local files.

    .. note::

        Passing :obj:`use_auth_token=True` is required when you want to use a private model.


    Returns:
        :obj:`Dict`: The configuration of the tokenizer.

    Examples::

        # Download configuration from huggingface.co and cache.
        tokenizer_config = get_tokenizer_config("bert-base-uncased")
        # This model does not have a tokenizer config so the result will be an empty dict.
        tokenizer_config = get_tokenizer_config("xlm-roberta-base")

        # Save a pretrained tokenizer locally and you can reload its config
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        tokenizer.save_pretrained("tokenizer-test")
        tokenizer_config = get_tokenizer_config("tokenizer-test")
    """
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE)
    else:
        config_file = hf_bucket_url(
            pretrained_model_name_or_path, filename=TOKENIZER_CONFIG_FILE, revision=revision, mirror=None
        )

    try:
        # Load from URL or cache if already cached
        resolved_config_file = cached_path(
            config_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
        )

    except EnvironmentError:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}

    with open(resolved_config_file, encoding="utf-8") as reader:
        return json.load(reader)


Julien Chaumond's avatar
Julien Chaumond committed
352
class AutoTokenizer:
353
    r"""
Sylvain Gugger's avatar
Sylvain Gugger committed
354
355
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the :meth:`AutoTokenizer.from_pretrained` class method.
thomwolf's avatar
thomwolf committed
356

357
    This class cannot be instantiated directly using ``__init__()`` (throws an error).
thomwolf's avatar
thomwolf committed
358
    """
359

thomwolf's avatar
thomwolf committed
360
    def __init__(self):
361
362
363
364
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )
thomwolf's avatar
thomwolf committed
365
366

    @classmethod
367
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
thomwolf's avatar
thomwolf committed
368
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
369
370
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
thomwolf's avatar
thomwolf committed
371

372
373
374
        The tokenizer class to instantiate is selected based on the :obj:`model_type` property of the config object
        (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's
        missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
375

376
        List options
thomwolf's avatar
thomwolf committed
377
378

        Params:
379
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
380
381
                Can be either:

382
383
384
                    - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
385
386
387
388
                    - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
                      using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
                      ``./my_model_directory/``.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
Sylvain Gugger's avatar
Sylvain Gugger committed
389
390
                      single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
                      applicable to all derived classes)
391
392
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__()`` method.
393
            config (:class:`~transformers.PretrainedConfig`, `optional`)
394
                The configuration object used to dertermine the tokenizer class to instantiate.
395
            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
396
397
398
399
400
401
402
403
404
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
Sylvain Gugger's avatar
Sylvain Gugger committed
405
406
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
Julien Chaumond's avatar
Julien Chaumond committed
407
408
409
410
            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
                identifier allowed by git.
411
412
413
            subfolder (:obj:`str`, `optional`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
414
            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
415
                Whether or not to try to load the fast version of the tokenizer.
416
417
            tokenizer_type (:obj:`str`, `optional`):
                Tokenizer type to be loaded.
418
419
420
421
            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
                will execute code present on the Hub on your local machine.
422
423
424
425
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__()`` for more details.
thomwolf's avatar
thomwolf committed
426
427
428

        Examples::

429
            >>> from transformers import AutoTokenizer
430

431
            >>> # Download vocabulary from huggingface.co and cache.
432
            >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
433

434
            >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
435
            >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
436

437
438
            >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
thomwolf's avatar
thomwolf committed
439
440

        """
441
        config = kwargs.pop("config", None)
442
        kwargs["_from_auto"] = True
443

444
        use_fast = kwargs.pop("use_fast", True)
445
        tokenizer_type = kwargs.pop("tokenizer_type", None)
446
        trust_remote_code = kwargs.pop("trust_remote_code", False)
447

448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast and tokenizer_fast_class_name is not None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)

            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
473
474
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
475
        tokenizer_auto_map = tokenizer_config.get("auto_map")
476
477
478
479

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
480
481
482
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
483
            config_tokenizer_class = config.tokenizer_class
484
485
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]
486
487
488

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
489
            tokenizer_class = None
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
            if tokenizer_auto_map is not None:
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that repo "
                        "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
                        "the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    logger.warn(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure "
                        "no malicious code has been contributed in a newer revision."
                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
514
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
515
516
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
517
                tokenizer_class_candidate = config_tokenizer_class
518
519
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

520
            if tokenizer_class is None:
521
                raise ValueError(
522
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
523
                )
524
525
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

526
        # Otherwise we have to be creative.
527
528
529
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
530
                logger.warning(
531
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
532
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
533
534
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
535
536
537
                )
            config = config.encoder

538
539
        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
540
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
541
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
542
543
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
544
545
546
547
548
549
550
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )
551

552
        raise ValueError(
553
554
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
555
        )
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598

    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.


        Args:
            config_class (:class:`~transformers.PretrainedConfig`):
                The configuration corresponding to the model to register.
            slow_tokenizer_class (:class:`~transformers.PretrainedTokenizer`, `optional`):
                The slow tokenizer to register.
            slow_tokenizer_class (:class:`~transformers.PretrainedTokenizerFast`, `optional`):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))