# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class."""

import importlib
import json
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("canine", ("CanineTokenizer", None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clipseg",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "deberta-v2",
                (
                    "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
                    "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "longt5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "nllb",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nystromformer",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus_x",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
            ("phobert", ("PhobertTokenizer", None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "roberta-prelayernorm",
                ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            (
                "switch_transformers",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer" if is_sentencepiece_available() else None, None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "xglm",
                (
                    "XGLMTokenizer" if is_sentencepiece_available() else None,
                    "XGLMTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta-xl",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xmod",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "yoso",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
        ]
    )

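# Maps config classes to (slow tokenizer class, fast tokenizer class) pairs; entries are resolved lazily so that
# tokenizer modules are only imported when they are actually requested.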
TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

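# Inverse of CONFIG_MAPPING_NAMES: maps a config class name back to its model type string.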
CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}


def tokenizer_class_from_name(class_name: str):
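    """Resolve a tokenizer class from its class name, returning `None` if no matching class can be found."""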
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration of a pretrained model.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the configuration files and override the cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
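    # `cached_file` is told not to raise on a missing file or a connection error; it returns `None` instead, which
    # lets us fall back to the model config below when no tokenizer config exists.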
    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        use_auth_token=use_auth_token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files and override
                the cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        "Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
            tokenizer_class = None
            if tokenizer_auto_map is not None:
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that"
                        " repo on your local machine. Make sure you have read the code there to avoid malicious use,"
                        " then set the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    logger.warning(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure"
                        " no malicious code has been contributed in a newer revision."
                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

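                # `class_ref` has the form "module_file.ClassName", pointing at custom tokenizer code in the repo.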
                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )
                tokenizer_class.register_for_auto_class()

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
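
        Example (a minimal sketch; `CustomConfig`, `CustomTokenizer` and `CustomTokenizerFast` stand for hypothetical
        user-defined subclasses of [`PretrainedConfig`], [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]):

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Once registered, `AutoTokenizer.from_pretrained` resolves any checkpoint whose config is a
        >>> # `CustomConfig` to the registered tokenizer classes.
        >>> AutoTokenizer.register(
        ...     CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
        ... )
        ```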
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`.")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}). Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))