"...git@developer.sourcefind.cn:youbo/yolo11_pytorch.git" did not exist on "a74dc9a0390d8903281065c5a1a578c44ca0cb68"
tokenization_auto.py 38.5 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Auto Tokenizer class."""

import importlib
import json
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


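# `tokenizers` is an optional dependency; fall back to a None sentinel so the
# code below can treat "no fast tokenizer available" uniformly.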
if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)

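# Each entry below maps a model type to a (slow tokenizer class name, fast tokenizer
# class name) pair; a slot is None when that variant does not exist or its optional
# dependency (sentencepiece / tokenizers) is not installed.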
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("canine", ("CanineTokenizer", None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "clap",
                (
                    "RobertaTokenizer",
                    "RobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clipseg",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "deberta-v2",
                (
                    "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
                    "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("llama", ("LlamaTokenizer" if is_sentencepiece_available() else None, None)),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "longt5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "nllb",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nllb-moe",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nystromformer",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus_x",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "roberta-prelayernorm",
                ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            (
                "switch_transformers",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "xglm",
                (
                    "XGLMTokenizer" if is_sentencepiece_available() else None,
                    "XGLMTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta-xl",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xmod",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "yoso",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
        ]
    )

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
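# e.g. CONFIG_MAPPING_NAMES contains ("bert", "BertConfig"), so CONFIG_TO_TYPE
# maps "BertConfig" back to the model type "bert".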


def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None
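
# Illustrative lookups (assuming the optional `tokenizers` dependency is installed):
#   tokenizer_class_from_name("BertTokenizerFast")  # -> transformers.BertTokenizerFast
#   tokenizer_class_from_name("NotARealTokenizer")  # -> None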


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force a (re-)download of the configuration files, overriding any cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        use_auth_token=use_auth_token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
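    # Record the resolved commit so downstream calls in the same load reuse the
    # exact revision via kwargs["_commit_hash"].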
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force a (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
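        # e.g. `AutoTokenizer.from_pretrained("bert-base-uncased", tokenizer_type="bert", use_fast=False)`
        # resolves directly to BertTokenizer without consulting any config file.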
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        " Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
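        # An illustrative `auto_map` entry (module and class names are hypothetical):
        # {"AutoTokenizer": ["tokenization_custom.CustomTokenizer", "tokenization_custom.CustomTokenizerFast"]}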

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
            tokenizer_class = None
            if tokenizer_auto_map is not None:
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that"
                        " repo on your local machine. Make sure you have read the code there to avoid malicious use,"
                        " then set the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    logger.warning(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure"
                        " no malicious code has been contributed in a newer revision."
                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

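                # e.g. an (illustrative) ref "tokenization_custom.CustomTokenizer"
                # splits into the module file and the class name.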
                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )
                tokenizer_class.register_for_auto_class()

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
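                # e.g. a config naming "BertTokenizer" first tries "BertTokenizerFast" here.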
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))
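
# A minimal registration sketch (NewConfig, NewTokenizer and NewTokenizerFast are
# hypothetical classes defined elsewhere):
#
#     from transformers import AutoConfig, AutoTokenizer
#
#     AutoConfig.register("new-model", NewConfig)
#     AutoTokenizer.register(
#         NewConfig, slow_tokenizer_class=NewTokenizer, fast_tokenizer_class=NewTokenizerFast
#     )
#     tokenizer = AutoTokenizer.from_pretrained("path/to/new-model-checkpoint")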