# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class."""

import importlib
import json
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("canine", ("CanineTokenizer", None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "clap",
                (
                    "RobertaTokenizer",
                    "RobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clipseg",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "deberta-v2",
                (
                    "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
                    "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            (
                "llama",
                (
                    "LlamaTokenizer" if is_sentencepiece_available() else None,
                    "LlamaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "longt5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "nllb",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nllb-moe",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nystromformer",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus_x",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "roberta-prelayernorm",
                ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            (
                "switch_transformers",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "xglm",
                (
                    "XGLMTokenizer" if is_sentencepiece_available() else None,
                    "XGLMTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta-xl",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xmod",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "yoso",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
        ]
    )

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
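# E.g., once the lazy mapping resolves an entry, `TOKENIZER_MAPPING[BertConfig]` yields the pair
# `(BertTokenizer, BertTokenizerFast)` of slow and fast tokenizer classes.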

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}


def tokenizer_class_from_name(class_name: str):
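    """
    Resolve a tokenizer class name (e.g. `"BertTokenizerFast"`) to the class itself: first via
    `TOKENIZER_MAPPING_NAMES`, then via tokenizers registered at runtime in `TOKENIZER_MAPPING`, and finally via the
    main `transformers` module, which exposes dummy objects that raise an informative error when a required
    dependency is missing. Returns `None` if the name cannot be resolved.
    """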
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the configuration files, overriding the cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        use_auth_token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally, then reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        use_auth_token=use_auth_token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding
                the cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded. Must be one of the model types of the tokenizer mapping (e.g. `"bert"`).
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)
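
        # Resolution order, as implemented below: an explicitly passed `tokenizer_type` wins, then the tokenizer
        # class named in tokenizer_config.json (possibly custom code on the Hub when `auto_map` is set), then the
        # model config's `tokenizer_class` attribute, and finally TOKENIZER_MAPPING keyed by config class.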

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        " Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        # If we have the tokenizer class from the tokenizer config or the model config we're good!
        if config_tokenizer_class is not None:
            tokenizer_class = None
            if tokenizer_auto_map is not None:
                if not trust_remote_code:
                    raise ValueError(
                        f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that"
                        " repo on your local machine. Make sure you have read the code there to avoid malicious use,"
                        " then set the option `trust_remote_code=True` to remove this error."
                    )
                if kwargs.get("revision", None) is None:
                    logger.warning(
                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure"
                        " no malicious code has been contributed in a newer revision."
                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]

                module_file, class_name = class_ref.split(".")
                tokenizer_class = get_class_from_dynamic_module(
                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
                )
                tokenizer_class.register_for_auto_class()

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
        """
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
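
        Example (a minimal sketch; `CustomConfig` and `CustomTokenizer` are hypothetical user-defined subclasses of
        [`PretrainedConfig`] and [`PreTrainedTokenizer`]):

        ```python
        >>> from transformers import AutoConfig, AutoTokenizer

        >>> AutoConfig.register("custom-model", CustomConfig)
        >>> AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        >>> tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint-with-custom-model")
        ```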
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class))