# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class."""

import importlib
import json
import os
import warnings
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)


if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            (
                "albert",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            (
                "barthez",
                (
                    "BarthezTokenizer" if is_sentencepiece_available() else None,
                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            (
                "big_bird",
                (
                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            (
                "camembert",
                (
                    "CamembertTokenizer" if is_sentencepiece_available() else None,
                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("canine", ("CanineTokenizer", None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "clap",
                (
                    "RobertaTokenizer",
                    "RobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clip",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "clipseg",
                (
                    "CLIPTokenizer",
                    "CLIPTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "code_llama",
                (
                    "CodeLlamaTokenizer" if is_sentencepiece_available() else None,
                    "CodeLlamaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "cpm",
                (
                    "CpmTokenizer" if is_sentencepiece_available() else None,
                    "CpmTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("cpmant", ("CpmAntTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "deberta-v2",
                (
                    "DebertaV2Tokenizer" if is_sentencepiece_available() else None,
                    "DebertaV2TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "dpr",
                (
                    "DPRQuestionEncoderTokenizer",
                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            (
                "llama",
                (
                    "LlamaTokenizer" if is_sentencepiece_available() else None,
                    "LlamaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            (
                "longt5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "mbart",
                (
                    "MBartTokenizer" if is_sentencepiece_available() else None,
                    "MBartTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "mbart50",
                (
                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "mt5",
                (
                    "MT5Tokenizer" if is_sentencepiece_available() else None,
                    "MT5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            (
                "nllb",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nllb-moe",
                (
                    "NllbTokenizer" if is_sentencepiece_available() else None,
                    "NllbTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "nystromformer",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "pegasus",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "pegasus_x",
                (
                    "PegasusTokenizer" if is_sentencepiece_available() else None,
                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "perceiver",
                (
                    "PerceiverTokenizer",
                    None,
                ),
            ),
            (
                "persimmon",
                (
                    "LlamaTokenizer" if is_sentencepiece_available() else None,
                    "LlamaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            (
                "reformer",
                (
                    "ReformerTokenizer" if is_sentencepiece_available() else None,
                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "rembert",
                (
                    "RemBertTokenizer" if is_sentencepiece_available() else None,
                    "RemBertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "roberta-prelayernorm",
                ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None),
            ),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            (
                "squeezebert",
                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
            ),
            (
                "switch_transformers",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "t5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            (
                "umt5",
                (
                    "T5Tokenizer" if is_sentencepiece_available() else None,
                    "T5TokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vits", ("VitsTokenizer", None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            (
                "xglm",
                (
                    "XGLMTokenizer" if is_sentencepiece_available() else None,
                    "XGLMTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            (
                "xlm-roberta",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlm-roberta-xl",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xlnet",
                (
                    "XLNetTokenizer" if is_sentencepiece_available() else None,
                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "xmod",
                (
                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
            (
                "yoso",
                (
                    "AlbertTokenizer" if is_sentencepiece_available() else None,
                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
                ),
            ),
        ]
    )


TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
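
# Note (illustrative addition, not in the original file): indexing TOKENIZER_MAPPING with a config
# class lazily imports and returns a `(slow_tokenizer, fast_tokenizer)` pair, either of which may be
# None when the optional `sentencepiece`/`tokenizers` dependencies are missing, e.g.:
#
#     from transformers import BertConfig
#
#     slow_cls, fast_cls = TOKENIZER_MAPPING[BertConfig]  # (BertTokenizer, BertTokenizerFast)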


def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None
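
# Illustrative sketch (not part of the original module): `tokenizer_class_from_name` resolves a class
# name to the class object, assuming the optional dependency backing it is installed:
#
#     cls = tokenizer_class_from_name("BertTokenizerFast")  # transformers.BertTokenizerFast
#     tokenizer_class_from_name("NotATokenizer")  # None for unknown names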


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the configuration files, overriding the cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a file
            exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result
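
# Illustrative note (an addition, not part of the original file): the returned dict mirrors the
# checkpoint's tokenizer_config.json, so after `tokenizer.save_pretrained("tokenizer-test")` above,
# the reloaded config exposes the saved class name:
#
#     get_tokenizer_config("tokenizer-test").get("tokenizer_class")  # "BertTokenizer"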


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
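
        >>> # Force the slow (Python) tokenizer with `use_fast=False`
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)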
        ```"""
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        "Falling back to the slow version."
                    )

            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        has_remote_code = tokenizer_auto_map is not None
        has_local_code = config_tokenizer_class is not None or type(config) in TOKENIZER_MAPPING
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )
        if has_remote_code and trust_remote_code:
            if use_fast and tokenizer_auto_map[1] is not None:
                class_ref = tokenizer_auto_map[1]
            else:
                class_ref = tokenizer_auto_map[0]
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                tokenizer_class.register_for_auto_class()
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif config_tokenizer_class is not None:
            tokenizer_class = None
            if use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )
        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
                "so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)
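

# Illustrative usage sketch (an addition, not part of the original module): wiring a custom
# configuration/tokenizer pair into the auto classes. `CustomConfig`, `CustomTokenizer` and
# `CustomTokenizerFast` are hypothetical placeholders.
#
#     from transformers import AutoConfig, AutoTokenizer
#
#     AutoConfig.register("custom-model", CustomConfig)
#     AutoTokenizer.register(
#         CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
#     )
#
#     # AutoTokenizer.from_pretrained now resolves CustomConfig checkpoints to CustomTokenizerFast
#     # (or CustomTokenizer when use_fast=False).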