# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class. """


from collections import OrderedDict

from ... import GPTNeoConfig
from ...configuration_utils import PretrainedConfig
from ...file_utils import is_sentencepiece_available, is_tokenizers_available
from ...utils import logging
from ..bart.tokenization_bart import BartTokenizer
from ..bert.tokenization_bert import BertTokenizer
from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer
from ..bertweet.tokenization_bertweet import BertweetTokenizer
from ..blenderbot.tokenization_blenderbot import BlenderbotTokenizer
from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer
from ..convbert.tokenization_convbert import ConvBertTokenizer
from ..ctrl.tokenization_ctrl import CTRLTokenizer
from ..deberta.tokenization_deberta import DebertaTokenizer
from ..distilbert.tokenization_distilbert import DistilBertTokenizer
from ..dpr.tokenization_dpr import DPRQuestionEncoderTokenizer
from ..electra.tokenization_electra import ElectraTokenizer
from ..flaubert.tokenization_flaubert import FlaubertTokenizer
from ..fsmt.tokenization_fsmt import FSMTTokenizer
from ..funnel.tokenization_funnel import FunnelTokenizer
from ..gpt2.tokenization_gpt2 import GPT2Tokenizer
from ..herbert.tokenization_herbert import HerbertTokenizer
from ..layoutlm.tokenization_layoutlm import LayoutLMTokenizer
from ..led.tokenization_led import LEDTokenizer
from ..longformer.tokenization_longformer import LongformerTokenizer
from ..lxmert.tokenization_lxmert import LxmertTokenizer
from ..mobilebert.tokenization_mobilebert import MobileBertTokenizer
from ..mpnet.tokenization_mpnet import MPNetTokenizer
from ..openai.tokenization_openai import OpenAIGPTTokenizer
from ..phobert.tokenization_phobert import PhobertTokenizer
from ..prophetnet.tokenization_prophetnet import ProphetNetTokenizer
from ..rag.tokenization_rag import RagTokenizer
from ..retribert.tokenization_retribert import RetriBertTokenizer
from ..roberta.tokenization_roberta import RobertaTokenizer
from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer
from ..tapas.tokenization_tapas import TapasTokenizer
from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
from ..xlm.tokenization_xlm import XLMTokenizer
from .configuration_auto import (
    AlbertConfig,
    AutoConfig,
    BartConfig,
    BertConfig,
    BertGenerationConfig,
    BigBirdConfig,
    BlenderbotConfig,
    BlenderbotSmallConfig,
    CamembertConfig,
    ConvBertConfig,
    CTRLConfig,
    DebertaConfig,
    DebertaV2Config,
    DistilBertConfig,
    DPRConfig,
    ElectraConfig,
    EncoderDecoderConfig,
    FlaubertConfig,
    FSMTConfig,
    FunnelConfig,
    GPT2Config,
    IBertConfig,
    LayoutLMConfig,
    LEDConfig,
    LongformerConfig,
    LxmertConfig,
    M2M100Config,
    MarianConfig,
    MBartConfig,
    MobileBertConfig,
    MPNetConfig,
    MT5Config,
    OpenAIGPTConfig,
    PegasusConfig,
    ProphetNetConfig,
    RagConfig,
    ReformerConfig,
    RetriBertConfig,
    RobertaConfig,
    Speech2TextConfig,
    SqueezeBertConfig,
    T5Config,
    TapasConfig,
    TransfoXLConfig,
    Wav2Vec2Config,
    XLMConfig,
    XLMProphetNetConfig,
    XLMRobertaConfig,
    XLNetConfig,
    replace_list_option_in_docstrings,
)


if is_sentencepiece_available():
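    # These tokenizers depend on the optional `sentencepiece` package, so they are only imported
    # when it is installed; otherwise they are defined as None below.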
    from ..albert.tokenization_albert import AlbertTokenizer
    from ..barthez.tokenization_barthez import BarthezTokenizer
    from ..bert_generation.tokenization_bert_generation import BertGenerationTokenizer
    from ..big_bird.tokenization_big_bird import BigBirdTokenizer
    from ..camembert.tokenization_camembert import CamembertTokenizer
    from ..deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer
    from ..m2m_100 import M2M100Tokenizer
    from ..marian.tokenization_marian import MarianTokenizer
    from ..mbart.tokenization_mbart import MBartTokenizer
    from ..mbart.tokenization_mbart50 import MBart50Tokenizer
    from ..mt5 import MT5Tokenizer
    from ..pegasus.tokenization_pegasus import PegasusTokenizer
    from ..reformer.tokenization_reformer import ReformerTokenizer
    from ..speech_to_text import Speech2TextTokenizer
    from ..t5.tokenization_t5 import T5Tokenizer
    from ..xlm_prophetnet.tokenization_xlm_prophetnet import XLMProphetNetTokenizer
    from ..xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
    from ..xlnet.tokenization_xlnet import XLNetTokenizer
else:
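    # `sentencepiece` is not installed: define the corresponding classes as None so the mappings
    # below can still be built.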
    AlbertTokenizer = None
    BarthezTokenizer = None
    BertGenerationTokenizer = None
    BigBirdTokenizer = None
    CamembertTokenizer = None
    DebertaV2Tokenizer = None
    MarianTokenizer = None
    MBartTokenizer = None
    MBart50Tokenizer = None
    MT5Tokenizer = None
    PegasusTokenizer = None
    ReformerTokenizer = None
    T5Tokenizer = None
    XLMRobertaTokenizer = None
    XLNetTokenizer = None
    XLMProphetNetTokenizer = None
    M2M100Tokenizer = None
    Speech2TextTokenizer = None

if is_tokenizers_available():
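    # Fast tokenizers are backed by the optional Rust `tokenizers` package, so they are only
    # imported when it is installed; otherwise they are defined as None below.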
    from ..albert.tokenization_albert_fast import AlbertTokenizerFast
    from ..bart.tokenization_bart_fast import BartTokenizerFast
    from ..barthez.tokenization_barthez_fast import BarthezTokenizerFast
    from ..bert.tokenization_bert_fast import BertTokenizerFast
    from ..camembert.tokenization_camembert_fast import CamembertTokenizerFast
    from ..convbert.tokenization_convbert_fast import ConvBertTokenizerFast
    from ..distilbert.tokenization_distilbert_fast import DistilBertTokenizerFast
    from ..dpr.tokenization_dpr_fast import DPRQuestionEncoderTokenizerFast
    from ..electra.tokenization_electra_fast import ElectraTokenizerFast
    from ..funnel.tokenization_funnel_fast import FunnelTokenizerFast
    from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
    from ..herbert.tokenization_herbert_fast import HerbertTokenizerFast
    from ..layoutlm.tokenization_layoutlm_fast import LayoutLMTokenizerFast
    from ..led.tokenization_led_fast import LEDTokenizerFast
    from ..longformer.tokenization_longformer_fast import LongformerTokenizerFast
    from ..lxmert.tokenization_lxmert_fast import LxmertTokenizerFast
    from ..mbart.tokenization_mbart50_fast import MBart50TokenizerFast
    from ..mbart.tokenization_mbart_fast import MBartTokenizerFast
    from ..mobilebert.tokenization_mobilebert_fast import MobileBertTokenizerFast
    from ..mpnet.tokenization_mpnet_fast import MPNetTokenizerFast
    from ..mt5 import MT5TokenizerFast
    from ..openai.tokenization_openai_fast import OpenAIGPTTokenizerFast
    from ..pegasus.tokenization_pegasus_fast import PegasusTokenizerFast
    from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast
    from ..retribert.tokenization_retribert_fast import RetriBertTokenizerFast
    from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
    from ..squeezebert.tokenization_squeezebert_fast import SqueezeBertTokenizerFast
    from ..t5.tokenization_t5_fast import T5TokenizerFast
    from ..xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast
    from ..xlnet.tokenization_xlnet_fast import XLNetTokenizerFast
else:
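    # `tokenizers` is not installed: the fast tokenizer classes default to None.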
    AlbertTokenizerFast = None
    BartTokenizerFast = None
    BarthezTokenizerFast = None
    BertTokenizerFast = None
    CamembertTokenizerFast = None
    ConvBertTokenizerFast = None
    DistilBertTokenizerFast = None
    DPRQuestionEncoderTokenizerFast = None
    ElectraTokenizerFast = None
    FunnelTokenizerFast = None
    GPT2TokenizerFast = None
    HerbertTokenizerFast = None
    LayoutLMTokenizerFast = None
    LEDTokenizerFast = None
    LongformerTokenizerFast = None
    LxmertTokenizerFast = None
    MBartTokenizerFast = None
    MBart50TokenizerFast = None
    MobileBertTokenizerFast = None
    MPNetTokenizerFast = None
    MT5TokenizerFast = None
    OpenAIGPTTokenizerFast = None
    PegasusTokenizerFast = None
    ReformerTokenizerFast = None
    RetriBertTokenizerFast = None
    RobertaTokenizerFast = None
    SqueezeBertTokenizerFast = None
    T5TokenizerFast = None
    XLMRobertaTokenizerFast = None
    XLNetTokenizerFast = None


logger = logging.get_logger(__name__)


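# Maps each config class to a (slow tokenizer, fast tokenizer) pair. An entry is None when that
# variant does not exist or when its optional dependency (sentencepiece / tokenizers) is missing.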
TOKENIZER_MAPPING = OrderedDict(
    [
        (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
        (T5Config, (T5Tokenizer, T5TokenizerFast)),
        (MT5Config, (MT5Tokenizer, MT5TokenizerFast)),
        (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
        (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)),
        (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
        (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)),
        (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)),
        (MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
        (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
        (MarianConfig, (MarianTokenizer, None)),
        (BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)),
        (BlenderbotConfig, (BlenderbotTokenizer, None)),
        (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
        (BartConfig, (BartTokenizer, BartTokenizerFast)),
        (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
        (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)),
        (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
        (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
        (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
        (LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)),
        (DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)),
        (SqueezeBertConfig, (SqueezeBertTokenizer, SqueezeBertTokenizerFast)),
        (BertConfig, (BertTokenizer, BertTokenizerFast)),
        (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
        (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)),
        (TransfoXLConfig, (TransfoXLTokenizer, None)),
        (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)),
        (FlaubertConfig, (FlaubertTokenizer, None)),
        (XLMConfig, (XLMTokenizer, None)),
        (CTRLConfig, (CTRLTokenizer, None)),
        (FSMTConfig, (FSMTTokenizer, None)),
        (BertGenerationConfig, (BertGenerationTokenizer, None)),
        (DebertaConfig, (DebertaTokenizer, None)),
        (DebertaV2Config, (DebertaV2Tokenizer, None)),
        (RagConfig, (RagTokenizer, None)),
        (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)),
        (Speech2TextConfig, (Speech2TextTokenizer, None)),
        (M2M100Config, (M2M100Tokenizer, None)),
        (ProphetNetConfig, (ProphetNetTokenizer, None)),
        (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)),
        (TapasConfig, (TapasTokenizer, None)),
        (LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
        (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)),
        (BigBirdConfig, (BigBirdTokenizer, None)),
        (IBertConfig, (RobertaTokenizer, RobertaTokenizerFast)),
        (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)),
        (GPTNeoConfig, (GPT2Tokenizer, GPT2TokenizerFast)),
    ]
)

# For tokenizers which are not directly mapped from a config
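# These can still be resolved by class name through `tokenizer_class_from_name` when a config
# sets its `tokenizer_class` attribute explicitly.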
NO_CONFIG_TOKENIZER = [
    BertJapaneseTokenizer,
    BertweetTokenizer,
    HerbertTokenizer,
    HerbertTokenizerFast,
    PhobertTokenizer,
    BarthezTokenizer,
    BarthezTokenizerFast,
    MBart50Tokenizer,
    MBart50TokenizerFast,
]


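# For each config, keep the slow tokenizer when it exists, otherwise fall back to the fast one
# (used below to build the list of supported models in the docstring).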
SLOW_TOKENIZER_MAPPING = {
    k: (v[0] if v[0] is not None else v[1])
    for k, v in TOKENIZER_MAPPING.items()
    if (v[0] is not None or v[1] is not None)
}


def tokenizer_class_from_name(class_name: str):
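    """Return the tokenizer class whose name matches :obj:`class_name`, or None if it is unknown."""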
    all_tokenizer_classes = (
        [v[0] for v in TOKENIZER_MAPPING.values() if v[0] is not None]
        + [v[1] for v in TOKENIZER_MAPPING.values() if v[1] is not None]
        + [v for v in NO_CONFIG_TOKENIZER if v is not None]
    )
    for c in all_tokenizer_classes:
        if c.__name__ == class_name:
            return c


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the :meth:`AutoTokenizer.from_pretrained` class method.

    This class cannot be instantiated directly using ``__init__()`` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(SLOW_TOKENIZER_MAPPING)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the :obj:`model_type` property of the config object
        (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's
        missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                Can be either:

                    - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                    - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
                      using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
                      ``./my_model_directory/``.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, `optional`):
                Will be passed along to the Tokenizer ``__init__()`` method.
            config (:class:`~transformers.PretrainedConfig`, `optional`):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to resume downloading from a partially received file if one exists, instead of deleting
                it and downloading again from scratch.
            proxies (:obj:`Dict[str, str]`, `optional`):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
                identifier allowed by git.
            subfolder (:obj:`str`, `optional`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to try to load the fast version of the tokenizer.
            kwargs (additional keyword arguments, `optional`):
                Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__()`` for more details.

        Examples::

            >>> from transformers import AutoTokenizer

            >>> # Download vocabulary from huggingface.co and cache.
            >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

            >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
            >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

            >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            >>> tokenizer = AutoTokenizer.from_pretrained('./test/saved_model/')
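
            >>> # Force the use of the slow (Python) tokenizer even when a fast one is available
            >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=False)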

        """
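        # Resolve the config first: it determines which tokenizer class gets instantiated.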
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        use_fast = kwargs.pop("use_fast", True)

        if config.tokenizer_class is not None:
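            # The config names its tokenizer class explicitly: resolve that class by name,
            # preferring the "Fast" variant when `use_fast` is set.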
            tokenizer_class = None
            if use_fast and not config.tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config.tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config.tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)

            if tokenizer_class is None:
                raise ValueError(
                    "Tokenizer class {} does not exist or is not currently imported.".format(tokenizer_class_candidate)
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

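        # Otherwise, look the config class up in TOKENIZER_MAPPING and prefer the fast tokenizer
        # unless `use_fast=False` or only the slow one is available.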
        if type(config) in TOKENIZER_MAPPING.keys():
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            "Unrecognized configuration class {} to build an AutoTokenizer.\n"
            "Model type should be one of {}.".format(
                config.__class__, ", ".join(c.__name__ for c in TOKENIZER_MAPPING.keys())
            )
        )