test_tokenization_auto.py 20 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
# coding=utf-8
Sylvain Gugger's avatar
Sylvain Gugger committed
2
# Copyright 2020 The HuggingFace Team. All rights reserved.
thomwolf's avatar
thomwolf committed
3
4
5
6
7
8
9
10
11
12
13
14
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16
17
import os
import shutil
18
import sys
19
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
20
import unittest
21
from pathlib import Path
thomwolf's avatar
thomwolf committed
22

23
24
import pytest

25
import transformers
Aymeric Augustin's avatar
Aymeric Augustin committed
26
27
28
29
from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
30
    BertConfig,
Aymeric Augustin's avatar
Aymeric Augustin committed
31
    BertTokenizer,
32
    BertTokenizerFast,
33
    CTRLTokenizer,
Aymeric Augustin's avatar
Aymeric Augustin committed
34
    GPT2Tokenizer,
35
    GPT2TokenizerFast,
36
    PreTrainedTokenizerFast,
Julien Chaumond's avatar
Julien Chaumond committed
37
    RobertaTokenizer,
38
    RobertaTokenizerFast,
39
    is_tokenizers_available,
Aymeric Augustin's avatar
Aymeric Augustin committed
40
)
41
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
42
43
44
45
46
from transformers.models.auto.tokenization_auto import (
    TOKENIZER_MAPPING,
    get_tokenizer_config,
    tokenizer_class_from_name,
)
Sylvain Gugger's avatar
Sylvain Gugger committed
47
from transformers.models.roberta.configuration_roberta import RobertaConfig
48
49
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
50
    DUMMY_UNKNOWN_IDENTIFIER,
51
    SMALL_MODEL_IDENTIFIER,
52
    RequestCounter,
53
    require_tokenizers,
54
    slow,
55
)
thomwolf's avatar
thomwolf committed
56
57


Yih-Dar's avatar
Yih-Dar committed
58
# Put the `utils` directory located four `parent` hops up from this file on the import path
# (presumably so the `test_module` imports that follow can be resolved -- confirm against the repo layout).
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
59

60
61
from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_tokenization import CustomTokenizer  # noqa E402
62
63
64


if is_tokenizers_available():
65
    from test_module.custom_tokenization_fast import CustomTokenizerFast
66
67


thomwolf's avatar
thomwolf committed
68
class AutoTokenizerTest(unittest.TestCase):
69
70
71
    def setUp(self):
        """Set the remote-code prompt timeout to 0 for the duration of each test.

        The previous value of `TIME_OUT_REMOTE_CODE` is restored once the test finishes, so the
        module-level override does not leak into other tests running in the same process.
        """
        dynamic_utils = transformers.dynamic_module_utils
        previous_timeout = dynamic_utils.TIME_OUT_REMOTE_CODE
        # Registered before the override so the original value is restored even if the test errors out.
        self.addCleanup(setattr, dynamic_utils, "TIME_OUT_REMOTE_CODE", previous_timeout)
        dynamic_utils.TIME_OUT_REMOTE_CODE = 0

72
    @slow
    def test_tokenizer_from_pretrained(self):
        """Load every name listed in the BERT and GPT-2 config archive maps through `AutoTokenizer`."""
        bert_classes = (BertTokenizer, BertTokenizerFast)
        gpt2_classes = (GPT2Tokenizer, GPT2TokenizerFast)

        for checkpoint in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            # Names containing "japanese" are skipped by this check.
            if "japanese" in checkpoint:
                continue
            loaded = AutoTokenizer.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded)
            self.assertIsInstance(loaded, bert_classes)
            self.assertGreater(len(loaded), 0)

        for checkpoint in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            loaded = AutoTokenizer.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded)
            self.assertIsInstance(loaded, gpt2_classes)
            self.assertGreater(len(loaded), 0)

Julien Chaumond's avatar
Julien Chaumond committed
86
87
    def test_tokenizer_from_pretrained_identifier(self):
        """`SMALL_MODEL_IDENTIFIER` must load as a BERT tokenizer (slow or fast) with a vocab size of 12."""
        expected_classes = (BertTokenizer, BertTokenizerFast)
        loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)

        self.assertIsInstance(loaded, expected_classes)
        self.assertEqual(loaded.vocab_size, 12)
Julien Chaumond's avatar
Julien Chaumond committed
90
91

    def test_tokenizer_from_model_type(self):
        """`DUMMY_UNKNOWN_IDENTIFIER` must load as a RoBERTa tokenizer (slow or fast) with a vocab size of 20."""
        expected_classes = (RobertaTokenizer, RobertaTokenizerFast)
        loaded = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)

        self.assertIsInstance(loaded, expected_classes)
        self.assertEqual(loaded.vocab_size, 20)
95

96
97
98
99
100
101
102
103
    def test_tokenizer_from_tokenizer_class(self):
        """A checkpoint whose config is RoBERTa may still resolve to a BERT tokenizer."""
        identifier = DUMMY_DIFF_TOKENIZER_IDENTIFIER

        roberta_config = AutoConfig.from_pretrained(identifier)
        self.assertIsInstance(roberta_config, RobertaConfig)

        # Check that tokenizer_type != model_type: the config is RoBERTa, the tokenizer must be BERT.
        loaded = AutoTokenizer.from_pretrained(identifier, config=roberta_config)
        self.assertIsInstance(loaded, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(loaded.vocab_size, 12)

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
    def test_tokenizer_from_type(self):
        """`tokenizer_type` together with `use_fast=False` selects the slow class for a bare vocab folder."""
        # BERT: the folder only contains a copy of the fixture `vocab.txt`.
        with tempfile.TemporaryDirectory() as bert_dir:
            vocab_target = os.path.join(bert_dir, "vocab.txt")
            shutil.copy("./tests/fixtures/vocab.txt", vocab_target)

            slow_bert = AutoTokenizer.from_pretrained(bert_dir, tokenizer_type="bert", use_fast=False)
            self.assertIsInstance(slow_bert, BertTokenizer)

        # GPT-2: the folder contains copies of the fixture `vocab.json` and `merges.txt`.
        with tempfile.TemporaryDirectory() as gpt2_dir:
            fixture_copies = (
                ("./tests/fixtures/vocab.json", "vocab.json"),
                ("./tests/fixtures/merges.txt", "merges.txt"),
            )
            for fixture_path, file_name in fixture_copies:
                shutil.copy(fixture_path, os.path.join(gpt2_dir, file_name))

            slow_gpt2 = AutoTokenizer.from_pretrained(gpt2_dir, tokenizer_type="gpt2", use_fast=False)
            self.assertIsInstance(slow_gpt2, GPT2Tokenizer)

    @require_tokenizers
    def test_tokenizer_from_type_fast(self):
        """`tokenizer_type` without `use_fast` yields the fast class for a bare vocab folder."""
        # BERT: the folder only contains a copy of the fixture `vocab.txt`.
        with tempfile.TemporaryDirectory() as bert_dir:
            vocab_target = os.path.join(bert_dir, "vocab.txt")
            shutil.copy("./tests/fixtures/vocab.txt", vocab_target)

            fast_bert = AutoTokenizer.from_pretrained(bert_dir, tokenizer_type="bert")
            self.assertIsInstance(fast_bert, BertTokenizerFast)

        # GPT-2: the folder contains copies of the fixture `vocab.json` and `merges.txt`.
        with tempfile.TemporaryDirectory() as gpt2_dir:
            fixture_copies = (
                ("./tests/fixtures/vocab.json", "vocab.json"),
                ("./tests/fixtures/merges.txt", "merges.txt"),
            )
            for fixture_path, file_name in fixture_copies:
                shutil.copy(fixture_path, os.path.join(gpt2_dir, file_name))

            fast_gpt2 = AutoTokenizer.from_pretrained(gpt2_dir, tokenizer_type="gpt2")
            self.assertIsInstance(fast_gpt2, GPT2TokenizerFast)

    def test_tokenizer_from_type_incorrect_name(self):
        """An unrecognised `tokenizer_type` must be rejected with a `ValueError`."""
        unknown_type = "xxx"
        expect_value_error = pytest.raises(ValueError)
        with expect_value_error:
            AutoTokenizer.from_pretrained("./", tokenizer_type=unknown_type)

137
    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
        """The Dutch cased BERT checkpoint must load un-lowercased with a max length of 512 via every loader."""
        loaders = (BertTokenizer, BertTokenizerFast, AutoTokenizer)
        for loader in loaders:
            loaded = loader.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(loaded, (BertTokenizer, BertTokenizerFast))

            # The slow tokenizer is checked through its `basic_tokenizer`, any other one directly.
            casing_holder = loaded.basic_tokenizer if isinstance(loaded, BertTokenizer) else loaded
            self.assertEqual(casing_holder.do_lower_case, False)

            self.assertEqual(loaded.model_max_length, 512)
149

150
    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):
        """Loading a repo id that does not exist must raise `EnvironmentError` with an explanatory message."""
        missing_repo = "julien-c/herlolip-not-exists"
        expected_message = "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier"

        for loader in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            with self.assertRaisesRegex(EnvironmentError, expected_message):
                _ = loader.from_pretrained(missing_repo)
Lysandre's avatar
Lysandre committed
158

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
    def test_model_name_edge_cases_in_mappings(self):
        """Every tokenizer class in `TOKENIZER_MAPPING` must be resolvable again from its class name.

        Covers https://github.com/huggingface/transformers/pull/13251:
        1. models with `-`, e.g. xlm-roberta -> xlm_roberta
        2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
        """
        tokenizer_names = []
        for slow_tok, fast_tok in TOKENIZER_MAPPING.values():
            # Either slot of a mapping entry may be empty.
            tokenizer_names.extend(tok.__name__ for tok in (slow_tok, fast_tok) if tok is not None)

        for tokenizer_name in tokenizer_names:
            # must find the right class
            # NOTE(review): assumes the helper reports a miss by returning `None` rather than raising, which
            # is why the result is asserted instead of being discarded -- confirm against its implementation.
            self.assertIsNotNone(
                tokenizer_class_from_name(tokenizer_name),
                f"tokenizer_class_from_name could not resolve {tokenizer_name}",
            )

177
    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
        """`use_fast=False` gives the slow BERT tokenizer; leaving it unset gives the fast one."""
        checkpoint = "bert-base-cased"

        slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
        self.assertIsInstance(slow_tokenizer, BertTokenizer)

        default_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.assertIsInstance(default_tokenizer, BertTokenizerFast)
181
182
183
184
185
186
187
188
189
190
191

    @require_tokenizers
    def test_do_lower_case(self):
        """With `do_lower_case=False`, the first token of a capitalised sample must come out as `[UNK]`."""
        sample = "Hello, world. How are you?"

        for checkpoint in ("distilbert-base-uncased", "microsoft/mpnet-base"):
            loaded = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
            first_token = loaded.tokenize(sample)[0]
            self.assertEqual("[UNK]", first_token)
192
193
194
195
196
197
198
199
200

    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
        """The dummy fast checkpoint must load as a plain `PreTrainedTokenizerFast` with the expected settings."""
        loaded = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")

        # Compared on the exact type, so a subclass instance does not satisfy this check.
        self.assertEqual(type(loaded), PreTrainedTokenizerFast)

        expected_attributes = (
            ("model_max_length", 512),
            ("vocab_size", 30000),
            ("unk_token", "[UNK]"),
            ("padding_side", "right"),
            ("truncation_side", "right"),
        )
        for attribute, expected_value in expected_attributes:
            self.assertEqual(getattr(loaded, attribute), expected_value)
202
203
204
205
206
207
208
209
210
211
212

    def test_auto_tokenizer_from_local_folder(self):
        """A tokenizer saved to a local folder must reload through `AutoTokenizer` as the same class."""
        original = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(original, (BertTokenizer, BertTokenizerFast))

        with tempfile.TemporaryDirectory() as save_dir:
            original.save_pretrained(save_dir)
            reloaded = AutoTokenizer.from_pretrained(save_dir)

        self.assertIsInstance(reloaded, original.__class__)
        self.assertEqual(reloaded.vocab_size, 12)

213
214
215
216
217
    def test_auto_tokenizer_fast_no_slow(self):
        """Loading "ctrl" must return the slow `CTRLTokenizer`."""
        ctrl_tokenizer = AutoTokenizer.from_pretrained("ctrl")
        # There is no fast CTRL so this always gives us a slow tokenizer.
        expected_class = CTRLTokenizer
        self.assertIsInstance(ctrl_tokenizer, expected_class)

218
219
220
    def test_get_tokenizer_config(self):
        """Exercise `get_tokenizer_config` on an online model with a config, one without, and a local save."""
        # Check we can load the tokenizer config of an online model.
        online_config = get_tokenizer_config("bert-base-cased")
        # `_commit_hash` is removed (when present) before comparing the remaining content.
        online_config.pop("_commit_hash", None)
        # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated.
        self.assertEqual(online_config, {"do_lower_case": False})

        # This model does not have a tokenizer_config so we get back an empty dict.
        empty_config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(empty_config, {})

        # A tokenizer saved with `save_pretrained` always creates a tokenizer config.
        small_tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as save_dir:
            small_tokenizer.save_pretrained(save_dir)
            saved_config = get_tokenizer_config(save_dir)

        # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
        self.assertEqual(saved_config["tokenizer_class"], "BertTokenizer")
237
238
239

    def test_new_tokenizer_registration(self):
        """Register a custom config with a slow tokenizer and reload a saved tokenizer through `AutoTokenizer`."""
        try:
            AutoConfig.register("custom", CustomConfig)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

            custom_tokenizer = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
            with tempfile.TemporaryDirectory() as save_dir:
                custom_tokenizer.save_pretrained(save_dir)

                reloaded = AutoTokenizer.from_pretrained(save_dir)
                self.assertIsInstance(reloaded, CustomTokenizer)

        finally:
            # Undo the registrations so they do not leak into other tests.
            registrations = (
                (CONFIG_MAPPING._extra_content, "custom"),
                (TOKENIZER_MAPPING._extra_content, CustomConfig),
            )
            for extra_content, key in registrations:
                if key in extra_content:
                    del extra_content[key]
259
260
261
262

    @require_tokenizers
    def test_new_tokenizer_fast_registration(self):
        """Register slow and fast custom tokenizers (in two steps, then in one) and reload them via `AutoTokenizer`."""
        both_classes = (CustomTokenizer, CustomTokenizerFast)
        try:
            AutoConfig.register("custom", CustomConfig)

            # Can register in two steps
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], both_classes)

            # Drop the entry so that the one-step registration starts from scratch.
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
            # Can register in one step
            AutoTokenizer.register(
                CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
            )
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], both_classes)

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

            # We go through a fast BERT tokenizer because there is no slow-to-fast converter for our new
            # tokenizer and that model does not have a tokenizer.json
            with tempfile.TemporaryDirectory() as bert_dir:
                fast_bert = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
                fast_bert.save_pretrained(bert_dir)
                custom_fast = CustomTokenizerFast.from_pretrained(bert_dir)

            with tempfile.TemporaryDirectory() as save_dir:
                custom_fast.save_pretrained(save_dir)

                reloaded_fast = AutoTokenizer.from_pretrained(save_dir)
                self.assertIsInstance(reloaded_fast, CustomTokenizerFast)

                reloaded_slow = AutoTokenizer.from_pretrained(save_dir, use_fast=False)
                self.assertIsInstance(reloaded_slow, CustomTokenizer)

        finally:
            # Undo the registrations so they do not leak into other tests.
            registrations = (
                (CONFIG_MAPPING._extra_content, "custom"),
                (TOKENIZER_MAPPING._extra_content, CustomConfig),
            )
            for extra_content, key in registrations:
                if key in extra_content:
                    del extra_content[key]
303

304
    def test_from_pretrained_dynamic_tokenizer(self):
        """A tokenizer defined by Hub code loads only with `trust_remote_code=True` and survives a save/reload."""
        repo_id = "hf-internal-testing/test_dynamic_tokenizer"

        # If remote code is not set, we will time out when asking whether to load the model.
        with self.assertRaises(ValueError):
            AutoTokenizer.from_pretrained(repo_id)
        # If remote code is disabled, we can't load this config.
        with self.assertRaises(ValueError):
            AutoTokenizer.from_pretrained(repo_id, trust_remote_code=False)

        hub_tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        self.assertTrue(hub_tokenizer.special_attribute_present)

        # Test tokenizer can be reloaded.
        with tempfile.TemporaryDirectory() as save_dir:
            hub_tokenizer.save_pretrained(save_dir)
            reloaded = AutoTokenizer.from_pretrained(save_dir, trust_remote_code=True)
        self.assertTrue(reloaded.special_attribute_present)

        if not is_tokenizers_available():
            # Without `tokenizers`, only the slow class can be produced.
            self.assertEqual(hub_tokenizer.__class__.__name__, "NewTokenizer")
            self.assertEqual(reloaded.__class__.__name__, "NewTokenizer")
            return

        self.assertEqual(hub_tokenizer.__class__.__name__, "NewTokenizerFast")
        self.assertEqual(reloaded.__class__.__name__, "NewTokenizerFast")

        # Test we can also load the slow version
        slow_tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, use_fast=False)
        self.assertTrue(slow_tokenizer.special_attribute_present)
        self.assertEqual(slow_tokenizer.__class__.__name__, "NewTokenizer")

        # Test tokenizer can be reloaded.
        with tempfile.TemporaryDirectory() as save_dir:
            slow_tokenizer.save_pretrained(save_dir)
            slow_reloaded = AutoTokenizer.from_pretrained(save_dir, trust_remote_code=True, use_fast=False)
        self.assertEqual(slow_reloaded.__class__.__name__, "NewTokenizer")
        self.assertTrue(slow_reloaded.special_attribute_present)
341

342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
    @require_tokenizers
    def test_from_pretrained_dynamic_tokenizer_conflict(self):
        """When local and Hub tokenizer classes share a name, `trust_remote_code` decides which one is used.

        The locally registered classes carry `special_attribute_present = False`; the test treats a true
        value as the marker of the Hub implementation.
        """

        class NewTokenizer(BertTokenizer):
            special_attribute_present = False

        class NewTokenizerFast(BertTokenizerFast):
            slow_tokenizer_class = NewTokenizer
            special_attribute_present = False

        repo_id = "hf-internal-testing/test_dynamic_tokenizer"
        # Each entry: (remote-code keyword arguments, whether the Hub implementation is expected).
        scenarios = (
            # If remote code is not set, the default is to use local
            ({}, False),
            # If remote code is disabled, we load the local one.
            ({"trust_remote_code": False}, False),
            # If remote is enabled, we load from the Hub
            ({"trust_remote_code": True}, True),
        )

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast)

            for remote_kwargs, expect_hub_code in scenarios:
                check_marker = self.assertTrue if expect_hub_code else self.assertFalse

                fast_tokenizer = AutoTokenizer.from_pretrained(repo_id, **remote_kwargs)
                self.assertEqual(fast_tokenizer.__class__.__name__, "NewTokenizerFast")
                check_marker(fast_tokenizer.special_attribute_present)

                slow_tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=False, **remote_kwargs)
                self.assertEqual(slow_tokenizer.__class__.__name__, "NewTokenizer")
                check_marker(slow_tokenizer.special_attribute_present)

        finally:
            # Undo the registrations so they do not leak into other tests.
            registrations = (
                (CONFIG_MAPPING._extra_content, "custom"),
                (TOKENIZER_MAPPING._extra_content, CustomConfig),
            )
            for extra_content, key in registrations:
                if key in extra_content:
                    del extra_content[key]

393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
    def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
        """The legacy-format dynamic tokenizer repo must load its Hub classes with `trust_remote_code=True`."""
        repo_id = "hf-internal-testing/test_dynamic_tokenizer_legacy"

        default_tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        self.assertTrue(default_tokenizer.special_attribute_present)

        if not is_tokenizers_available():
            # Without `tokenizers`, the default load is expected to be the slow class.
            self.assertEqual(default_tokenizer.__class__.__name__, "NewTokenizer")
            return

        self.assertEqual(default_tokenizer.__class__.__name__, "NewTokenizerFast")

        # Test we can also load the slow version
        slow_tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, use_fast=False)
        self.assertTrue(slow_tokenizer.special_attribute_present)
        self.assertEqual(slow_tokenizer.__class__.__name__, "NewTokenizer")

410
411
412
413
414
415
416
417
418
419
420
    def test_repo_not_found(self):
        """An identifier that is neither a local folder nor a valid model id must raise `EnvironmentError`."""
        expected_message = "bert-base is not a local folder and is not a valid model identifier"
        with self.assertRaisesRegex(EnvironmentError, expected_message):
            _ = AutoTokenizer.from_pretrained("bert-base")

    def test_revision_not_found(self):
        """An unknown `revision` must raise `EnvironmentError` naming the invalid git identifier."""
        # Parentheses are escaped because the message is matched as a regular expression.
        expected_message = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        with self.assertRaisesRegex(EnvironmentError, expected_message):
            _ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
421
422
423
424
425
426
427

    def test_cached_tokenizer_has_minimum_calls_to_head(self):
        """Reloading an already cached tokenizer must issue exactly one HEAD request and no other request."""
        checkpoint = "hf-internal-testing/tiny-random-bert"

        # Make sure we have cached the tokenizer.
        _ = AutoTokenizer.from_pretrained(checkpoint)

        with RequestCounter() as request_counter:
            _ = AutoTokenizer.from_pretrained(checkpoint)
            self.assertEqual(request_counter.get_request_count, 0)
            self.assertEqual(request_counter.head_request_count, 1)
            self.assertEqual(request_counter.other_request_count, 0)