# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

import pytest

import transformers
from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
    BertConfig,
    BertTokenizer,
    BertTokenizerFast,
    CTRLTokenizer,
    GPT2Tokenizer,
    GPT2TokenizerFast,
    PreTrainedTokenizerFast,
    RobertaTokenizer,
    RobertaTokenizerFast,
    is_tokenizers_available,
)
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.auto.tokenization_auto import (
    TOKENIZER_MAPPING,
    get_tokenizer_config,
    tokenizer_class_from_name,
)
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
    DUMMY_UNKNOWN_IDENTIFIER,
    SMALL_MODEL_IDENTIFIER,
    RequestCounter,
    require_tokenizers,
    slow,
)


sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))

from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_tokenization import CustomTokenizer  # noqa E402


if is_tokenizers_available():
    from test_module.custom_tokenization_fast import CustomTokenizerFast


class AutoTokenizerTest(unittest.TestCase):
    def setUp(self):
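        # Time out the interactive trust_remote_code prompt immediately so tests fail fast instead of hanging.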
        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0

    @slow
    def test_tokenizer_from_pretrained(self):
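        # Every BERT checkpoint should resolve to a BERT tokenizer (the Japanese ones are skipped: they need extra dependencies).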
        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
            self.assertGreater(len(tokenizer), 0)

        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))
            self.assertGreater(len(tokenizer), 0)

    def test_tokenizer_from_pretrained_identifier(self):
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

    def test_tokenizer_from_model_type(self):
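        # This checkpoint is not a known identifier, so the tokenizer class is resolved from the model_type in its config.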
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 20)

    def test_tokenizer_from_tokenizer_class(self):
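        # The tokenizer class declared in the repo's tokenizer config wins over the config's model_type.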
        config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(config, RobertaConfig)
        # Check that tokenizer_type ≠ model_type
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

    def test_tokenizer_from_type(self):
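        # Passing tokenizer_type skips config resolution entirely; only the vocab files are needed locally.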
        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert", use_fast=False)
            self.assertIsInstance(tokenizer, BertTokenizer)

        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2", use_fast=False)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)

    @require_tokenizers
    def test_tokenizer_from_type_fast(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
            self.assertIsInstance(tokenizer, BertTokenizerFast)

        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
            self.assertIsInstance(tokenizer, GPT2TokenizerFast)

    def test_tokenizer_from_type_incorrect_name(self):
        with pytest.raises(ValueError):
            AutoTokenizer.from_pretrained("./", tokenizer_type="xxx")

    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
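        # This cased checkpoint sets do_lower_case=False in its tokenizer config; check that it is honored.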
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))

            if isinstance(tokenizer, BertTokenizer):
                self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
            else:
                self.assertEqual(tokenizer.do_lower_case, False)

            self.assertEqual(tokenizer.model_max_length, 512)

    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            with self.assertRaisesRegex(
                EnvironmentError,
                "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier",
            ):
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")

    def test_model_name_edge_cases_in_mappings(self):
        # tests: https://github.com/huggingface/transformers/pull/13251
        # 1. models with `-`, e.g. xlm-roberta -> xlm_roberta
        # 2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
        tokenizers = TOKENIZER_MAPPING.values()
        tokenizer_names = []

        for slow_tok, fast_tok in tokenizers:
            if slow_tok is not None:
                tokenizer_names.append(slow_tok.__name__)

            if fast_tok is not None:
                tokenizer_names.append(fast_tok.__name__)

        for tokenizer_name in tokenizer_names:
            # must find the right class
            tokenizer_class_from_name(tokenizer_name)

    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
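        # use_fast defaults to True, so the slow class is only returned when explicitly requested.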
        self.assertIsInstance(
            AutoTokenizer.from_pretrained("google-bert/bert-base-cased", use_fast=False), BertTokenizer
        )
        self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)

    @require_tokenizers
    def test_do_lower_case(self):
        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)
        sample = "Hello, world. How are you?"
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])

        tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False)
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])

    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
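        # This repo only ships a tokenizer.json, so we get the generic fast class rather than a model-specific one.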
        tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)
        self.assertEqual(tokenizer.model_max_length, 512)
        self.assertEqual(tokenizer.vocab_size, 30000)
        self.assertEqual(tokenizer.unk_token, "[UNK]")
        self.assertEqual(tokenizer.padding_side, "right")
        self.assertEqual(tokenizer.truncation_side, "right")

    def test_auto_tokenizer_from_local_folder(self):
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)

        self.assertIsInstance(tokenizer2, tokenizer.__class__)
        self.assertEqual(tokenizer2.vocab_size, 12)

    def test_auto_tokenizer_fast_no_slow(self):
        tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        # There is no fast CTRL so this always gives us a slow tokenizer.
        self.assertIsInstance(tokenizer, CTRLTokenizer)

    def test_get_tokenizer_config(self):
        # Check we can load the tokenizer config of an online model.
        config = get_tokenizer_config("google-bert/bert-base-cased")
        _ = config.pop("_commit_hash", None)
        # If we ever update google-bert/bert-base-cased tokenizer config, this dict here will need to be updated.
        self.assertEqual(config, {"do_lower_case": False, "model_max_length": 512})

        # This model does not have a tokenizer_config so we get back an empty dict.
        config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(config, {})

        # A tokenizer saved with `save_pretrained` always creates a tokenizer config.
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            config = get_tokenizer_config(tmp_dir)

        # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
        self.assertEqual(config["tokenizer_class"], "BertTokenizer")

    def test_new_tokenizer_registration(self):
        try:
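            # Register a custom config/tokenizer pair and check the Auto classes pick it up.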
            AutoConfig.register("custom", CustomConfig)

            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

            tokenizer = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
                self.assertIsInstance(new_tokenizer, CustomTokenizer)

        finally:
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]

    @require_tokenizers
    def test_new_tokenizer_fast_registration(self):
        try:
            AutoConfig.register("custom", CustomConfig)

            # Can register in two steps
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))

            del TOKENIZER_MAPPING._extra_content[CustomConfig]
            # Can register in one step
            AutoTokenizer.register(
                CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
            )
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

            # We pass through a BertTokenizerFast because there is no slow-to-fast converter for our new
            # tokenizer, and that model does not have a tokenizer.json file.
            with tempfile.TemporaryDirectory() as tmp_dir:
                bert_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
                bert_tokenizer.save_pretrained(tmp_dir)
                tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)

            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
                self.assertIsInstance(new_tokenizer, CustomTokenizerFast)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
                self.assertIsInstance(new_tokenizer, CustomTokenizer)

        finally:
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]

    def test_from_pretrained_dynamic_tokenizer(self):
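        # This repo ships its tokenizer as code on the Hub, so loading it requires trust_remote_code=True.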
        # If remote code is not set, we will time out when asking whether to load the model.
        with self.assertRaises(ValueError):
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
        # If remote code is disabled, we can't load this config.
        with self.assertRaises(ValueError):
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
            )

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
        self.assertTrue(tokenizer.special_attribute_present)
        # Test tokenizer can be reloaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True)
        self.assertTrue(reloaded_tokenizer.special_attribute_present)

        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")

            # Test we can also load the slow version
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            # Test tokenizer can be reloaded.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)
                reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True, use_fast=False)
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
            self.assertTrue(reloaded_tokenizer.special_attribute_present)
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")

    @require_tokenizers
    def test_from_pretrained_dynamic_tokenizer_conflict(self):
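        # A local tokenizer registered for the same config as the remote code: local classes win unless trust_remote_code=True.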
        class NewTokenizer(BertTokenizer):
            special_attribute_present = False

        class NewTokenizerFast(BertTokenizerFast):
            slow_tokenizer_class = NewTokenizer
            special_attribute_present = False

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast)
            # If remote code is not set, the default is to use the local classes.
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertFalse(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertFalse(tokenizer.special_attribute_present)

            # If remote code is disabled, we load the local one.
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertFalse(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertFalse(tokenizer.special_attribute_present)

            # If remote is enabled, we load from the Hub
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertTrue(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertTrue(tokenizer.special_attribute_present)

        finally:
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]

    def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
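        # Same checks as above for a repo that declares its custom tokenizer in the legacy auto_map format.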
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True
        )
        self.assertTrue(tokenizer.special_attribute_present)
        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")

            # Test we can also load the slow version
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")

    def test_repo_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
        ):
            _ = AutoTokenizer.from_pretrained("bert-base")

    def test_revision_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        ):
            _ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")

    def test_cached_tokenizer_has_minimum_calls_to_head(self):
        # Make sure we have cached the tokenizer.
        _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
        with RequestCounter() as counter:
            _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
        self.assertEqual(counter["GET"], 0)
        self.assertEqual(counter["HEAD"], 1)
        self.assertEqual(counter.total_calls, 1)

    def test_init_tokenizer_with_trust(self):
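        # Build a fake local repo whose config and tokenizer live in custom code, then exercise the trust_remote_code gate.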
        nop_tokenizer_code = """
import transformers

class NopTokenizer(transformers.PreTrainedTokenizer):
    def get_vocab(self):
        return {}
"""

        nop_config_code = """
from transformers import PretrainedConfig

class NopConfig(PretrainedConfig):
    model_type = "test_unregistered_dynamic"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
"""

        with tempfile.TemporaryDirectory() as tmp_dir:
            fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
            fake_repo = os.path.join(tmp_dir, fake_model_id)
            os.makedirs(fake_repo)

            tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
            with open(tokenizer_src_file, "w") as wfp:
                wfp.write(nop_tokenizer_code)

            model_config_src_file = os.path.join(fake_repo, "config.py")
            with open(model_config_src_file, "w") as wfp:
                wfp.write(nop_config_code)

            config = {
                "model_type": "test_unregistered_dynamic",
                "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
            }

            config_file = os.path.join(fake_repo, "config.json")
            with open(config_file, "w") as wfp:
                json.dump(config, wfp, indent=2)

            tokenizer_config = {
                "auto_map": {
                    "AutoTokenizer": [
                        f"{fake_model_id}--tokenizer.NopTokenizer",
                        None,
                    ]
                }
            }

            tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
            with open(tokenizer_config_file, "w") as wfp:
                json.dump(tokenizer_config, wfp, indent=2)

            prev_dir = os.getcwd()
            try:
                # subdir= also appears to be broken in from_pretrained, so changing directories is necessary
                os.chdir(tmp_dir)

                # this should work because we trust the code
                _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)
                try:
                    # this should fail because we don't trust the code and are not at a terminal to answer interactively
                    _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False)
                    self.fail("AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueError")
                except ValueError:
                    pass
            finally:
                os.chdir(prev_dir)