test_tokenization_auto.py 16.7 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
# coding=utf-8
Sylvain Gugger's avatar
Sylvain Gugger committed
2
# Copyright 2020 The HuggingFace Team. All rights reserved.
thomwolf's avatar
thomwolf committed
3
4
5
6
7
8
9
10
11
12
13
14
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16
17
import os
import shutil
18
import sys
19
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
20
import unittest
21
from pathlib import Path
thomwolf's avatar
thomwolf committed
22

23
24
import pytest

Aymeric Augustin's avatar
Aymeric Augustin committed
25
26
27
28
from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
29
    BertConfig,
Aymeric Augustin's avatar
Aymeric Augustin committed
30
    BertTokenizer,
31
    BertTokenizerFast,
32
    CTRLTokenizer,
Aymeric Augustin's avatar
Aymeric Augustin committed
33
    GPT2Tokenizer,
34
    GPT2TokenizerFast,
35
    PreTrainedTokenizerFast,
Julien Chaumond's avatar
Julien Chaumond committed
36
    RobertaTokenizer,
37
    RobertaTokenizerFast,
38
    is_tokenizers_available,
Aymeric Augustin's avatar
Aymeric Augustin committed
39
)
40
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
41
42
43
44
45
from transformers.models.auto.tokenization_auto import (
    TOKENIZER_MAPPING,
    get_tokenizer_config,
    tokenizer_class_from_name,
)
Sylvain Gugger's avatar
Sylvain Gugger committed
46
from transformers.models.roberta.configuration_roberta import RobertaConfig
47
48
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
49
    DUMMY_UNKNOWN_IDENTIFIER,
50
    SMALL_MODEL_IDENTIFIER,
51
    RequestCounter,
52
    require_tokenizers,
53
    slow,
54
)
thomwolf's avatar
thomwolf committed
55
56


Yih-Dar's avatar
Yih-Dar committed
57
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
58

59
60
from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_tokenization import CustomTokenizer  # noqa E402
61
62
63


if is_tokenizers_available():
64
    from test_module.custom_tokenization_fast import CustomTokenizerFast
65
66


thomwolf's avatar
thomwolf committed
67
class AutoTokenizerTest(unittest.TestCase):
68
    @slow
thomwolf's avatar
thomwolf committed
69
    def test_tokenizer_from_pretrained(self):
70
        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
thomwolf's avatar
thomwolf committed
71
72
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
73
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
thomwolf's avatar
thomwolf committed
74
75
            self.assertGreater(len(tokenizer), 0)

76
        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
thomwolf's avatar
thomwolf committed
77
78
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
79
            self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))
thomwolf's avatar
thomwolf committed
80
81
            self.assertGreater(len(tokenizer), 0)

Julien Chaumond's avatar
Julien Chaumond committed
82
83
    def test_tokenizer_from_pretrained_identifier(self):
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
84
85
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)
Julien Chaumond's avatar
Julien Chaumond committed
86
87

    def test_tokenizer_from_model_type(self):
88
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
89
90
        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 20)
91

92
93
94
95
96
97
98
99
    def test_tokenizer_from_tokenizer_class(self):
        config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(config, RobertaConfig)
        # Check that tokenizer_type ≠ model_type
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
    def test_tokenizer_from_type(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert", use_fast=False)
            self.assertIsInstance(tokenizer, BertTokenizer)

        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2", use_fast=False)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)

    @require_tokenizers
    def test_tokenizer_from_type_fast(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
            self.assertIsInstance(tokenizer, BertTokenizerFast)

        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
            self.assertIsInstance(tokenizer, GPT2TokenizerFast)

    def test_tokenizer_from_type_incorrect_name(self):
        with pytest.raises(ValueError):
            AutoTokenizer.from_pretrained("./", tokenizer_type="xxx")

133
    @require_tokenizers
134
    def test_tokenizer_identifier_with_correct_config(self):
135
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
136
            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
137
138
139
140
141
142
143
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))

            if isinstance(tokenizer, BertTokenizer):
                self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
            else:
                self.assertEqual(tokenizer.do_lower_case, False)

Sylvain Gugger's avatar
Sylvain Gugger committed
144
            self.assertEqual(tokenizer.model_max_length, 512)
145

146
    @require_tokenizers
147
    def test_tokenizer_identifier_non_existent(self):
148
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
149
            with self.assertRaisesRegex(
150
151
                EnvironmentError,
                "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier",
152
            ):
153
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
Lysandre's avatar
Lysandre committed
154

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
    def test_model_name_edge_cases_in_mappings(self):
        # tests: https://github.com/huggingface/transformers/pull/13251
        # 1. models with `-`, e.g. xlm-roberta -> xlm_roberta
        # 2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
        tokenizers = TOKENIZER_MAPPING.values()
        tokenizer_names = []

        for slow_tok, fast_tok in tokenizers:
            if slow_tok is not None:
                tokenizer_names.append(slow_tok.__name__)

            if fast_tok is not None:
                tokenizer_names.append(fast_tok.__name__)

        for tokenizer_name in tokenizer_names:
            # must find the right class
            tokenizer_class_from_name(tokenizer_name)

173
    @require_tokenizers
174
    def test_from_pretrained_use_fast_toggle(self):
175
176
        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer)
        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast)
177
178
179
180
181
182
183
184
185
186
187

    @require_tokenizers
    def test_do_lower_case(self):
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=False)
        sample = "Hello, world. How are you?"
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])

        tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False)
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])
188
189
190
191
192
193
194
195
196

    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
        tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)
        self.assertEqual(tokenizer.model_max_length, 512)
        self.assertEqual(tokenizer.vocab_size, 30000)
        self.assertEqual(tokenizer.unk_token, "[UNK]")
        self.assertEqual(tokenizer.padding_side, "right")
197
        self.assertEqual(tokenizer.truncation_side, "right")
198
199
200
201
202
203
204
205
206
207
208

    def test_auto_tokenizer_from_local_folder(self):
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)

        self.assertIsInstance(tokenizer2, tokenizer.__class__)
        self.assertEqual(tokenizer2.vocab_size, 12)

209
210
211
212
213
    def test_auto_tokenizer_fast_no_slow(self):
        tokenizer = AutoTokenizer.from_pretrained("ctrl")
        # There is no fast CTRL so this always gives us a slow tokenizer.
        self.assertIsInstance(tokenizer, CTRLTokenizer)

214
215
216
    def test_get_tokenizer_config(self):
        # Check we can load the tokenizer config of an online model.
        config = get_tokenizer_config("bert-base-cased")
217
        _ = config.pop("_commit_hash", None)
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
        # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated.
        self.assertEqual(config, {"do_lower_case": False})

        # This model does not have a tokenizer_config so we get back an empty dict.
        config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(config, {})

        # A tokenizer saved with `save_pretrained` always creates a tokenizer config.
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            config = get_tokenizer_config(tmp_dir)

        # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
        self.assertEqual(config["tokenizer_class"], "BertTokenizer")
233
234
235

    def test_new_tokenizer_registration(self):
        try:
236
            AutoConfig.register("custom", CustomConfig)
237

238
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
239
240
241
242
            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

243
            tokenizer = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
244
245
246
247
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
248
                self.assertIsInstance(new_tokenizer, CustomTokenizer)
249
250

        finally:
251
252
253
254
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
255
256
257
258

    @require_tokenizers
    def test_new_tokenizer_fast_registration(self):
        try:
259
            AutoConfig.register("custom", CustomConfig)
260
261

            # Can register in two steps
262
263
264
265
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))
266

267
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
268
            # Can register in one step
269
270
271
272
            AutoTokenizer.register(
                CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
            )
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))
273
274
275
276
277
278
279
280
281
282

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

            # We pass through a bert tokenizer fast cause there is no converter slow to fast for our new toknizer
            # and that model does not have a tokenizer.json
            with tempfile.TemporaryDirectory() as tmp_dir:
                bert_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
                bert_tokenizer.save_pretrained(tmp_dir)
283
                tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
284
285
286
287
288

            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
289
                self.assertIsInstance(new_tokenizer, CustomTokenizerFast)
290
291

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
292
                self.assertIsInstance(new_tokenizer, CustomTokenizer)
293
294

        finally:
295
296
297
298
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
299

300
301
302
    def test_from_pretrained_dynamic_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
        self.assertTrue(tokenizer.special_attribute_present)
303
304
305
306
307
308
        # Test tokenizer can be reloaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True)
        self.assertTrue(reloaded_tokenizer.special_attribute_present)

309
310
        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
311
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
312
313
314
315
316
317
318

            # Test we can also load the slow version
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
319
320
321
322
323
324
            # Test tokenizer can be reloaded.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)
                reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True, use_fast=False)
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
            self.assertTrue(reloaded_tokenizer.special_attribute_present)
325
326
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
327
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345

    def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True
        )
        self.assertTrue(tokenizer.special_attribute_present)
        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")

            # Test we can also load the slow version
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")

346
347
348
349
350
351
352
353
354
355
356
    def test_repo_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
        ):
            _ = AutoTokenizer.from_pretrained("bert-base")

    def test_revision_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        ):
            _ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
357
358
359
360
361
362
363

    def test_cached_tokenizer_has_minimum_calls_to_head(self):
        # Make sure we have cached the tokenizer.
        _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
        with RequestCounter() as counter:
            _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
            self.assertEqual(counter.get_request_count, 0)
364
            self.assertEqual(counter.head_request_count, 1)
365
            self.assertEqual(counter.other_request_count, 0)