test_tokenization_auto.py 13.8 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
# coding=utf-8
Sylvain Gugger's avatar
Sylvain Gugger committed
2
# Copyright 2020 The HuggingFace Team. All rights reserved.
thomwolf's avatar
thomwolf committed
3
4
5
6
7
8
9
10
11
12
13
14
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16
17
import os
import shutil
18
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
19
import unittest
thomwolf's avatar
thomwolf committed
20

21
22
import pytest

Aymeric Augustin's avatar
Aymeric Augustin committed
23
24
25
26
from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
27
    BertConfig,
Aymeric Augustin's avatar
Aymeric Augustin committed
28
    BertTokenizer,
29
    BertTokenizerFast,
30
    CTRLTokenizer,
Aymeric Augustin's avatar
Aymeric Augustin committed
31
    GPT2Tokenizer,
32
    GPT2TokenizerFast,
33
    PretrainedConfig,
34
    PreTrainedTokenizerFast,
Julien Chaumond's avatar
Julien Chaumond committed
35
    RobertaTokenizer,
36
    RobertaTokenizerFast,
37
    is_tokenizers_available,
Aymeric Augustin's avatar
Aymeric Augustin committed
38
)
39
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
40
41
42
43
44
from transformers.models.auto.tokenization_auto import (
    TOKENIZER_MAPPING,
    get_tokenizer_config,
    tokenizer_class_from_name,
)
Sylvain Gugger's avatar
Sylvain Gugger committed
45
from transformers.models.roberta.configuration_roberta import RobertaConfig
46
47
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
48
    DUMMY_UNKNOWN_IDENTIFIER,
49
    SMALL_MODEL_IDENTIFIER,
50
    require_tokenizers,
51
    slow,
52
)
thomwolf's avatar
thomwolf committed
53
54


55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class NewConfig(PretrainedConfig):
    """Minimal custom configuration used by the registration tests in this file."""

    # Model type string under which the tests register this config with `AutoConfig`.
    model_type = "new-model"


class NewTokenizer(BertTokenizer):
    """Custom slow tokenizer for the registration tests; adds nothing on top of `BertTokenizer`."""


# The fast tokenizer classes are only usable when the `tokenizers` backend is installed, so the custom fast
# tokenizer is defined conditionally.
if is_tokenizers_available():

    class NewTokenizerFast(BertTokenizerFast):
        """Custom fast tokenizer for the registration tests, paired with `NewTokenizer` as its slow class."""

        slow_tokenizer_class = NewTokenizer


thomwolf's avatar
thomwolf committed
70
class AutoTokenizerTest(unittest.TestCase):
71
    @slow
    def test_tokenizer_from_pretrained(self):
        """Load the listed BERT and GPT-2 checkpoints through `AutoTokenizer` and check class and vocabulary.

        BERT checkpoint names containing "japanese" are left out of the check.
        """
        bert_checkpoints = [name for name in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in name]
        gpt2_checkpoints = list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())

        # Each entry pairs a group of checkpoints with the (slow, fast) classes they may resolve to.
        expectations = (
            (bert_checkpoints, (BertTokenizer, BertTokenizerFast)),
            (gpt2_checkpoints, (GPT2Tokenizer, GPT2TokenizerFast)),
        )

        for checkpoints, expected_classes in expectations:
            for checkpoint in checkpoints:
                loaded = AutoTokenizer.from_pretrained(checkpoint)
                self.assertIsNotNone(loaded)
                self.assertIsInstance(loaded, expected_classes)
                self.assertGreater(len(loaded), 0)

Julien Chaumond's avatar
Julien Chaumond committed
85
86
    def test_tokenizer_from_pretrained_identifier(self):
        """`SMALL_MODEL_IDENTIFIER` must resolve to a BERT tokenizer (slow or fast) with a 12-entry vocabulary."""
        expected_classes = (BertTokenizer, BertTokenizerFast)

        loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)

        self.assertIsInstance(loaded, expected_classes)
        self.assertEqual(loaded.vocab_size, 12)
Julien Chaumond's avatar
Julien Chaumond committed
89
90

    def test_tokenizer_from_model_type(self):
        """`DUMMY_UNKNOWN_IDENTIFIER` must resolve to a RoBERTa tokenizer (slow or fast) with a 20-entry vocabulary."""
        expected_classes = (RobertaTokenizer, RobertaTokenizerFast)

        loaded = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)

        self.assertIsInstance(loaded, expected_classes)
        self.assertEqual(loaded.vocab_size, 20)
94

95
96
97
98
99
100
101
102
    def test_tokenizer_from_tokenizer_class(self):
        """A checkpoint with a RoBERTa config must still be able to yield a BERT tokenizer.

        The loaded config is checked to be a `RobertaConfig`, while the tokenizer built from that same config is
        checked to be a BERT tokenizer, i.e. the tokenizer type differs from the model type.
        """
        roberta_config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(roberta_config, RobertaConfig)

        loaded = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=roberta_config)
        expected_classes = (BertTokenizer, BertTokenizerFast)
        self.assertIsInstance(loaded, expected_classes)
        self.assertEqual(loaded.vocab_size, 12)

103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
    def test_tokenizer_from_type(self):
        """With `use_fast=False`, `tokenizer_type` alone must select the slow tokenizer class.

        Each scenario copies only the vocabulary fixtures into an empty directory and loads from there.
        """
        scenarios = (
            ("bert", ("./tests/fixtures/vocab.txt",), BertTokenizer),
            ("gpt2", ("./tests/fixtures/vocab.json", "./tests/fixtures/merges.txt"), GPT2Tokenizer),
        )

        for tokenizer_type, fixture_paths, expected_class in scenarios:
            with tempfile.TemporaryDirectory() as workdir:
                # Keep each fixture's file name when copying it into the scratch directory.
                for fixture_path in fixture_paths:
                    shutil.copy(fixture_path, os.path.join(workdir, os.path.basename(fixture_path)))

                loaded = AutoTokenizer.from_pretrained(workdir, tokenizer_type=tokenizer_type, use_fast=False)
                self.assertIsInstance(loaded, expected_class)

    @require_tokenizers
    def test_tokenizer_from_type_fast(self):
        """Without `use_fast=False`, `tokenizer_type` alone must select the fast tokenizer class.

        Each scenario copies only the vocabulary fixtures into an empty directory and loads from there.
        """
        scenarios = (
            ("bert", ("./tests/fixtures/vocab.txt",), BertTokenizerFast),
            ("gpt2", ("./tests/fixtures/vocab.json", "./tests/fixtures/merges.txt"), GPT2TokenizerFast),
        )

        for tokenizer_type, fixture_paths, expected_class in scenarios:
            with tempfile.TemporaryDirectory() as workdir:
                # Keep each fixture's file name when copying it into the scratch directory.
                for fixture_path in fixture_paths:
                    shutil.copy(fixture_path, os.path.join(workdir, os.path.basename(fixture_path)))

                loaded = AutoTokenizer.from_pretrained(workdir, tokenizer_type=tokenizer_type)
                self.assertIsInstance(loaded, expected_class)

    def test_tokenizer_from_type_incorrect_name(self):
        """An unknown `tokenizer_type` must make `AutoTokenizer.from_pretrained` raise `ValueError`."""
        # Use the TestCase assertion (as the other exception checks in this class do) so that a missing
        # exception is reported as a regular unittest failure.
        with self.assertRaises(ValueError):
            AutoTokenizer.from_pretrained("./", tokenizer_type="xxx")

136
    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
        """The Dutch cased BERT checkpoint must load un-lowercased with a 512 max length via every loader."""
        for loader in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            loaded = loader.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(loaded, (BertTokenizer, BertTokenizerFast))

            # A slow BERT tokenizer exposes the flag on its basic tokenizer; otherwise read it from the
            # tokenizer object itself.
            is_slow = isinstance(loaded, BertTokenizer)
            lower_case_flag = loaded.basic_tokenizer.do_lower_case if is_slow else loaded.do_lower_case
            self.assertEqual(lower_case_flag, False)

            self.assertEqual(loaded.model_max_length, 512)
148

149
    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):
        """Loading a non-existent Hub identifier must raise a `ValueError` with the "typo" hint message."""
        # `assertRaisesRegex` searches the message with a regular expression, so `.` and `?` are escaped to be
        # matched literally. Unescaped, `typo?` made the final "o" optional and `.` matched any character,
        # which made the check looser than the message it is meant to pin.
        expected_message = r"is not a local path or a model identifier on the model Hub\. Did you make a typo\?"

        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            with self.assertRaisesRegex(ValueError, expected_message):
                tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
Lysandre's avatar
Lysandre committed
156
157
158
159
160
161
162
163
164

    def test_parents_and_children_in_mappings(self):
        """No config class in the mapping may be a subclass of a config class listed before it."""
        # Children have to be placed before their parents in the mappings: otherwise an `isinstance` check
        # would be triggered by the parent first and return the wrong configuration type when using auto models.
        for auto_mapping in (TOKENIZER_MAPPING,):
            entries = tuple(auto_mapping.items())

            # `position` is the index of the candidate child, so `entries[:position]` is everything before it.
            for position, (child_config, _) in enumerate(entries[1:], start=1):
                for parent_config, _ in entries[:position]:
                    with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
                        self.assertFalse(issubclass(child_config, parent_config))
169

170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
    def test_model_name_edge_cases_in_mappings(self):
        """Every tokenizer class name found in `TOKENIZER_MAPPING` must be resolvable by name.

        tests: https://github.com/huggingface/transformers/pull/13251
        1. models with `-`, e.g. xlm-roberta -> xlm_roberta
        2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
        """
        # Each mapping value is a (slow, fast) pair in which either entry may be `None`.
        tokenizer_names = [
            tokenizer_class.__name__
            for tokenizer_pair in TOKENIZER_MAPPING.values()
            for tokenizer_class in tokenizer_pair
            if tokenizer_class is not None
        ]

        for tokenizer_name in tokenizer_names:
            with self.subTest(tokenizer_name=tokenizer_name):
                # Must find the right class. Previously the result was discarded, so only an exception could
                # fail the test.
                # NOTE(review): assumes a failed lookup may be signalled by returning `None` - confirm against
                # `tokenizer_class_from_name`; if it raises instead, this assertion is simply redundant.
                self.assertIsNotNone(tokenizer_class_from_name(tokenizer_name))

188
    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
        """`use_fast=False` must give the slow BERT tokenizer; omitting it must give the fast one."""
        slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
        self.assertIsInstance(slow_tokenizer, BertTokenizer)

        fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        self.assertIsInstance(fast_tokenizer, BertTokenizerFast)
192
193
194
195
196
197
198
199
200
201
202

    @require_tokenizers
    def test_do_lower_case(self):
        """Passing `do_lower_case=False` must be honoured: the capitalized first word tokenizes to `[UNK]`."""
        sample = "Hello, world. How are you?"

        # The same expectation is checked for both checkpoints.
        for checkpoint in ("distilbert-base-uncased", "microsoft/mpnet-base"):
            tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
            first_token = tokenizer.tokenize(sample)[0]
            self.assertEqual("[UNK]", first_token)
203
204
205
206
207
208
209
210
211

    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
        """The dummy checkpoint must load as a plain `PreTrainedTokenizerFast` with the expected attributes."""
        tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")

        # The exact type is compared (not `isinstance`), so a subclass would not satisfy this check.
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)

        expected_attributes = (
            ("model_max_length", 512),
            ("vocab_size", 30000),
            ("unk_token", "[UNK]"),
            ("padding_side", "right"),
            ("truncation_side", "right"),
        )
        for attribute_name, expected_value in expected_attributes:
            self.assertEqual(getattr(tokenizer, attribute_name), expected_value)
213
214
215
216
217
218
219
220
221
222
223

    def test_auto_tokenizer_from_local_folder(self):
        """A tokenizer saved to a local folder must reload through `AutoTokenizer` with the same class."""
        original = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(original, (BertTokenizer, BertTokenizerFast))

        with tempfile.TemporaryDirectory() as save_dir:
            original.save_pretrained(save_dir)
            reloaded = AutoTokenizer.from_pretrained(save_dir)

        # The reloaded tokenizer is checked after the temporary folder has been removed.
        self.assertIsInstance(reloaded, original.__class__)
        self.assertEqual(reloaded.vocab_size, 12)

224
225
226
227
228
    def test_auto_tokenizer_fast_no_slow(self):
        """The "ctrl" checkpoint must load as the slow `CTRLTokenizer`."""
        # There is no fast CTRL so this always gives us a slow tokenizer.
        loaded = AutoTokenizer.from_pretrained("ctrl")
        self.assertIsInstance(loaded, CTRLTokenizer)

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
    def test_get_tokenizer_config(self):
        """Exercise `get_tokenizer_config` on an online model, a model without a config, and a local save."""
        # Check we can load the tokenizer config of an online model.
        # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated.
        online_config = get_tokenizer_config("bert-base-cased")
        self.assertEqual(online_config, {"do_lower_case": False})

        # This model does not have a tokenizer_config so we get back an empty dict.
        missing_config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(missing_config, {})

        # A tokenizer saved with `save_pretrained` always creates a tokenizer config.
        small_tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as save_dir:
            small_tokenizer.save_pretrained(save_dir)
            saved_config = get_tokenizer_config(save_dir)

        # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
        self.assertEqual(saved_config["tokenizer_class"], "BertTokenizer")
        # Check other keys just to make sure the config was properly saved / reloaded.
        self.assertEqual(saved_config["name_or_path"], SMALL_MODEL_IDENTIFIER)
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312

    def test_new_tokenizer_registration(self):
        """Register a custom config and slow tokenizer, then reload a saved tokenizer through `AutoTokenizer`.

        The registrations are removed again in the `finally` clause, whether or not the assertions pass.
        """
        try:
            AutoConfig.register("new-model", NewConfig)
            AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer)

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

            custom_tokenizer = NewTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
            with tempfile.TemporaryDirectory() as save_dir:
                custom_tokenizer.save_pretrained(save_dir)

                reloaded = AutoTokenizer.from_pretrained(save_dir)
                self.assertIsInstance(reloaded, NewTokenizer)
        finally:
            # Undo only the entries this test added to the auto mappings.
            extra_configs = CONFIG_MAPPING._extra_content
            if "new-model" in extra_configs:
                del extra_configs["new-model"]

            extra_tokenizers = TOKENIZER_MAPPING._extra_content
            if NewConfig in extra_tokenizers:
                del extra_tokenizers[NewConfig]

    @require_tokenizers
    def test_new_tokenizer_fast_registration(self):
        """Register a custom config with slow and fast tokenizers and reload both through `AutoTokenizer`.

        Covers registering the slow and fast classes in two separate calls and in a single call, the error on
        re-registering a built-in config, and loading a saved tokenizer with and without `use_fast=False`.
        The registrations are removed again in the `finally` clause.
        """
        try:
            AutoConfig.register("new-model", NewConfig)

            # Can register in two steps
            AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer)
            self.assertEqual(TOKENIZER_MAPPING[NewConfig], (NewTokenizer, None))
            AutoTokenizer.register(NewConfig, fast_tokenizer_class=NewTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[NewConfig], (NewTokenizer, NewTokenizerFast))

            # Remove the two-step registration before registering the same config again.
            del TOKENIZER_MAPPING._extra_content[NewConfig]
            # Can register in one step
            AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer, fast_tokenizer_class=NewTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[NewConfig], (NewTokenizer, NewTokenizerFast))

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

            # We go through a fast BERT tokenizer because there is no slow-to-fast converter for our new tokenizer
            # and that model does not have a tokenizer.json
            with tempfile.TemporaryDirectory() as tmp_dir:
                bert_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
                bert_tokenizer.save_pretrained(tmp_dir)
                tokenizer = NewTokenizerFast.from_pretrained(tmp_dir)

            # Save the custom fast tokenizer to a fresh directory and reload it through `AutoTokenizer`.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
                self.assertIsInstance(new_tokenizer, NewTokenizerFast)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
                self.assertIsInstance(new_tokenizer, NewTokenizer)

        finally:
            # Undo only the entries this test added to the auto mappings.
            if "new-model" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["new-model"]
            if NewConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[NewConfig]