# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
17
import unittest
thomwolf's avatar
thomwolf committed
18

Aymeric Augustin's avatar
Aymeric Augustin committed
19
20
21
22
23
from transformers import (
    BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    AutoTokenizer,
    BertTokenizer,
24
    BertTokenizerFast,
25
    CTRLTokenizer,
Aymeric Augustin's avatar
Aymeric Augustin committed
26
    GPT2Tokenizer,
27
    GPT2TokenizerFast,
28
    PreTrainedTokenizerFast,
Julien Chaumond's avatar
Julien Chaumond committed
29
    RobertaTokenizer,
30
    RobertaTokenizerFast,
Aymeric Augustin's avatar
Aymeric Augustin committed
31
)
Sylvain Gugger's avatar
Sylvain Gugger committed
32
from transformers.models.auto.configuration_auto import AutoConfig
33
34
35
36
37
from transformers.models.auto.tokenization_auto import (
    TOKENIZER_MAPPING,
    get_tokenizer_config,
    tokenizer_class_from_name,
)
Sylvain Gugger's avatar
Sylvain Gugger committed
38
from transformers.models.roberta.configuration_roberta import RobertaConfig
39
40
from transformers.testing_utils import (
    DUMMY_DIFF_TOKENIZER_IDENTIFIER,
41
    DUMMY_UNKNOWN_IDENTIFIER,
42
    SMALL_MODEL_IDENTIFIER,
43
    require_tokenizers,
44
    slow,
45
)
thomwolf's avatar
thomwolf committed
46
47
48


class AutoTokenizerTest(unittest.TestCase):
49
    @slow
thomwolf's avatar
thomwolf committed
50
    def test_tokenizer_from_pretrained(self):
        """Load archive-map checkpoints through `AutoTokenizer` and check the resolved classes.

        Every key of the BERT archive map whose name does not contain "japanese" must yield a
        BERT tokenizer (slow or fast class); every key of the GPT-2 archive map must yield a
        GPT-2 tokenizer (slow or fast class). Each loaded tokenizer must also have a length
        greater than zero.
        """
        bert_checkpoints = (name for name in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in name)
        gpt2_checkpoints = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()

        # (checkpoint names, accepted tokenizer classes) -- processed in this order.
        expectations = (
            (bert_checkpoints, (BertTokenizer, BertTokenizerFast)),
            (gpt2_checkpoints, (GPT2Tokenizer, GPT2TokenizerFast)),
        )

        for checkpoints, accepted_classes in expectations:
            for checkpoint in checkpoints:
                loaded = AutoTokenizer.from_pretrained(checkpoint)
                self.assertIsNotNone(loaded)
                self.assertIsInstance(loaded, accepted_classes)
                self.assertGreater(len(loaded), 0)

Julien Chaumond's avatar
Julien Chaumond committed
63
64
    def test_tokenizer_from_pretrained_identifier(self):
        """`SMALL_MODEL_IDENTIFIER` must load as a BERT tokenizer (slow or fast) with a 12-entry vocabulary."""
        loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        accepted_classes = (BertTokenizer, BertTokenizerFast)
        self.assertIsInstance(loaded, accepted_classes)
        self.assertEqual(loaded.vocab_size, 12)
Julien Chaumond's avatar
Julien Chaumond committed
67
68

    def test_tokenizer_from_model_type(self):
        """`DUMMY_UNKNOWN_IDENTIFIER` must load as a RoBERTa tokenizer (slow or fast) with a 20-entry vocabulary."""
        loaded = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
        accepted_classes = (RobertaTokenizer, RobertaTokenizerFast)
        self.assertIsInstance(loaded, accepted_classes)
        self.assertEqual(loaded.vocab_size, 20)
72

73
74
75
76
77
78
79
80
    def test_tokenizer_from_tokenizer_class(self):
        """A checkpoint with a RoBERTa config must still be able to resolve to a BERT tokenizer.

        The config loaded from `DUMMY_DIFF_TOKENIZER_IDENTIFIER` is a `RobertaConfig`, yet the
        tokenizer loaded with that config must be a BERT tokenizer (slow or fast) of vocab size 12.
        """
        checkpoint = DUMMY_DIFF_TOKENIZER_IDENTIFIER

        roberta_config = AutoConfig.from_pretrained(checkpoint)
        self.assertIsInstance(roberta_config, RobertaConfig)

        # Check that tokenizer_type ≠ model_type
        loaded = AutoTokenizer.from_pretrained(checkpoint, config=roberta_config)
        self.assertIsInstance(loaded, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(loaded.vocab_size, 12)

81
    @require_tokenizers
82
    def test_tokenizer_identifier_with_correct_config(self):
        """Check the settings of "wietsedv/bert-base-dutch-cased" loaded via three entry points.

        Whether loaded through `BertTokenizer`, `BertTokenizerFast` or `AutoTokenizer`, the result
        must be a BERT tokenizer whose `do_lower_case` equals False and whose `model_max_length`
        equals 512.
        """
        checkpoint = "wietsedv/bert-base-dutch-cased"

        for loader in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            loaded = loader.from_pretrained(checkpoint)
            self.assertIsInstance(loaded, (BertTokenizer, BertTokenizerFast))

            # A `BertTokenizer` instance is checked through its `basic_tokenizer`;
            # any other instance is checked on the tokenizer object itself.
            flag_owner = loaded.basic_tokenizer if isinstance(loaded, BertTokenizer) else loaded
            self.assertEqual(flag_owner.do_lower_case, False)

            self.assertEqual(loaded.model_max_length, 512)
93

94
    @require_tokenizers
95
    def test_tokenizer_identifier_non_existent(self):
        """Loading "julien-c/herlolip-not-exists" must raise `EnvironmentError` from every entry point tried."""
        missing_identifier = "julien-c/herlolip-not-exists"

        for loader in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            with self.assertRaises(EnvironmentError):
                loader.from_pretrained(missing_identifier)
Lysandre's avatar
Lysandre committed
99
100
101
102
103
104
105
106
107

    def test_parents_and_children_in_mappings(self):
        """Check that no key of a mapping is a subclass of any key that precedes it.

        Children have to be placed before their parents in the mappings: per the original
        rationale, an `isinstance` check would otherwise be triggered by the parent and return
        the wrong configuration type when using auto models.
        """
        for mapping in (TOKENIZER_MAPPING,):
            entries = tuple(mapping.items())

            # Compare each key (from the second one on) against every key listed before it.
            for position in range(1, len(entries)):
                child_config = entries[position][0]
                for parent_config, _ in entries[:position]:
                    with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
                        self.assertFalse(issubclass(child_config, parent_config))
112

113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
    def test_model_name_edge_cases_in_mappings(self):
        """Check that every tokenizer class in `TOKENIZER_MAPPING` can be looked up by its class name.

        tests: https://github.com/huggingface/transformers/pull/13251
        1. models with `-`, e.g. xlm-roberta -> xlm_roberta
        2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai

        The result of each lookup is asserted, so a lookup that returns nothing or a class with
        a different name fails the test instead of passing unnoticed.
        """
        # Each mapping value unpacks into a (slow, fast) pair; entries that are None are skipped.
        tokenizer_names = [
            tokenizer.__name__
            for slow_tok, fast_tok in TOKENIZER_MAPPING.values()
            for tokenizer in (slow_tok, fast_tok)
            if tokenizer is not None
        ]

        for tokenizer_name in tokenizer_names:
            # One sub-test per name so a failure report identifies the offending class.
            with self.subTest(tokenizer_name=tokenizer_name):
                resolved = tokenizer_class_from_name(tokenizer_name)
                # must find the right class
                self.assertIsNotNone(resolved, f"No tokenizer class found for name {tokenizer_name!r}")
                self.assertEqual(resolved.__name__, tokenizer_name)

131
    @require_tokenizers
132
    def test_from_pretrained_use_fast_toggle(self):
        """`use_fast=False` must give a `BertTokenizer`; the default call must give a `BertTokenizerFast`."""
        checkpoint = "bert-base-cased"

        slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
        self.assertIsInstance(slow_tokenizer, BertTokenizer)

        default_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.assertIsInstance(default_tokenizer, BertTokenizerFast)
135
136
137
138
139
140
141
142
143
144
145

    @require_tokenizers
    def test_do_lower_case(self):
        """With `do_lower_case=False`, the first token of the capitalised sample must be "[UNK]".

        Checked for "distilbert-base-uncased" and then for "microsoft/mpnet-base".
        NOTE(review): presumably the capitalised first word is absent from these vocabularies
        when lowercasing is disabled -- confirm against the checkpoints.
        """
        sample = "Hello, world. How are you?"

        for checkpoint in ("distilbert-base-uncased", "microsoft/mpnet-base"):
            tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
            first_token = tokenizer.tokenize(sample)[0]
            self.assertEqual("[UNK]", first_token)
146
147
148
149
150
151
152
153
154

    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
        """A checkpoint must load as exactly `PreTrainedTokenizerFast` with the expected settings.

        The type comparison is exact (not `isinstance`), so a subclass would not satisfy it.
        """
        loaded = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
        self.assertEqual(type(loaded), PreTrainedTokenizerFast)

        # Attribute name -> expected value, checked in this order.
        expected_attributes = (
            ("model_max_length", 512),
            ("vocab_size", 30000),
            ("unk_token", "[UNK]"),
            ("padding_side", "right"),
        )
        for attribute, expected in expected_attributes:
            self.assertEqual(getattr(loaded, attribute), expected)
155
156
157
158
159
160
161
162
163
164
165

    def test_auto_tokenizer_from_local_folder(self):
        """A tokenizer saved to a temporary folder must reload from it with the same class and vocab size."""
        original = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(original, (BertTokenizer, BertTokenizerFast))

        # The reloaded tokenizer is inspected after the temporary folder has been removed.
        with tempfile.TemporaryDirectory() as local_folder:
            original.save_pretrained(local_folder)
            reloaded = AutoTokenizer.from_pretrained(local_folder)

        self.assertIsInstance(reloaded, original.__class__)
        self.assertEqual(reloaded.vocab_size, 12)

166
167
168
169
170
    def test_auto_tokenizer_fast_no_slow(self):
        """Loading "ctrl" through `AutoTokenizer` must give a `CTRLTokenizer`."""
        # There is no fast CTRL so this always gives us a slow tokenizer.
        ctrl_tokenizer = AutoTokenizer.from_pretrained("ctrl")
        self.assertIsInstance(ctrl_tokenizer, CTRLTokenizer)

171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
    def test_get_tokenizer_config(self):
        """Exercise `get_tokenizer_config` on an online model, a model without a config, and a local save."""
        # Check we can load the tokenizer config of an online model.
        # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated.
        online_config = get_tokenizer_config("bert-base-cased")
        self.assertEqual(online_config, {"do_lower_case": False})

        # This model does not have a tokenizer_config so we get back an empty dict.
        missing_config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(missing_config, {})

        # A tokenizer saved with `save_pretrained` always creates a tokenizer config.
        small_tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as save_dir:
            small_tokenizer.save_pretrained(save_dir)
            saved_config = get_tokenizer_config(save_dir)

        # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
        self.assertEqual(saved_config["tokenizer_class"], "BertTokenizer")
        # Check other keys just to make sure the config was properly saved /reloaded.
        self.assertEqual(saved_config["name_or_path"], SMALL_MODEL_IDENTIFIER)