# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import unittest

from transformers import BertTokenizerFast
from transformers.models.bert.tokenization_bert import (
    VOCAB_FILES_NAMES,
    BasicTokenizer,
    BertTokenizer,
    WordpieceTokenizer,
    _is_control,
    _is_punctuation,
    _is_whitespace,
)
from transformers.testing_utils import require_tokenizers, slow

from .test_tokenization_common import TokenizerTesterMixin, filter_non_english


@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertTokenizer
    rust_tokenizer_class = BertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True
    from_pretrained_filter = filter_non_english

    def setUp(self):
        super().setUp()

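        # Minimal WordPiece vocabulary written to a temporary file so the slow tokenizer can load it.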
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

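        # "UNwant\u00E9d,running" is lower-cased, the accent is stripped, and the result is split into WordPieces.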
        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

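        # Compare the slow (Python) and fast (Rust) tokenizers on the same input.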
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        # With lower casing
        tokenizer = self.get_tokenizer(do_lower_case=True)
        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    def test_chinese(self):
        tokenizer = BasicTokenizer()

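        # The basic tokenizer splits each CJK character into its own token.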
        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

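        # With strip_accents=False, lower-casing keeps the umlaut intact.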
        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])

    def test_basic_tokenizer_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_default(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
        )

    def test_wordpiece_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

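        # Build the token -> id mapping expected by WordpieceTokenizer.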
        vocab = {}
        for i, token in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])

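        # A word containing an out-of-vocabulary piece collapses to the unknown token.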
        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])

    def test_is_whitespace(self):
        self.assertTrue(_is_whitespace(" "))
        self.assertTrue(_is_whitespace("\t"))
        self.assertTrue(_is_whitespace("\r"))
        self.assertTrue(_is_whitespace("\n"))
        self.assertTrue(_is_whitespace("\u00A0"))

        self.assertFalse(_is_whitespace("A"))
        self.assertFalse(_is_whitespace("-"))

    def test_is_control(self):
        self.assertTrue(_is_control("\u0005"))

        self.assertFalse(_is_control("A"))
        self.assertFalse(_is_control(" "))
        self.assertFalse(_is_control("\t"))
        self.assertFalse(_is_control("\r"))

    def test_is_punctuation(self):
        self.assertTrue(_is_punctuation("-"))
        self.assertTrue(_is_punctuation("$"))
        self.assertTrue(_is_punctuation("`"))
        self.assertTrue(_is_punctuation("."))

        self.assertFalse(_is_punctuation("A"))
        self.assertFalse(_is_punctuation(" "))

    def test_clean_text(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])

        self.assertListEqual(
            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
        )

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

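        # 101 and 102 are the [CLS] and [SEP] token ids in the bert-base-uncased vocabulary.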
        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]

    def test_offsets_with_special_characters(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

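                # Offsets should map each produced token (including the accented "ï") back into the original sentence.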
                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])