test_tokenization_common.py 20.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16

thomwolf's avatar
thomwolf committed
17
import os
18
import pickle
Aymeric Augustin's avatar
Aymeric Augustin committed
19
import shutil
20
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
21
22
from io import open

23

24
class TokenizerTesterMixin:
25

26
    tokenizer_class = None
27

28
29
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
30

31
32
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
33

34
35
    def get_tokenizer(self, **kwargs):
        raise NotImplementedError
36

37
38
    def get_input_output_texts(self):
        raise NotImplementedError
thomwolf's avatar
thomwolf committed
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
    def test_tokenizers_common_properties(self):
        tokenizer = self.get_tokenizer()
        attributes_list = [
            "bos_token",
            "eos_token",
            "unk_token",
            "sep_token",
            "pad_token",
            "cls_token",
            "mask_token",
        ]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))
            self.assertTrue(hasattr(tokenizer, attr + "_id"))
54

55
56
        self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
        self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
57

58
59
60
        attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))
61

62
63
64
65
    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizer = self.get_tokenizer()
        self.assertNotEqual(tokenizer.max_len, 42)
66

67
68
        # Now let's start the test
        tokenizer = self.get_tokenizer(max_len=42)
thomwolf's avatar
thomwolf committed
69

70
        before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
71

72
        with tempfile.TemporaryDirectory() as tmpdirname:
73
74
            tokenizer.save_pretrained(tmpdirname)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
75

76
77
            after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
            self.assertListEqual(before_tokens, after_tokens)
78

79
80
81
            self.assertEqual(tokenizer.max_len, 42)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
            self.assertEqual(tokenizer.max_len, 43)
82

83
84
85
    def test_pickle_tokenizer(self):
        tokenizer = self.get_tokenizer()
        self.assertIsNotNone(tokenizer)
86

87
88
        text = "Munich and Berlin are nice cities"
        subwords = tokenizer.tokenize(text)
89

90
        with tempfile.TemporaryDirectory() as tmpdirname:
91

92
93
94
            filename = os.path.join(tmpdirname, "tokenizer.bin")
            with open(filename, "wb") as handle:
                pickle.dump(tokenizer, handle)
95

96
97
            with open(filename, "rb") as handle:
                tokenizer_new = pickle.load(handle)
98

99
        subwords_loaded = tokenizer_new.tokenize(text)
100

101
        self.assertListEqual(subwords, subwords_loaded)
102

103
104
    def test_added_tokens_do_lower_case(self):
        tokenizer = self.get_tokenizer(do_lower_case=True)
105

106
        special_token = tokenizer.all_special_tokens[0]
107

108
109
        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
110

111
        toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
112

113
114
115
        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 2)
116

117
118
        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)
119

120
121
122
        self.assertEqual(len(toks), len(toks2))
        self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
        self.assertListEqual(toks, toks2)
123

124
125
126
        # Check that none of the special tokens are lowercased
        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
        tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
Lysandre's avatar
Lysandre committed
127

128
129
        for special_token in tokenizer.all_special_tokens:
            self.assertTrue(special_token in tokenized_sequence)
Lysandre's avatar
Lysandre committed
130

131
        tokenizer = self.get_tokenizer(do_lower_case=False)
132

133
134
        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 4)
135

136
137
        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)
138

139
140
141
        self.assertEqual(len(toks), len(toks2))  # Length should still be the same
        self.assertNotEqual(len(toks), len(toks0))
        self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
142

143
144
    def test_add_tokens_tokenizer(self):
        tokenizer = self.get_tokenizer()
145

146
147
        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)
148

149
150
        self.assertNotEqual(vocab_size, 0)
        self.assertEqual(vocab_size, all_size)
151

152
153
154
155
        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)
156

157
158
159
160
        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))
161

162
163
        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
        out_string = tokenizer.decode(tokens)
thomwolf's avatar
thomwolf committed
164

165
166
167
        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
168

169
170
171
172
        new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)
173

174
175
176
177
        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
178

179
180
181
182
        tokens = tokenizer.encode(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )
        out_string = tokenizer.decode(tokens)
183

184
185
186
187
188
189
190
        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0], tokenizer.eos_token_id)
        self.assertEqual(tokens[-2], tokenizer.pad_token_id)
191

192
193
194
    def test_add_special_tokens(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()
195

196
        special_token = "[SPECIAL TOKEN]"
197

198
199
200
        tokenizer.add_special_tokens({"cls_token": special_token})
        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
        assert len(encoded_special_token) == 1
201

202
203
        text = " ".join([input_text, special_token, output_text])
        encoded = tokenizer.encode(text, add_special_tokens=False)
204

205
206
207
208
        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
        output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
        assert encoded == input_encoded + special_token_id + output_encoded
209

210
211
        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
        assert special_token not in decoded
212

213
214
215
    def test_required_methods_tokenizer(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()
216

217
218
219
220
        tokens = tokenizer.tokenize(input_text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
        self.assertListEqual(ids, ids_2)
221

222
223
        tokens_2 = tokenizer.convert_ids_to_tokens(ids)
        text_2 = tokenizer.decode(ids)
224

225
        self.assertEqual(text_2, output_text)
226

227
        self.assertNotEqual(len(tokens_2), 0)
228
        self.assertIsInstance(text_2, str)
229

230
231
    def test_encode_decode_with_spaces(self):
        tokenizer = self.get_tokenizer()
LysandreJik's avatar
LysandreJik committed
232

233
234
235
236
237
238
        new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
        tokenizer.add_tokens(new_toks)
        input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
        encoded = tokenizer.encode(input, add_special_tokens=False)
        decoded = tokenizer.decode(encoded)
        self.assertEqual(decoded, input)
239

240
241
242
243
244
    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))
245

246
247
        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)
LysandreJik's avatar
LysandreJik committed
248

249
250
    def test_mask_output(self):
        tokenizer = self.get_tokenizer()
251

252
        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
253
254
            seq_0 = "Test this method."
            seq_1 = "With these inputs."
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
            sequences, mask = information["input_ids"], information["token_type_ids"]
            self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "Test this method."
        seq_1 = "With these inputs."

        sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
        attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

        # Method is implemented (e.g. not GPT-2)
        if len(attached_sequences) != 2:
            self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

    def test_maximum_encoding_length_single_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        stride = 2

        sequence = tokenizer.encode(seq_0, add_special_tokens=False)
        num_added_tokens = tokenizer.num_added_tokens()
        total_length = len(sequence) + num_added_tokens
        information = tokenizer.encode_plus(
            seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride, return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), total_length - 2)
        self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

    def test_maximum_encoding_length_pair_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        seq_1 = "This is another sentence to be encoded."
        stride = 2

        sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
        sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
        truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
            tokenizer.encode(seq_0, add_special_tokens=False), tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
        )

        information = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_second",
            return_overflowing_tokens=True,
        )
        information_first_truncated = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_first",
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]
        overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
        self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), len(sequence) - 2)
        self.assertEqual(truncated_sequence, truncated_second_sequence)

    def test_encode_input_type(self):
        tokenizer = self.get_tokenizer()

        sequence = "Let's encode this sequence"

        tokens = tokenizer.tokenize(sequence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

        self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
        self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

    def test_special_tokens_mask(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
            sequence_1, add_special_tokens=False
        )
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing with already existing special tokens
        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
        special_tokens_mask = tokenizer.get_special_tokens_mask(
            encoded_sequence_w_special, already_has_special_tokens=True
        )
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)

    def test_padding_to_max_length(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "right"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "left"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)

        tokenizer.padding_side = "right"
        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_right_length = len(padded_sequence_right)

        tokenizer.padding_side = "left"
        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_left_length = len(padded_sequence_left)

        assert sequence_length == padded_sequence_right_length
        assert encoded_sequence == padded_sequence_right
        assert sequence_length == padded_sequence_left_length
        assert encoded_sequence == padded_sequence_left

    def test_encode_plus_with_padding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id

        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
        input_ids = encoded_sequence["input_ids"]
        token_type_ids = encoded_sequence["token_type_ids"]
        attention_mask = encoded_sequence["attention_mask"]
        special_tokens_mask = encoded_sequence["special_tokens_mask"]
        sequence_length = len(input_ids)

        # Test right padding
        tokenizer.padding_side = "right"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert input_ids + [padding_idx] * padding_size == padded_input_ids
        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
        assert attention_mask + [0] * padding_size == padded_attention_mask
        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask

        # Test left padding
        tokenizer.padding_side = "left"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + input_ids == padded_input_ids
        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
        assert [0] * padding_size + attention_mask == padded_attention_mask
        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask