# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import shutil
import sys
import tempfile
import unittest
from io import open


if sys.version_info[0] == 2:
    import cPickle as pickle

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""

        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name

        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)


else:
    import pickle

    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str


class CommonTestCases:
    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self, **kwargs):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError
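
        # NOTE (illustrative sketch, not part of the original module): a concrete tokenizer
        # test is expected to subclass this tester and fill in the two hooks above, roughly
        # like the hypothetical example below. Real subclasses typically also write a small
        # vocabulary file in setUp so that from_pretrained can load it from self.tmpdirname.
        #
        #     class MyTokenizationTest(CommonTestCases.CommonTokenizerTester):
        #         tokenizer_class = MyTokenizer  # hypothetical tokenizer class
        #
        #         def get_tokenizer(self, **kwargs):
        #             return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
        #
        #         def get_input_output_texts(self):
        #             return "lower newer", "lower newer"
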
        def test_tokenizers_common_properties(self):
            tokenizer = self.get_tokenizer()
            attributes_list = [
                "bos_token",
                "eos_token",
                "unk_token",
                "sep_token",
                "pad_token",
                "cls_token",
                "mask_token",
            ]
            for attr in attributes_list:
                self.assertTrue(hasattr(tokenizer, attr))
                self.assertTrue(hasattr(tokenizer, attr + "_id"))

            self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
            self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))

            attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
            for attr in attributes_list:
                self.assertTrue(hasattr(tokenizer, attr))

        def test_save_and_load_tokenizer(self):
            # safety check on max_len default value so we are sure the test works
            tokenizer = self.get_tokenizer()
            self.assertNotEqual(tokenizer.max_len, 42)

            # Now let's start the test
            tokenizer = self.get_tokenizer(max_len=42)

            before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

                after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                self.assertListEqual(before_tokens, after_tokens)

                self.assertEqual(tokenizer.max_len, 42)
                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
                self.assertEqual(tokenizer.max_len, 43)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = "Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:

                filename = os.path.join(tmpdirname, "tokenizer.bin")
                with open(filename, "wb") as handle:
                    pickle.dump(tokenizer, handle)

                with open(filename, "rb") as handle:
                    tokenizer_new = pickle.load(handle)

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_added_tokens_do_lower_case(self):
            tokenizer = self.get_tokenizer(do_lower_case=True)

            special_token = tokenizer.all_special_tokens[0]

            text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
            text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

            toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
            added = tokenizer.add_tokens(new_toks)
            self.assertEqual(added, 2)

            toks = tokenizer.tokenize(text)
            toks2 = tokenizer.tokenize(text2)

            self.assertEqual(len(toks), len(toks2))
            self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
            self.assertListEqual(toks, toks2)

            # Check that none of the special tokens are lowercased
            sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
            tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)

            for special_token in tokenizer.all_special_tokens:
                self.assertTrue(special_token in tokenized_sequence)

            tokenizer = self.get_tokenizer(do_lower_case=False)

            added = tokenizer.add_tokens(new_toks)
            self.assertEqual(added, 4)

            toks = tokenizer.tokenize(text)
            toks2 = tokenizer.tokenize(text2)

            self.assertEqual(len(toks), len(toks2))  # Length should still be the same
            self.assertNotEqual(len(toks), len(toks0))
            self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
            out_string = tokenizer.decode(tokens)

            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(
                ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
            )
            out_string = tokenizer.decode(tokens)

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.eos_token_id)
            self.assertEqual(tokens[-2], tokenizer.pad_token_id)
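
            # NOTE (illustrative sketch, not part of the test): when user code grows the vocabulary
            # like this, the embedding matrix of the paired model usually has to grow with it, e.g.
            #
            #     tokenizer.add_tokens(["new_tok1", "new_tok2"])  # hypothetical tokens
            #     model.resize_token_embeddings(len(tokenizer))   # `model` is any pretrained transformers model
            #
            # otherwise the freshly assigned ids would index past the embedding matrix.
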
        def test_add_special_tokens(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            special_token = "[SPECIAL TOKEN]"

            tokenizer.add_special_tokens({"cls_token": special_token})
            encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
            assert len(encoded_special_token) == 1

            text = " ".join([input_text, special_token, output_text])
            encoded = tokenizer.encode(text, add_special_tokens=False)

            input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
            output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
            special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
            assert encoded == input_encoded + special_token_id + output_encoded

            decoded = tokenizer.decode(encoded, skip_special_tokens=True)
            assert special_token not in decoded

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

        def test_encode_decode_with_spaces(self):
            tokenizer = self.get_tokenizer()

            new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
            tokenizer.add_tokens(new_toks)
            input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
            encoded = tokenizer.encode(input, add_special_tokens=False)
            decoded = tokenizer.decode(encoded)
            self.assertEqual(decoded, input)

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)

        def test_mask_output(self):
            if sys.version_info <= (3, 0):
                return

            tokenizer = self.get_tokenizer()

            if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
                seq_0 = "Test this method."
                seq_1 = "With these inputs."
                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                sequences, mask = information["input_ids"], information["token_type_ids"]
                self.assertEqual(len(sequences), len(mask))

        def test_number_of_added_tokens(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "Test this method."
            seq_1 = "With these inputs."

            sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
            attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

            # Method is implemented (e.g. not GPT-2)
            if len(attached_sequences) != 2:
                self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

        def test_maximum_encoding_length_single_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            stride = 2

            sequence = tokenizer.encode(seq_0, add_special_tokens=False)
            num_added_tokens = tokenizer.num_added_tokens()
            total_length = len(sequence) + num_added_tokens
            information = tokenizer.encode_plus(
                seq_0,
                max_length=total_length - 2,
                add_special_tokens=True,
                stride=stride,
                return_overflowing_tokens=True,
            )

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]

            self.assertEqual(len(overflowing_tokens), 2 + stride)
            self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
            self.assertEqual(len(truncated_sequence), total_length - 2)
            self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

        def test_maximum_encoding_length_pair_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            seq_1 = "This is another sentence to be encoded."
            stride = 2

            sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
            sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

            sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
            truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
                tokenizer.encode(seq_0, add_special_tokens=False),
                tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
            )

            information = tokenizer.encode_plus(
                seq_0,
                seq_1,
                max_length=len(sequence) - 2,
                add_special_tokens=True,
                stride=stride,
                truncation_strategy="only_second",
                return_overflowing_tokens=True,
            )
            information_first_truncated = tokenizer.encode_plus(
                seq_0,
                seq_1,
                max_length=len(sequence) - 2,
                add_special_tokens=True,
                stride=stride,
                truncation_strategy="only_first",
                return_overflowing_tokens=True,
            )

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
            overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

            self.assertEqual(len(overflowing_tokens), 2 + stride)
            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])

            self.assertEqual(len(truncated_sequence), len(sequence) - 2)
            self.assertEqual(truncated_sequence, truncated_second_sequence)
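
            # NOTE (illustrative sketch, not part of the test): the stride / overflowing-tokens
            # machinery exercised here is what a sliding-window pipeline would build on, e.g.
            #
            #     enc = tokenizer.encode_plus(question, context, max_length=384, stride=128,
            #                                 truncation_strategy="only_second",
            #                                 return_overflowing_tokens=True)
            #     # enc["input_ids"] holds the first window; enc["overflowing_tokens"] holds the
            #     # ids truncated from the second sequence, overlapping the kept part by `stride` ids.
            #
            # `question` and `context` are hypothetical variables; the numbers are arbitrary.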

        def test_encode_input_type(self):
            tokenizer = self.get_tokenizer()

            sequence = "Let's encode this sequence"

            tokens = tokenizer.tokenize(sequence)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

            self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
            self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

        def test_special_tokens_mask(self):
            tokenizer = self.get_tokenizer()

            sequence_0 = "Encode this."
            sequence_1 = "This one too please."

            # Testing single inputs
            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
            encoded_sequence_dict = tokenizer.encode_plus(
                sequence_0, add_special_tokens=True, return_special_tokens_mask=True
            )
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

            filtered_sequence = [
                (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
            ]
            filtered_sequence = [x for x in filtered_sequence if x is not None]
            self.assertEqual(encoded_sequence, filtered_sequence)

            # Testing inputs pairs
            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
                sequence_1, add_special_tokens=False
            )
            encoded_sequence_dict = tokenizer.encode_plus(
                sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
            )
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

            filtered_sequence = [
                (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
            ]
            filtered_sequence = [x for x in filtered_sequence if x is not None]
            self.assertEqual(encoded_sequence, filtered_sequence)

            # Testing with already existing special tokens
            if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
420
421
422
423
                tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
            encoded_sequence_dict = tokenizer.encode_plus(
                sequence_0, add_special_tokens=True, return_special_tokens_mask=True
            )
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
            special_tokens_mask = tokenizer.get_special_tokens_mask(
                encoded_sequence_w_special, already_has_special_tokens=True
            )
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
            self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
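
            # NOTE (illustrative sketch, not part of the test): in user code the special-tokens
            # mask is typically used to exclude special positions from downstream processing,
            # for example when choosing candidate positions for masked-LM corruption:
            #
            #     enc = tokenizer.encode_plus("Some text", add_special_tokens=True,
            #                                 return_special_tokens_mask=True)
            #     candidates = [i for i, m in enumerate(enc["special_tokens_mask"]) if m == 0]
            #     # only indices in `candidates` are eligible for masking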

        def test_padding_to_max_length(self):
            tokenizer = self.get_tokenizer()

            sequence = "Sequence"
            padding_size = 10
            padding_idx = tokenizer.pad_token_id

            # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
            tokenizer.padding_side = "right"
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)
            padded_sequence = tokenizer.encode(
                sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
            )
            padded_sequence_length = len(padded_sequence)
            assert sequence_length + padding_size == padded_sequence_length
            assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

            # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
            tokenizer.padding_side = "left"
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)
            padded_sequence = tokenizer.encode(
                sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
            )
            padded_sequence_length = len(padded_sequence)
            assert sequence_length + padding_size == padded_sequence_length
            assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

            # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)

            tokenizer.padding_side = "right"
            padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
            padded_sequence_right_length = len(padded_sequence_right)

            tokenizer.padding_side = "left"
            padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
            padded_sequence_left_length = len(padded_sequence_left)

            assert sequence_length == padded_sequence_right_length
            assert encoded_sequence == padded_sequence_right
            assert sequence_length == padded_sequence_left_length
            assert encoded_sequence == padded_sequence_left

        def test_encode_plus_with_padding(self):
            tokenizer = self.get_tokenizer()

            sequence = "Sequence"
            padding_size = 10
            padding_idx = tokenizer.pad_token_id
            token_type_padding_idx = tokenizer.pad_token_type_id

            encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
            input_ids = encoded_sequence["input_ids"]
            token_type_ids = encoded_sequence["token_type_ids"]
            attention_mask = encoded_sequence["attention_mask"]
            special_tokens_mask = encoded_sequence["special_tokens_mask"]
            sequence_length = len(input_ids)

            # Test right padding
            tokenizer.padding_side = "right"
            padded_sequence = tokenizer.encode_plus(
                sequence,
                max_length=sequence_length + padding_size,
                pad_to_max_length=True,
                return_special_tokens_mask=True,
            )
            padded_input_ids = padded_sequence["input_ids"]
            padded_token_type_ids = padded_sequence["token_type_ids"]
            padded_attention_mask = padded_sequence["attention_mask"]
            padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
            padded_sequence_length = len(padded_input_ids)

            assert sequence_length + padding_size == padded_sequence_length
            assert input_ids + [padding_idx] * padding_size == padded_input_ids
            assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
            assert attention_mask + [0] * padding_size == padded_attention_mask
            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask

            # Test left padding
            tokenizer.padding_side = "left"
            padded_sequence = tokenizer.encode_plus(
                sequence,
                max_length=sequence_length + padding_size,
                pad_to_max_length=True,
                return_special_tokens_mask=True,
            )
            padded_input_ids = padded_sequence["input_ids"]
            padded_token_type_ids = padded_sequence["token_type_ids"]
            padded_attention_mask = padded_sequence["attention_mask"]
            padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
            padded_sequence_length = len(padded_input_ids)

            assert sequence_length + padding_size == padded_sequence_length
            assert [padding_idx] * padding_size + input_ids == padded_input_ids
            assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
            assert [0] * padding_size + attention_mask == padded_attention_mask
            assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
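
            # NOTE (illustrative sketch, not part of the test): in user code the same flags yield
            # fixed-length, mask-annotated inputs that can be batched directly, e.g.
            #
            #     enc = tokenizer.encode_plus("Some text", max_length=16, pad_to_max_length=True)
            #     # enc["input_ids"] is padded to 16 ids on the side given by tokenizer.padding_side;
            #     # enc["attention_mask"] marks real tokens with 1 and padding with 0.
            #
            # Left padding (tokenizer.padding_side = "left") is the usual choice for decoder-only
            # models that generate a continuation from the end of the sequence.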