# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
from io import open
import tempfile
import shutil
import unittest

if sys.version_info[0] == 2:
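    # Python 2: fall back to cPickle and provide a minimal TemporaryDirectory shim.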
    import cPickle as pickle

    class TemporaryDirectory(object):
        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
        def __enter__(self):
            self.name = tempfile.mkdtemp()
            return self.name
        def __exit__(self, exc_type, exc_value, traceback):
            shutil.rmtree(self.name)
else:
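    # Python 3: use the stdlib pickle and TemporaryDirectory; alias unicode to str for isinstance checks.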
    import pickle
    TemporaryDirectory = tempfile.TemporaryDirectory
    unicode = str


class CommonTestCases:

    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None  # set to the tokenizer class under test by each concrete subclass

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self, **kwargs):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError

        def test_tokenizers_common_properties(self):
            tokenizer = self.get_tokenizer()
            attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
                                "pad_token", "cls_token", "mask_token"]
            for attr in attributes_list:
                self.assertTrue(hasattr(tokenizer, attr))
                self.assertTrue(hasattr(tokenizer, attr + "_id"))

            self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
            self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))

            attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
                                "added_tokens_decoder"]
            for attr in attributes_list:
                self.assertTrue(hasattr(tokenizer, attr))

        def test_save_and_load_tokenizer(self):
            # safety check on max_len default value so we are sure the test works
            tokenizer = self.get_tokenizer()
            self.assertNotEqual(tokenizer.max_len, 42)

            # Now let's start the test
            tokenizer = self.get_tokenizer(max_len=42)

            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

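            # Round-trip through save_pretrained / from_pretrained: both the encoding
            # and the max_len init kwarg should survive.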
            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                self.assertListEqual(before_tokens, after_tokens)

                self.assertEqual(tokenizer.max_len, 42)
                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
                self.assertEqual(tokenizer.max_len, 43)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = u"Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:

                filename = os.path.join(tmpdirname, u"tokenizer.bin")
                with open(filename, "wb") as handle:
                    pickle.dump(tokenizer, handle)

                with open(filename, "rb") as handle:
                    tokenizer_new = pickle.load(handle)

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_added_tokens_do_lower_case(self):
            tokenizer = self.get_tokenizer(do_lower_case=True)

            special_token = tokenizer.all_special_tokens[0]

            text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
            text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

            toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
            added = tokenizer.add_tokens(new_toks)
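            # With do_lower_case=True the upper-case variants are lower-cased into
            # duplicates of the first two tokens, so only 2 of the 4 are added.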
            self.assertEqual(added, 2)

            toks = tokenizer.tokenize(text)
            toks2 = tokenizer.tokenize(text2)

            self.assertEqual(len(toks), len(toks2))
            self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
            self.assertListEqual(toks, toks2)

            tokenizer = self.get_tokenizer(do_lower_case=False)

            added = tokenizer.add_tokens(new_toks)
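            # Case-sensitive tokenizer: all four tokens are distinct and get added.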
            self.assertEqual(added, 4)

            toks = tokenizer.tokenize(text)
            toks2 = tokenizer.tokenize(text2)

            self.assertEqual(len(toks), len(toks2))  # Length should still be the same
            self.assertNotEqual(len(toks), len(toks0))
            self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

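            # Added tokens live outside the base vocabulary: vocab_size is unchanged
            # while len(tokenizer) grows by the number of added tokens.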
            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
            out_string = tokenizer.decode(tokens)

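            # Ids of the added tokens are assigned above the base vocabulary range.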
            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                          'pad_token': "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
                                      add_special_tokens=False)
            out_string = tokenizer.decode(tokens)

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.eos_token_id)
            self.assertEqual(tokens[-2], tokenizer.pad_token_id)

        def test_add_special_tokens(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            special_token = "[SPECIAL TOKEN]"

            tokenizer.add_special_tokens({"cls_token": special_token})
            encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
            assert len(encoded_special_token) == 1

            text = " ".join([input_text, special_token, output_text])
            encoded = tokenizer.encode(text, add_special_tokens=False)

            input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
            output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
            special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
            assert encoded == input_encoded + special_token_id + output_encoded

            decoded = tokenizer.decode(encoded, skip_special_tokens=True)
            assert special_token not in decoded

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))


        def test_pretrained_model_lists(self):
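            # Every pretrained vocab file map should reference the same checkpoint
            # names as max_model_input_sizes.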
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)

        def test_mask_output(self):
            if sys.version_info <= (3, 0):
                return

            tokenizer = self.get_tokenizer()

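            # Only meaningful when the tokenizer overrides build_inputs_with_special_tokens.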
            if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                seq_0 = "Test this method."
                seq_1 = "With these inputs."
                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                sequences, mask = information["input_ids"], information["token_type_ids"]
                self.assertEqual(len(sequences), len(mask))

        def test_number_of_added_tokens(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "Test this method."
            seq_1 = "With these inputs."

            sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
            attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

            # Method is implemented (e.g. not GPT-2)
            if len(attached_sequences) != 2:
                self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

        def test_maximum_encoding_length_single_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            stride = 2

            sequence = tokenizer.encode(seq_0, add_special_tokens=False)
            num_added_tokens = tokenizer.num_added_tokens()
            total_length = len(sequence) + num_added_tokens
            information = tokenizer.encode_plus(seq_0,
                                                max_length=total_length - 2,
                                                add_special_tokens=True,
                                                stride=stride,
                                                return_overflowing_tokens=True)

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]

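            # Truncating 2 tokens with a stride of 2 should return the last 2 + stride
            # tokens of the original sequence as overflow.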
            self.assertEqual(len(overflowing_tokens), 2 + stride)
            self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
            self.assertEqual(len(truncated_sequence), total_length - 2)
            self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

        def test_maximum_encoding_length_pair_input(self):
            tokenizer = self.get_tokenizer()

            seq_0 = "This is a sentence to be encoded."
            seq_1 = "This is another sentence to be encoded."
            stride = 2

            sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
            sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

            sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
            truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
                tokenizer.encode(seq_0, add_special_tokens=False),
                tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
            )

            information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
                                                stride=stride, truncation_strategy='only_second',
                                                return_overflowing_tokens=True)
            information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                add_special_tokens=True, stride=stride,
                                                                truncation_strategy='only_first',
                                                                return_overflowing_tokens=True)

            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
            overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

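            # 'only_second' should overflow from seq_1 and 'only_first' from seq_0;
            # the truncated pair must equal seq_0 plus seq_1 manually shortened by 2 tokens.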
            self.assertEqual(len(overflowing_tokens), 2 + stride)
            self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
            self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
            self.assertEqual(len(truncated_sequence), len(sequence) - 2)
            self.assertEqual(truncated_sequence, truncated_second_sequence)

        def test_encode_input_type(self):
            tokenizer = self.get_tokenizer()

            sequence = "Let's encode this sequence"

            tokens = tokenizer.tokenize(sequence)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

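            # encode() should accept a raw string, a list of tokens or a list of ids
            # and produce the same output for all three.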
            self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
            self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

        def test_special_tokens_mask(self):
            tokenizer = self.get_tokenizer()

            sequence_0 = "Encode this."
            sequence_1 = "This one too please."

            # Testing single inputs
            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

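            # Dropping every position flagged by the mask should recover the encoding
            # produced without special tokens.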
            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
            filtered_sequence = [x for x in filtered_sequence if x is not None]
            self.assertEqual(encoded_sequence, filtered_sequence)

            # Testing input pairs
            encoded_sequence = (tokenizer.encode(sequence_0, add_special_tokens=False)
                                + tokenizer.encode(sequence_1, add_special_tokens=False))
            encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True,
                                                          return_special_tokens_mask=True)
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

            filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
            filtered_sequence = [x for x in filtered_sequence if x is not None]
            self.assertEqual(encoded_sequence, filtered_sequence)

            # Testing with already existing special tokens
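            # (only add cls/sep below when the tokenizer does not define them, i.e. they fall back to unk)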
            if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.sep_token_id == tokenizer.unk_token_id:
                tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
            encoded_sequence_dict = tokenizer.encode_plus(sequence_0,
                                                          add_special_tokens=True,
                                                          return_special_tokens_mask=True)
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
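            # Recomputing the mask from the already-built ids should match the mask
            # returned by encode_plus.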
            special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
            self.assertEqual(special_tokens_mask_orig, special_tokens_mask)

        def test_padding_to_max_length(self):
            tokenizer = self.get_tokenizer()

            sequence = "Sequence"
            padding_size = 10
            padding_idx = tokenizer.pad_token_id

            # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
            tokenizer.padding_side = "right"
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)
            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
            padded_sequence_length = len(padded_sequence)
            assert sequence_length + padding_size == padded_sequence_length
            assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

            # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
            tokenizer.padding_side = "left"
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)
            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
            padded_sequence_length = len(padded_sequence)
            assert sequence_length + padding_size == padded_sequence_length
            assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

            # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
            encoded_sequence = tokenizer.encode(sequence)
            sequence_length = len(encoded_sequence)

            tokenizer.padding_side = "right"
            padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
            padded_sequence_right_length = len(padded_sequence_right)

            tokenizer.padding_side = "left"
            padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
            padded_sequence_left_length = len(padded_sequence_left)

            assert sequence_length == padded_sequence_right_length
            assert encoded_sequence == padded_sequence_right
            assert sequence_length == padded_sequence_left_length
            assert encoded_sequence == padded_sequence_left

        def test_encode_plus_with_padding(self):
            tokenizer = self.get_tokenizer()

            sequence = "Sequence"
            padding_size = 10
            padding_idx = tokenizer.pad_token_id
            token_type_padding_idx = tokenizer.pad_token_type_id

            encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
            input_ids = encoded_sequence['input_ids']
            token_type_ids = encoded_sequence['token_type_ids']
            attention_mask = encoded_sequence['attention_mask']
            special_tokens_mask = encoded_sequence['special_tokens_mask']
            sequence_length = len(input_ids)

            # Test right padding
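            # Padding should append pad_token_id / pad_token_type_id, zeros in the
            # attention mask and ones in the special tokens mask.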
            tokenizer.padding_side = "right"
            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
            padded_input_ids = padded_sequence['input_ids']
            padded_token_type_ids = padded_sequence['token_type_ids']
            padded_attention_mask = padded_sequence['attention_mask']
            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
            padded_sequence_length = len(padded_input_ids)

            assert sequence_length + padding_size == padded_sequence_length
            assert input_ids + [padding_idx] * padding_size == padded_input_ids
            assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
            assert attention_mask + [0] * padding_size == padded_attention_mask 
            assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 

            # Test left padding
            tokenizer.padding_side = "left"
            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
            padded_input_ids = padded_sequence['input_ids']
            padded_token_type_ids = padded_sequence['token_type_ids']
            padded_attention_mask = padded_sequence['attention_mask']
            padded_special_tokens_mask = padded_sequence['special_tokens_mask']
            padded_sequence_length = len(padded_input_ids)

            assert sequence_length + padding_size == padded_sequence_length
            assert [padding_idx] * padding_size + input_ids == padded_input_ids
            assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
            assert [0] * padding_size + attention_mask == padded_attention_mask 
            assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask