test_tokenization_common.py 23.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Aymeric Augustin's avatar
Aymeric Augustin committed
15

16

thomwolf's avatar
thomwolf committed
17
import os
18
import pickle
Aymeric Augustin's avatar
Aymeric Augustin committed
19
import shutil
20
import tempfile
Aymeric Augustin's avatar
Aymeric Augustin committed
21

22

23
class TokenizerTesterMixin:
    """Shared tokenizer test cases written against hooks this class leaves unimplemented.

    The tests read ``self.tokenizer_class`` and call the ``get_tokenizer`` and
    ``get_input_output_texts`` hooks, whose implementations here raise
    ``NotImplementedError``. They also call ``unittest``-style ``assert*`` methods
    on ``self``, which this class does not define itself.
    """
24

25
    tokenizer_class = None
Anthony MOI's avatar
Anthony MOI committed
26
    test_rust_tokenizer = False
27

28
29
    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()
30

31
32
    def tearDown(self):
        shutil.rmtree(self.tmpdirname)
33

34
35
    def get_tokenizer(self, **kwargs):
        """Hook that supplies the tokenizer the tests operate on.

        The tests in this class call it both without arguments and with keyword
        arguments such as ``max_len``, ``do_lower_case`` and ``random_argument``,
        and use the returned object as the tokenizer under test.

        Raises:
            NotImplementedError: always, in this base implementation; presumably
                concrete test classes override it.
        """
        raise NotImplementedError
36

Anthony MOI's avatar
Anthony MOI committed
37
38
    def get_rust_tokenizer(self, **kwargs):
        """Hook with the same signature as ``get_tokenizer``.

        NOTE(review): presumably returns the Rust-backed tokenizer when
        ``test_rust_tokenizer`` is enabled; no caller is visible in this file, so
        confirm against the concrete test classes.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
39

40
41
    def get_input_output_texts(self):
        """Hook that supplies a pair of sample texts for the tests.

        Callers in this class unpack the result as ``input_text, output_text``;
        ``test_required_methods_tokenizer`` expects decoding the encoded
        ``input_text`` to yield ``output_text``.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
thomwolf's avatar
thomwolf committed
42

43
44
45
46
47
48
49
50
51
52
53
54
55
56
    def test_tokenizers_common_properties(self):
        """Check that the tokenizer exposes the attributes the other tests rely on."""
        tok = self.get_tokenizer()

        # Every named special token must be reachable both as text and as an id.
        special_token_names = (
            "bos_token",
            "eos_token",
            "unk_token",
            "sep_token",
            "pad_token",
            "cls_token",
            "mask_token",
        )
        for name in special_token_names:
            for suffix in ("", "_id"):
                self.assertTrue(hasattr(tok, name + suffix))

        for name in ("additional_special_tokens", "additional_special_tokens_ids"):
            self.assertTrue(hasattr(tok, name))

        # Bookkeeping attributes.
        for name in ("max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"):
            self.assertTrue(hasattr(tok, name))
64

65
66
67
68
    def test_save_and_load_tokenizer(self):
        """Round-trip a tokenizer through ``save_pretrained`` / ``from_pretrained`` and compare results."""
        sample_text = "He is very happy, UNwant\u00E9d,running"

        # Sanity check: the default max_len must not already be 42, otherwise the
        # assertions below could pass without the value ever being persisted.
        default_tokenizer = self.get_tokenizer()
        self.assertNotEqual(default_tokenizer.max_len, 42)

        original = self.get_tokenizer(max_len=42)
        ids_before = original.encode(sample_text, add_special_tokens=False)

        with tempfile.TemporaryDirectory() as save_dir:
            original.save_pretrained(save_dir)

            reloaded = self.tokenizer_class.from_pretrained(save_dir)
            ids_after = reloaded.encode(sample_text, add_special_tokens=False)
            self.assertListEqual(ids_before, ids_after)
            self.assertEqual(reloaded.max_len, 42)

            # A max_len passed at load time is expected to win over the saved one.
            overridden = self.tokenizer_class.from_pretrained(save_dir, max_len=43)
            self.assertEqual(overridden.max_len, 43)
85

86
87
88
    def test_pickle_tokenizer(self):
        """Check that a tokenizer restored from a pickle file tokenizes like the original."""
        original = self.get_tokenizer()
        self.assertIsNotNone(original)

        sample = "Munich and Berlin are nice cities"
        expected_subwords = original.tokenize(sample)

        with tempfile.TemporaryDirectory() as scratch_dir:
            pickle_path = os.path.join(scratch_dir, "tokenizer.bin")

            with open(pickle_path, "wb") as sink:
                pickle.dump(original, sink)

            with open(pickle_path, "rb") as source:
                restored = pickle.load(source)

        # The restored tokenizer is exercised after the temporary directory is gone.
        actual_subwords = restored.tokenize(sample)

        self.assertListEqual(expected_subwords, actual_subwords)
105

106
107
    def test_added_tokens_do_lower_case(self):
        """Check how added tokens interact with the ``do_lower_case`` option.

        With ``do_lower_case=True`` the four candidate tokens are expected to collapse
        into two added entries, and lower/upper-case inputs must tokenize identically
        while special tokens survive unchanged. With ``do_lower_case=False`` all four
        are added and the two inputs must tokenize differently.
        """
        tokenizer = self.get_tokenizer(do_lower_case=True)

        special_token = tokenizer.all_special_tokens[0]

        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

        toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 2)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))
        self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
        self.assertListEqual(toks, toks2)

        # Check that none of the special tokens are lowercased
        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
        tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)

        # assertIn reports the missing token and the sequence on failure; the loop
        # variable has its own name so it does not rebind ``special_token`` above.
        for expected_special in tokenizer.all_special_tokens:
            self.assertIn(expected_special, tokenized_sequence)

        tokenizer = self.get_tokenizer(do_lower_case=False)

        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 4)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))  # Length should still be the same
        # NOTE: toks0 still comes from the lower-casing tokenizer created first.
        self.assertNotEqual(len(toks), len(toks0))
        self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
145

146
147
    def test_add_tokens_tokenizer(self):
        """Check size bookkeeping and resulting ids when regular and special tokens are added."""
        tok = self.get_tokenizer()

        base_vocab_size = tok.vocab_size
        base_total = len(tok)

        self.assertNotEqual(base_vocab_size, 0)
        self.assertEqual(base_vocab_size, base_total)

        # Regular added tokens: ``vocab_size`` must not move, ``len()`` must grow.
        extra_tokens = ["aaaaa bbbbbb", "cccccccccdddddddd"]
        n_added = tok.add_tokens(extra_tokens)
        vocab_size_after_add = tok.vocab_size
        total_after_add = len(tok)

        self.assertNotEqual(vocab_size_after_add, 0)
        self.assertEqual(base_vocab_size, vocab_size_after_add)
        self.assertEqual(n_added, len(extra_tokens))
        self.assertEqual(total_after_add, base_total + len(extra_tokens))

        ids = tok.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

        # Added tokens get ids past the end of the base vocabulary.
        self.assertGreaterEqual(len(ids), 4)
        self.assertGreater(ids[0], tok.vocab_size - 1)
        self.assertGreater(ids[-2], tok.vocab_size - 1)

        # Special tokens: same bookkeeping expectations.
        extra_special = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
        n_added_special = tok.add_special_tokens(extra_special)
        vocab_size_after_special = tok.vocab_size
        total_after_special = len(tok)

        self.assertNotEqual(vocab_size_after_special, 0)
        self.assertEqual(base_vocab_size, vocab_size_after_special)
        self.assertEqual(n_added_special, len(extra_special))
        self.assertEqual(total_after_special, total_after_add + len(extra_special))

        ids = tok.encode(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )

        self.assertGreaterEqual(len(ids), 6)
        first_id, second_id = ids[0], ids[1]
        third_last_id, second_last_id = ids[-3], ids[-2]
        self.assertGreater(first_id, tok.vocab_size - 1)
        self.assertGreater(first_id, second_id)
        self.assertGreater(second_last_id, tok.vocab_size - 1)
        self.assertGreater(second_last_id, third_last_id)
        self.assertEqual(first_id, tok.eos_token_id)
        self.assertEqual(second_last_id, tok.pad_token_id)
192

193
194
195
    def test_add_special_tokens(self):
        """Check that a newly added special token is encoded as one id and can be skipped on decode.

        Uses ``self.assert*`` rather than bare ``assert`` so the checks still run
        under ``python -O`` and match the rest of the class.
        """
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        special_token = "[SPECIAL TOKEN]"

        tokenizer.add_special_tokens({"cls_token": special_token})
        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
        self.assertEqual(len(encoded_special_token), 1)

        text = " ".join([input_text, special_token, output_text])
        encoded = tokenizer.encode(text, add_special_tokens=False)

        # Encoding the joined text must equal encoding the three pieces separately.
        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
        output_encoded = tokenizer.encode(" " + output_text, add_special_tokens=False)
        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
        self.assertEqual(encoded, input_encoded + special_token_id + output_encoded)

        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
        self.assertNotIn(special_token, decoded)
213

214
215
216
    def test_required_methods_tokenizer(self):
        """Check that tokenize/convert/encode/decode agree with each other on the sample texts."""
        tok = self.get_tokenizer()
        source_text, expected_text = self.get_input_output_texts()

        # Two routes to ids: tokenize + convert, and encode directly.
        ids_via_tokens = tok.convert_tokens_to_ids(tok.tokenize(source_text))
        ids_via_encode = tok.encode(source_text, add_special_tokens=False)
        self.assertListEqual(ids_via_tokens, ids_via_encode)

        round_trip_tokens = tok.convert_ids_to_tokens(ids_via_tokens)
        decoded_text = tok.decode(ids_via_tokens)

        self.assertEqual(decoded_text, expected_text)

        self.assertNotEqual(len(round_trip_tokens), 0)
        self.assertIsInstance(decoded_text, str)
230

231
232
    def test_encode_decode_with_spaces(self):
        """Check that text built from added tokens, one containing a space, survives encode + decode."""
        tok = self.get_tokenizer()

        tok.add_tokens(["[ABC]", "[DEF]", "GHI IHG"])

        text = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
        ids = tok.encode(text, add_special_tokens=False)
        round_trip = tok.decode(ids)
        self.assertEqual(round_trip, text)
240

241
242
243
244
245
    def test_pretrained_model_lists(self):
        """Check that every vocab-file map lists the same keys, in order, as ``max_model_input_sizes``."""
        expected_names = list(self.tokenizer_class.max_model_input_sizes.keys())

        names_per_file = [
            list(name_map.keys()) for name_map in self.tokenizer_class.pretrained_vocab_files_map.values()
        ]

        for names in names_per_file:
            self.assertListEqual(expected_names, names)
LysandreJik's avatar
LysandreJik committed
249

250
251
    def test_mask_output(self):
        """Check that ``token_type_ids`` has one entry per input id for a sequence pair."""
        tok = self.get_tokenizer()

        # Nothing is checked when ``build_inputs_with_special_tokens`` is the one
        # whose qualified name starts with ``PreTrainedTokenizer``.
        defining_class = tok.build_inputs_with_special_tokens.__qualname__.split(".")[0]
        if defining_class == "PreTrainedTokenizer":
            return

        first = "Test this method."
        second = "With these inputs."
        encoding = tok.encode_plus(first, second, add_special_tokens=True)
        input_ids = encoding["input_ids"]
        token_type_ids = encoding["token_type_ids"]
        self.assertEqual(len(input_ids), len(token_type_ids))

    def test_number_of_added_tokens(self):
        """Check ``num_added_tokens(pair=True)`` against the length difference of two real encodings."""
        tok = self.get_tokenizer()

        first = "Test this method."
        second = "With these inputs."

        bare_ids = tok.encode(first, second, add_special_tokens=False)
        decorated_ids = tok.encode(first, second, add_special_tokens=True, add_prefix_space=False)

        # A result of length 2 is treated as "method not implemented" (e.g. GPT-2).
        if len(decorated_ids) == 2:
            return

        expected_extra = len(decorated_ids) - len(bare_ids)
        self.assertEqual(tok.num_added_tokens(pair=True), expected_extra)

    def test_maximum_encoding_length_single_input(self):
        """Check truncation and overflowing tokens when a single input exceeds ``max_length`` by two."""
        tok = self.get_tokenizer()

        text = "This is a sentence to be encoded."
        stride = 2
        n_dropped = 2  # how far max_length sits below the full encoded length

        plain_ids = tok.encode(text, add_special_tokens=False)
        full_length = len(plain_ids) + tok.num_added_tokens()

        encoding = tok.encode_plus(
            text,
            max_length=full_length - n_dropped,
            add_special_tokens=True,
            stride=stride,
            return_overflowing_tokens=True,
            add_prefix_space=False,
        )
        truncated_ids = encoding["input_ids"]
        overflow_ids = encoding["overflowing_tokens"]

        # The overflow holds the dropped tail plus ``stride`` tokens before it.
        overflow_size = n_dropped + stride
        self.assertEqual(len(overflow_ids), overflow_size)
        self.assertEqual(overflow_ids, plain_ids[-overflow_size:])
        self.assertEqual(len(truncated_ids), full_length - n_dropped)
        self.assertEqual(truncated_ids, tok.build_inputs_with_special_tokens(plain_ids[:-n_dropped]))

    def test_maximum_encoding_length_pair_input(self):
        """Check truncation and overflow for a pair under the ``only_second`` and ``only_first`` strategies."""
        tok = self.get_tokenizer()

        first = "This is a sentence to be encoded."
        second = "This is another sentence to be encoded."
        stride = 2

        first_ids = tok.encode(first, add_special_tokens=False)
        second_ids = tok.encode(second, add_special_tokens=False)

        pair_ids = tok.encode(first, second, add_special_tokens=True, add_prefix_space=False)

        # Reference result: the pair with the last two tokens of the second sequence removed.
        expected_truncated = tok.build_inputs_with_special_tokens(
            tok.encode(first, add_special_tokens=False), tok.encode(second, add_special_tokens=False)[:-2],
        )

        # Options common to both truncation strategies.
        common_options = dict(
            max_length=len(pair_ids) - 2,
            add_special_tokens=True,
            stride=stride,
            return_overflowing_tokens=True,
            add_prefix_space=False,
        )
        second_truncated = tok.encode_plus(first, second, truncation_strategy="only_second", **common_options)
        first_truncated = tok.encode_plus(first, second, truncation_strategy="only_first", **common_options)

        truncated_ids = second_truncated["input_ids"]
        overflow_second = second_truncated["overflowing_tokens"]
        overflow_first = first_truncated["overflowing_tokens"]

        overflow_size = 2 + stride
        self.assertEqual(len(overflow_second), overflow_size)
        self.assertEqual(overflow_second, second_ids[-overflow_size:])
        self.assertEqual(overflow_first, first_ids[-overflow_size:])
        self.assertEqual(len(truncated_ids), len(pair_ids) - 2)
        self.assertEqual(truncated_ids, expected_truncated)

    def test_encode_input_type(self):
        """Check that ``encode`` gives the same result for raw text, a token list and an id list."""
        tok = self.get_tokenizer()

        text = "Let's encode this sequence"

        token_list = tok.tokenize(text)
        id_list = tok.convert_tokens_to_ids(token_list)
        expected = tok.encode(text, add_special_tokens=True, add_prefix_space=False)

        for alternative_input in (token_list, id_list):
            self.assertEqual(tok.encode(alternative_input, add_special_tokens=True), expected)

357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
    def test_swap_special_token(self):
        """Check that replacing one word with the mask token changes only that position's id."""
        tok = self.get_tokenizer()

        mask_token = "<mask>"
        plain_text = "Encode this sequence"
        masked_texts = ("Encode <mask> sequence", "<mask> this sequence")

        # Add tokens so that masked token isn't split
        tok.add_tokens(plain_text.split())
        tok.add_special_tokens({"mask_token": mask_token})
        mask_id = tok.convert_tokens_to_ids(mask_token)
        plain_ids = tok.encode(plain_text, add_special_tokens=False)

        for masked_text in masked_texts:
            masked_ids = tok.encode(masked_text, add_special_tokens=False)

            # Put the original id back at the masked position; the rest must already match.
            position = masked_ids.index(mask_id)
            masked_ids[position] = plain_ids[position]

            self.assertEqual(masked_ids, plain_ids)

385
386
387
388
389
390
391
392
393
    def test_special_tokens_mask(self):
        """Check that ``special_tokens_mask`` flags exactly the tokens added around the input.

        Three cases are covered: a single input, an input pair, and recomputing the
        mask with ``get_special_tokens_mask`` from ids that already contain special
        tokens.
        """
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True, add_prefix_space=False
        )
        ids_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(ids_w_special))

        # Dropping every position flagged as special must give back the plain encoding.
        filtered_sequence = [
            token_id for token_id, is_special in zip(ids_w_special, special_tokens_mask) if not is_special
        ]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True, add_prefix_space=False
        )
        ids_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(ids_w_special))

        filtered_sequence = [
            token_id for token_id, is_special in zip(ids_w_special, special_tokens_mask) if not is_special
        ]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing with already existing special tokens
        # Both cls and sep are replaced below, so both are compared with the unknown
        # token id (the previous condition tested cls twice and never looked at sep).
        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.sep_token_id == tokenizer.unk_token_id:
            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        ids_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
        special_tokens_mask = tokenizer.get_special_tokens_mask(ids_w_special, already_has_special_tokens=True)
        self.assertEqual(len(special_tokens_mask), len(ids_w_special))
        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)

    def test_padding_to_max_length(self):
        """Check ``encode(..., pad_to_max_length=True)`` for right padding, left padding and no ``max_length``.

        Uses ``self.assertEqual`` rather than bare ``assert`` so the checks still run
        under ``python -O`` and report both operands on failure.
        """
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag
        # set to True
        tokenizer.padding_side = "right"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
        self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag
        # set to True
        tokenizer.padding_side = "left"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
        self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)

        tokenizer.padding_side = "right"
        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_right_length = len(padded_sequence_right)

        tokenizer.padding_side = "left"
        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_left_length = len(padded_sequence_left)

        self.assertEqual(sequence_length, padded_sequence_right_length)
        self.assertEqual(encoded_sequence, padded_sequence_right)
        self.assertEqual(sequence_length, padded_sequence_left_length)
        self.assertEqual(encoded_sequence, padded_sequence_left)

    def test_encode_plus_with_padding(self):
        """Check that ``encode_plus`` pads every returned list consistently on the right and on the left.

        Expected fill values: ``pad_token_id`` for input ids, ``pad_token_type_id``
        for token type ids, 0 for the attention mask and 1 for the special tokens
        mask. Uses ``self.assertEqual`` rather than bare ``assert`` so the checks
        still run under ``python -O``.
        """
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id

        # Unpadded reference encoding.
        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
        input_ids = encoded_sequence["input_ids"]
        token_type_ids = encoded_sequence["token_type_ids"]
        attention_mask = encoded_sequence["attention_mask"]
        special_tokens_mask = encoded_sequence["special_tokens_mask"]
        sequence_length = len(input_ids)

        # Test right padding
        tokenizer.padding_side = "right"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
        self.assertEqual(input_ids + [padding_idx] * padding_size, padded_input_ids)
        self.assertEqual(token_type_ids + [token_type_padding_idx] * padding_size, padded_token_type_ids)
        self.assertEqual(attention_mask + [0] * padding_size, padded_attention_mask)
        self.assertEqual(special_tokens_mask + [1] * padding_size, padded_special_tokens_mask)

        # Test left padding
        tokenizer.padding_side = "left"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
        self.assertEqual([padding_idx] * padding_size + input_ids, padded_input_ids)
        self.assertEqual([token_type_padding_idx] * padding_size + token_type_ids, padded_token_type_ids)
        self.assertEqual([0] * padding_size + attention_mask, padded_attention_mask)
        self.assertEqual([1] * padding_size + special_tokens_mask, padded_special_tokens_mask)
532
533
534
535
536
537
538

    def test_separate_tokenizers(self):
        """Check that creating a second tokenizer does not alter the first one's ``init_kwargs``."""
        # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
        # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.

        tokenizer = self.get_tokenizer(random_argument=True)
        self.assertIs(tokenizer.init_kwargs["random_argument"], True)

        new_tokenizer = self.get_tokenizer(random_argument=False)

        # The first tokenizer must keep its own value after the second one exists.
        self.assertIs(tokenizer.init_kwargs["random_argument"], True)
        self.assertIs(new_tokenizer.init_kwargs["random_argument"], False)
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564

    def test_get_vocab(self):
        """Check ``get_vocab`` against ``len(tokenizer)`` and the converters, before and after adding a token."""
        tok = self.get_tokenizer()

        def check_vocab_consistency():
            # The vocabulary must be a dict as large as the tokenizer, and each
            # entry must map token -> id and id -> token through the converters.
            mapping = tok.get_vocab()
            self.assertIsInstance(mapping, dict)
            self.assertEqual(len(mapping), len(tok))

            for token, index in mapping.items():
                self.assertEqual(tok.convert_tokens_to_ids(token), index)
                self.assertEqual(tok.convert_ids_to_tokens(index), token)

        check_vocab_consistency()

        tok.add_tokens(["asdfasdfasdfasdf"])
        check_vocab_consistency()