# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import unittest

from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow

from .test_tokenization_common import TokenizerTesterMixin


@require_tokenizers
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = RobertaTokenizer
    rust_tokenizer_class = RobertaTokenizerFast
    test_rust_tokenizer = True
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
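        # "\u0120" is the printable marker ("Ġ") that byte-level BPE uses in place of a
        # leading space, so an entry like "\u0120low" stands for " low" in this toy vocab.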
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
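        # Ids follow the order of the toy vocab defined in setUp; the trailing 19 is "<unk>".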
        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
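        # Note: this helper is not collected by the unittest runner because its name
        # does not start with "test".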
        tokenizer = self.get_tokenizer()

        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
        self.assertListEqual(
            tokenizer.encode("Hello world! c茅c茅 herlolip 418", add_special_tokens=False),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
        )

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("roberta-base")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_text_from_decode = tokenizer.encode(
            "sequence builders", add_special_tokens=True, add_prefix_space=False
        )
        encoded_pair_from_decode = tokenizer.encode(
            "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
        )

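        # RoBERTa formats a single sequence as "<s> A </s>" and a pair as
        # "<s> A </s></s> B </s>"; both should match encode(..., add_special_tokens=True).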
        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

    def test_space_encoding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Encode this sequence."
        space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]

        # Testing encoder arguments
        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertNotEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertEqual(first_char, space_encoding)

        tokenizer.add_special_tokens({"bos_token": "<s>"})
        encoded = tokenizer.encode(sequence, add_special_tokens=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
        self.assertNotEqual(first_char, space_encoding)

        # Testing spaces after special tokens
        mask = "<mask>"
        tokenizer.add_special_tokens(
            {"mask_token": AddedToken(mask, lstrip=True, rstrip=False)}
        )  # mask token has a left space
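        # With lstrip=True the mask token absorbs the space preceding it in the raw text,
        # so the first character of the token right after "<mask>" tells us whether a
        # space followed the mask.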
        mask_ind = tokenizer.convert_tokens_to_ids(mask)

        sequence = "Encode <mask> sequence"
        sequence_nospace = "Encode <mask>sequence"

        encoded = tokenizer.encode(sequence)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence_nospace)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertNotEqual(first_char, space_encoding)

    def test_pretokenized_inputs(self):
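        # The pretokenized-inputs test from the common mixin is intentionally skipped for
        # this tokenizer by overriding it with a no-op.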
        pass

    def test_embeded_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

                # token_type_ids should be 0 everywhere
                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                # attention_mask should be 1 everywhere, so the sum divided by the length should be 1
                self.assertEqual(
                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
                )

                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])

                # Rust correctly handles the space before the mask while Python doesn't
                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])

                self.assertSequenceEqual(
                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
                self.assertSequenceEqual(
                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )