test_tokenization_pegasus.py 10.8 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
16
import unittest

17
from transformers import PegasusTokenizer, PegasusTokenizerFast
18
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
19
from transformers.utils import cached_property
20

Yih-Dar's avatar
Yih-Dar committed
21
from ...test_tokenization_common import TokenizerTesterMixin
22
23


24
25
26
# SentencePiece model fixture, located through ``get_tests_dir``; the ``setUp`` methods below build their
# small test tokenizers from it.
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")


27
28
@require_sentencepiece
@require_tokenizers
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # Tokenizer tests for Pegasus, built on a small SentencePiece fixture and, for the
    # ``test_large_*`` cases, on the ``google/pegasus-large`` checkpoint.

    # Slow tokenizer class and its Rust-backed counterpart; both are loaded in
    # ``test_mask_tokens_rust_pegasus`` to compare their outputs.
    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    # Feature switches, presumably read by ``TokenizerTesterMixin`` (not visible here) -- TODO confirm.
    test_rust_tokenizer = True
    test_sentencepiece = True
34
35
36
37

    def setUp(self):
        """Build a tokenizer from the SentencePiece fixture and save it to ``self.tmpdirname``."""
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
        # ``get_tokenizer`` and the slow/fast comparison test reload the tokenizer from this directory.
        tokenizer.save_pretrained(self.tmpdirname)
41
42

    @cached_property
    def _large_tokenizer(self):
        """Return the tokenizer loaded from the ``google/pegasus-large`` checkpoint.

        Wrapped in ``cached_property``, presumably so the checkpoint is loaded once per
        test instance rather than on every access -- TODO confirm against ``transformers.utils``.
        """
        return PegasusTokenizer.from_pretrained("google/pegasus-large")

    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
        """Load the tokenizer saved by ``setUp``, forwarding ``kwargs`` to ``from_pretrained``."""
        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
48
49
50
51

    def get_input_output_texts(self, tokenizer):
        """Return an ``(input_text, output_text)`` pair; the ``tokenizer`` argument is not used."""
        sample_text = "This is a test"
        return (sample_text, sample_text)

52
53
54
55
56
    def test_convert_token_and_id(self):
        """Check that ``</s>`` maps to id 1 and back.

        Goes through the public ``convert_tokens_to_ids`` and ``convert_ids_to_tokens``
        methods of the tokenizer saved by ``setUp``.
        """
        token = "</s>"
        token_id = 1

        self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
        self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
59
60
61
62
63
64

    def test_get_vocab(self):
        """Check the first, second and last keys of ``get_vocab()`` and the total number of keys."""
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<pad>")
        self.assertEqual(vocab_keys[1], "</s>")
        self.assertEqual(vocab_keys[-1], "<unk_102>")
        self.assertEqual(len(vocab_keys), 1_104)
67
68
69
70

    def test_vocab_size(self):
        """The tokenizer saved by ``setUp`` must report a ``vocab_size`` of 1,103."""
        fixture_tokenizer = self.get_tokenizer()
        self.assertEqual(fixture_tokenizer.vocab_size, 1_103)

71
72
73
    def test_mask_tokens_rust_pegasus(self):
        """Check that the slow and fast tokenizers produce identical ids.

        The input mixes plain words with literal ``<unk>``, ``<mask_1>``, ``<mask_2>``,
        ``</s>`` and ``<pad>`` markers.
        """
        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
        raw_input_str = (
            "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
            " </s> <pad> <pad> <pad>"
        )
        # Both tokenizers are called with identical arguments so only the implementation differs.
        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        self.assertListEqual(py_ids, rust_ids)

    def test_large_mask_tokens(self):
        """Encode a sentence containing both mask tokens with the large tokenizer and compare the ids."""
        # <mask_1> masks whole sentence while <mask_2> masks single word
        masked_text = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
        expected_ids = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
        encoding = self._large_tokenizer([masked_text], return_tensors=None)
        self.assertListEqual(expected_ids, encoding.input_ids[0])

    def test_large_tokenizer_settings(self):
        """Check the large tokenizer's configuration values and the encoding of one sentence."""
        tokenizer = self._large_tokenizer
        # The tracebacks for the following asserts are **better** without messages or self.assertEqual
        assert tokenizer.vocab_size == 96103
        assert tokenizer.pad_token_id == 0
        assert tokenizer.eos_token_id == 1
        assert tokenizer.offset == 103
        assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
        assert tokenizer.unk_token == "<unk>"
        assert tokenizer.model_max_length == 1024
        raw_input_str = "To ensure a smooth flow of bank resolutions."
        desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
        ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
        self.assertListEqual(desired_result, ids)
        # The four lowest ids are the pad, end-of-sequence and the two mask tokens.
        assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]
105
106

    @require_torch
    def test_large_seq2seq_truncation(self):
        """Check the tensor shapes produced when encoding with padding and truncation.

        The first source text is one sentence repeated 150 times; the asserted width of
        the source batch is 1024. Targets are encoded with ``max_length=5``.
        """
        src_texts = ["This is going to be way too long." * 150, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
        batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
        targets = self._large_tokenizer(
            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
        )

        assert batch.input_ids.shape == (2, 1024)
        assert batch.attention_mask.shape == (2, 1024)
        assert targets["input_ids"].shape == (2, 5)
        assert len(batch) == 2  # input_ids, attention_mask.
Vasudev Gupta's avatar
Vasudev Gupta committed
119

120
121
122
123
124
125
126
127
128
129
130
131
    @slow
    def test_tokenizer_integration(self):
        """Run the shared integration check against a pinned revision of a hub checkpoint.

        The expected encoding below is handed to ``tokenizer_integration_test_util``
        (not defined in this file) together with the model name and revision.
        """
        # Three expected sequences; the second and third ``input_ids`` rows end in runs of 0 and their
        # ``attention_mask`` rows are 0 at the same positions.
        # fmt: off
        expected_encoding = {'input_ids': [[38979, 143, 18485, 606, 130, 26669, 87686, 121, 54189, 1129, 111, 26669, 87686, 121, 9114, 14787, 121, 13249, 158, 592, 956, 121, 14621, 31576, 143, 62613, 108, 9688, 930, 43430, 11562, 62613, 304, 108, 11443, 897, 108, 9314, 17415, 63399, 108, 11443, 7614, 18316, 118, 4284, 7148, 12430, 143, 1400, 25703, 158, 111, 4284, 7148, 11772, 143, 21297, 1064, 158, 122, 204, 3506, 1754, 1133, 14787, 1581, 115, 33224, 4482, 111, 1355, 110, 29173, 317, 50833, 108, 20147, 94665, 111, 77198, 107, 1], [110, 62613, 117, 638, 112, 1133, 121, 20098, 1355, 79050, 13872, 135, 1596, 53541, 1352, 141, 13039, 5542, 124, 302, 518, 111, 268, 2956, 115, 149, 4427, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [139, 1235, 2799, 18289, 17780, 204, 109, 9474, 1296, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # noqa: E501
        # fmt: on

        # NOTE(review): this Pegasus test class points at the BigBird-Pegasus checkpoint -- confirm that is intended.
        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
            model_name="google/bigbird-pegasus-large-arxiv",
            revision="ba85d0851d708441f91440d509690f1ab6353415",
        )

132
133
134
135
136
137
138
139
    @unittest.skip("Need to fix this after #26538")
    def test_training_new_tokenizer(self):
        pass

    @unittest.skip("Need to fix this after #26538")
    def test_training_new_tokenizer_with_special_tokens_change(self):
        pass

Vasudev Gupta's avatar
Vasudev Gupta committed
140
141
142
143
144
145
146

@require_sentencepiece
@require_tokenizers
class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # Tokenizer tests for the BigBird-Pegasus configuration: the fixture tokenizer is built with
    # ``offset=0``, no sentence mask token and ``[MASK]`` as the mask token.

    # Slow tokenizer class and its Rust-backed counterpart; both are loaded in
    # ``test_mask_tokens_rust_pegasus`` to compare their outputs.
    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    # Feature switches, presumably read by ``TokenizerTesterMixin`` (not visible here) -- TODO confirm.
    test_rust_tokenizer = True
    test_sentencepiece = True
Vasudev Gupta's avatar
Vasudev Gupta committed
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

    def setUp(self):
        """Save a BigBird-style Pegasus tokenizer, built from the fixture, into ``self.tmpdirname``."""
        super().setUp()

        # We have a SentencePiece fixture for testing
        bigbird_options = {"offset": 0, "mask_token_sent": None, "mask_token": "[MASK]"}
        PegasusTokenizer(SAMPLE_VOCAB, **bigbird_options).save_pretrained(self.tmpdirname)

    @cached_property
    def _large_tokenizer(self):
        """Return the tokenizer loaded from the BigBird-Pegasus arXiv checkpoint."""
        checkpoint = "google/bigbird-pegasus-large-arxiv"
        return PegasusTokenizer.from_pretrained(checkpoint)

    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
        """Reload the tokenizer saved by ``setUp``; ``kwargs`` go straight to ``from_pretrained``."""
        saved_dir = self.tmpdirname
        return PegasusTokenizer.from_pretrained(saved_dir, **kwargs)

    def get_input_output_texts(self, tokenizer):
        """Return an ``(input_text, output_text)`` pair; the ``tokenizer`` argument is not used."""
        sample_text = "This is a test"
        return (sample_text, sample_text)

    def test_mask_tokens_rust_pegasus(self):
        """Check that the slow and fast tokenizers produce identical ids.

        The input mixes plain words with literal ``<unk>``, ``[MASK]``, ``</s>`` and
        ``<pad>`` markers.
        """
        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
        raw_input_str = (
            "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
            " <pad> <pad> <pad>"
        )
        # Both tokenizers are called with identical arguments so only the implementation differs.
        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        self.assertListEqual(py_ids, rust_ids)

    @require_torch
    def test_large_seq2seq_truncation(self):
        """Check the tensor shapes produced when encoding with padding and truncation.

        The first source text is one sentence repeated 1000 times; the asserted width of
        the source batch is 4096. Targets are encoded with ``max_length=5``.
        """
        src_texts = ["This is going to be way too long." * 1000, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
        batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
        targets = self._large_tokenizer(
            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
        )

        assert batch.input_ids.shape == (2, 4096)
        assert batch.attention_mask.shape == (2, 4096)
        assert targets["input_ids"].shape == (2, 5)
        assert len(batch) == 2  # input_ids, attention_mask.

    def test_equivalence_to_orig_tokenizer(self):
        """
        Compare the large tokenizer's ids for a fixed sentence with a hard-coded reference list.

        To run with original TF tokenizer:

        !wget https://github.com/google-research/bigbird/raw/master/bigbird/vocab/pegasus.model
        !pip install tensorflow-text

        import tensorflow.compat.v2 as tf
        import tensorflow_text as tft

        VOCAB_FILE = "./pegasus.model"

        tf.enable_v2_behavior()

        test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
        tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(VOCAB_FILE, "rb").read())

        tokenizer.tokenize(test_str)
        """

        test_str = (
            "This is an example string that is used to test the original TF implementation against the HF"
            " implementation"
        )

        token_ids = self._large_tokenizer(test_str).input_ids

        # Reference ids, presumably obtained with the TF recipe in the docstring -- TODO confirm.
        self.assertListEqual(
            token_ids,
            [182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1],
        )