"src/turbomind/layers/FfnFP8Weight.h" did not exist on "fe46dac2c2ea1a988929fba05e9d3d3c9b11dfd7"
test_tokenization_pegasus.py 10.8 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
16
import unittest

17
from transformers import PegasusTokenizer, PegasusTokenizerFast
18
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
19
from transformers.utils import cached_property
20

Yih-Dar's avatar
Yih-Dar committed
21
from ...test_tokenization_common import TokenizerTesterMixin
22
23


24
25
26
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")


27
28
@require_sentencepiece
@require_tokenizers
29
30
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = PegasusTokenizer
31
32
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True
33
    test_sentencepiece = True
34
35
36
37

    def setUp(self):
        """Create a tokenizer from the SentencePiece fixture and save it to ``self.tmpdirname``.

        The saved files are what ``get_tokenizer`` and the slow/fast comparison
        tests later load with ``from_pretrained(self.tmpdirname)``.
        """
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
41
42

    @cached_property
    def _large_tokenizer(self):
        """Tokenizer loaded from the ``google/pegasus-large`` checkpoint.

        Wrapped in ``cached_property`` so repeated accesses on the same test
        instance reuse the loaded tokenizer.
        """
        return PegasusTokenizer.from_pretrained("google/pegasus-large")

    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
        """Load the tokenizer saved in ``self.tmpdirname``.

        Any keyword arguments are forwarded unchanged to
        ``PegasusTokenizer.from_pretrained``.
        """
        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
48
49
50
51

    def get_input_output_texts(self, tokenizer):
        """Return the ``(input_text, output_text)`` pair for this test class.

        Both elements are the same sentence; the ``tokenizer`` argument is
        accepted but not consulted.
        """
        sample_text = "This is a test"
        return (sample_text, sample_text)

52
53
54
55
56
    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.

        Uses the ``</s>`` token, which is expected to map to id 1 and back.
        """
        token = "</s>"
        token_id = 1

        self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
        self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
59
60
61
62
63
64

    def test_get_vocab(self):
        """Check selected entries and the total size of the fixture vocabulary.

        The key order of ``get_vocab()`` is inspected positionally: the first
        two keys, the key at index 104, and the overall key count.
        """
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<pad>")
        self.assertEqual(vocab_keys[1], "</s>")
        self.assertEqual(vocab_keys[104], "<unk_102>")
        self.assertEqual(len(vocab_keys), 1_103)
67
68
69
70

    def test_vocab_size(self):
        """The tokenizer saved by ``setUp`` must report a ``vocab_size`` of 1103."""
        fixture_tokenizer = self.get_tokenizer()
        self.assertEqual(fixture_tokenizer.vocab_size, 1_103)

71
72
73
    def test_mask_tokens_rust_pegasus(self):
        """The slow and fast tokenizers must produce identical ids for text with special tokens.

        Both tokenizers are loaded from ``self.tmpdirname`` and run with
        ``add_special_tokens=False`` on one sentence containing ``<unk>``,
        mask, ``</s>`` and ``<pad>`` markers.
        """
        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
        raw_input_str = (
            "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
            " </s> <pad> <pad> <pad>"
        )
        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        self.assertListEqual(py_ids, rust_ids)

    def test_large_mask_tokens(self):
        """Check the ids the large tokenizer emits for a sentence using both mask tokens."""
        # <mask_1> masks whole sentence while <mask_2> masks single word
        masked_text = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
        expected_ids = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]

        large_tokenizer = self._large_tokenizer
        encoding = large_tokenizer([masked_text], return_tensors=None)
        self.assertListEqual(expected_ids, encoding.input_ids[0])

    def test_large_tokenizer_settings(self):
        """Verify special-token ids, offset, max length and a sample encoding of the large tokenizer."""
        tokenizer = self._large_tokenizer
        # The tracebacks for the following asserts are **better** without messages or self.assertEqual
        assert tokenizer.vocab_size == 96103
        assert tokenizer.pad_token_id == 0
        assert tokenizer.eos_token_id == 1
        assert tokenizer.offset == 103
        assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
        assert tokenizer.unk_token == "<unk>"
        assert tokenizer.model_max_length == 1024
        raw_input_str = "To ensure a smooth flow of bank resolutions."
        desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
        ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
        self.assertListEqual(desired_result, ids)
        # The first four ids are the reserved pad / eos / mask tokens.
        assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]
105
106

    @require_torch
    def test_large_seq2seq_truncation(self):
        """Source and target batches must be truncated and padded to the expected shapes.

        The overly long source text is expected to come out 1024 tokens wide,
        and the targets are capped by the explicit ``max_length=5``.
        """
        src_texts = ["This is going to be way too long." * 150, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
        batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
        targets = self._large_tokenizer(
            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
        )

        assert batch.input_ids.shape == (2, 1024)
        assert batch.attention_mask.shape == (2, 1024)
        assert targets["input_ids"].shape == (2, 5)
        assert len(batch) == 2  # input_ids, attention_mask.
Vasudev Gupta's avatar
Vasudev Gupta committed
119

120
121
122
123
124
125
126
127
128
129
130
131
    @slow
    def test_tokenizer_integration(self):
        # Reference encoding for three sequences: the shorter rows are padded with 0
        # and the padded positions carry 0 in the attention mask.
        # NOTE(review): the input texts are not visible here; presumably they are
        # supplied by the shared helper called below - confirm against the mixin.
        # fmt: off
        expected_encoding = {'input_ids': [[38979, 143, 18485, 606, 130, 26669, 87686, 121, 54189, 1129, 111, 26669, 87686, 121, 9114, 14787, 121, 13249, 158, 592, 956, 121, 14621, 31576, 143, 62613, 108, 9688, 930, 43430, 11562, 62613, 304, 108, 11443, 897, 108, 9314, 17415, 63399, 108, 11443, 7614, 18316, 118, 4284, 7148, 12430, 143, 1400, 25703, 158, 111, 4284, 7148, 11772, 143, 21297, 1064, 158, 122, 204, 3506, 1754, 1133, 14787, 1581, 115, 33224, 4482, 111, 1355, 110, 29173, 317, 50833, 108, 20147, 94665, 111, 77198, 107, 1], [110, 62613, 117, 638, 112, 1133, 121, 20098, 1355, 79050, 13872, 135, 1596, 53541, 1352, 141, 13039, 5542, 124, 302, 518, 111, 268, 2956, 115, 149, 4427, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [139, 1235, 2799, 18289, 17780, 204, 109, 9474, 1296, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # noqa: E501
        # fmt: on

        # Delegate the comparison to the shared mixin helper; the revision hash
        # pins the exact checkpoint files the expected ids were produced with.
        self.tokenizer_integration_test_util(
            expected_encoding=expected_encoding,
            model_name="google/bigbird-pegasus-large-arxiv",
            revision="ba85d0851d708441f91440d509690f1ab6353415",
        )

132
133
134
    # @unittest.skip("We have to use from_slow")
    # def test_added_tokens_serialization(self):
    #     pass
135

Vasudev Gupta's avatar
Vasudev Gupta committed
136
137
138
139
140
141
142

@require_sentencepiece
@require_tokenizers
class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True
143
    test_sentencepiece = True
Vasudev Gupta's avatar
Vasudev Gupta committed
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

    def setUp(self):
        """Save a fixture-based tokenizer with BigBird-specific options to ``self.tmpdirname``.

        The tokenizer is built with ``offset=0``, no sentence-level mask token
        and ``[MASK]`` as its mask token.
        """
        super().setUp()

        # We have a SentencePiece fixture for testing
        PegasusTokenizer(
            SAMPLE_VOCAB,
            offset=0,
            mask_token_sent=None,
            mask_token="[MASK]",
        ).save_pretrained(self.tmpdirname)

    @cached_property
    def _large_tokenizer(self):
        """Tokenizer loaded from the ``google/bigbird-pegasus-large-arxiv`` checkpoint."""
        checkpoint_name = "google/bigbird-pegasus-large-arxiv"
        return PegasusTokenizer.from_pretrained(checkpoint_name)

    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
        """Load the tokenizer stored in ``self.tmpdirname``, forwarding ``kwargs`` to the loader."""
        saved_dir = self.tmpdirname
        return PegasusTokenizer.from_pretrained(saved_dir, **kwargs)

    def get_input_output_texts(self, tokenizer):
        """Return the ``(input_text, output_text)`` pair for this test class.

        The two elements are the same sentence; ``tokenizer`` is accepted but
        not used.
        """
        text = "This is a test"
        return (text, text)

    def test_mask_tokens_rust_pegasus(self):
        """The slow and fast tokenizers must produce identical ids for text with special tokens.

        Both tokenizers are loaded from ``self.tmpdirname`` and run with
        ``add_special_tokens=False`` on one sentence containing ``<unk>``,
        ``[MASK]``, ``</s>`` and ``<pad>`` markers.
        """
        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
        raw_input_str = (
            "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
            " <pad> <pad> <pad>"
        )
        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
        self.assertListEqual(py_ids, rust_ids)

    @require_torch
    def test_large_seq2seq_truncation(self):
        """Source and target batches must be truncated and padded to the expected shapes.

        The overly long source text is expected to come out 4096 tokens wide,
        and the targets are capped by the explicit ``max_length=5``.
        """
        src_texts = ["This is going to be way too long." * 1000, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
        batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
        targets = self._large_tokenizer(
            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
        )

        assert batch.input_ids.shape == (2, 4096)
        assert batch.attention_mask.shape == (2, 4096)
        assert targets["input_ids"].shape == (2, 5)
        assert len(batch) == 2  # input_ids, attention_mask.

    def test_equivalence_to_orig_tokenizer(self):
        """
        To run with original TF tokenizer:

        !wget https://github.com/google-research/bigbird/raw/master/bigbird/vocab/pegasus.model
        !pip install tensorflow-text

        import tensorflow.compat.v2 as tf
        import tensorflow_text as tft

        VOCAB_FILE = "./pegasus.model"

        tf.enable_v2_behavior()

        test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
        tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(VOCAB_FILE, "rb").read())

        tokenizer.tokenize(test_str)
        """

        test_str = (
            "This is an example string that is used to test the original TF implementation against the HF"
            " implementation"
        )

        token_ids = self._large_tokenizer(test_str).input_ids

        # Reference ids; per the docstring recipe these come from the original
        # TF SentencePiece tokenizer, with the trailing 1 being the ``</s>`` id.
        self.assertListEqual(
            token_ids,
            [182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1],
        )
218
219
220
221

    # @unittest.skip("We have to use from_slow")
    # def test_added_tokens_serialization(self):
    #     pass