# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import unittest

from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_ftfy, require_tokenizers

from .test_tokenization_common import TokenizerTesterMixin


@require_tokenizers
class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CLIPTokenizer
    rust_tokenizer_class = CLIPTokenizerFast
    test_rust_tokenizer = True
    from_pretrained_kwargs = {}
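    # CLIP is not used as a seq2seq model, so the seq2seq-specific common tests are skipped.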
    test_seq2seq = False

    def setUp(self):
        super().setUp()
        # temporary addition: to test the new slow to fast converter
        self.tokenizers_list = [(CLIPTokenizerFast, "SaulLu/clip-vit-base-patch32", {})]

        # fmt: off
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
        # fmt: on
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
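        # BPE merge rules in priority order (the first entry is the merges-file header);
        # "</w>" marks a word-final symbol, e.g. "lo w</w>" merges "lo" + "w</w>" into "low</w>".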
        merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
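        # The expected ids are the positions of each token in the `vocab` list above:
        # "lo"=10, "w"=2, "er</w>"=16, "n"=9, "e"=3, "<unk>"=20.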
        input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    @require_ftfy
    def test_check_encoding_slow_fast(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
                text_tokenized_s = tokenizer_s.tokenize(text)
                text_tokenized_r = tokenizer_r.tokenize(text)

                self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on an example containing a character (Latin Small Letter A
                # with Tilde) encoded in 2 different ways
                text = "xa\u0303y" + " " + "x\xe3y"
                text_tokenized_s = tokenizer_s.tokenize(text)
                text_tokenized_r = tokenizer_r.tokenize(text)

                self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on unicode of space type
                spaces_unicodes = [
                    "\u0009",  # (horizontal tab, '\t')
                    "\u000B",  # (vertical tab)
                    "\u000C",  # (form feed)
                    "\u0020",  # (space, ' ')
                    "\u200E",  # (left-to-right mark):w
                    "\u200F",  # (right-to-left mark)
                ]
                for unicode_seq in spaces_unicodes:
                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)

                    self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on unicode of line break type
                line_break_unicodes = [
                    "\u000A",  # (line feed, '\n')
                    "\r\n",  # (carriage return and line feed, '\r\n')
                    "\u000D",  # (carriage return, '\r')
                    "\r",  # (carriage return, '\r')
                    "\u000D",  # (carriage return, '\r')
                    "\u2028",  # (line separator)
                    "\u2029",  # (paragraph separator)
                    # "\u0085", # (next line)
                ]

                # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
                # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
                # space (and thus into an empty list).

                for unicode_seq in line_break_unicodes:
                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)

                    self.assertListEqual(text_tokenized_s, text_tokenized_r)

    def test_offsets_mapping_with_different_add_prefix_space_argument(self):
        # Test that the offsets are correctly adapted to the `add_prefix_space` argument
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                text = f"{text_of_1_token} {text_of_1_token}"

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name,
                    use_fast=True,
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
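                # For "hello hello", the first token should span (0, 5) and the second (6, 11);
                # the offsets skip the separating space.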
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                text = f" {text}"

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name,
                    use_fast=True,
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
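                # With the leading space, both offsets shift right by one: (1, 6) and (7, 12).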
                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

    def test_log_warning(self):
        # Test related to the breaking change introduced in transformers v4.17.0
        # We need to check that an error is raised when the user tries to load a previous version of the tokenizer.
        with self.assertRaises(ValueError) as context:
            self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")

        self.assertTrue(
            context.exception.args[0].startswith(
                "The `backend_tokenizer` provided does not match the expected format."
            )
        )

    @require_ftfy
    def test_tokenization_python_rust_equals(self):
        super().test_tokenization_python_rust_equals()

    # override common test
    def test_added_tokens_do_lower_case(self):
        # CLIP always lowercases letters
        pass