# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import unittest

from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_ftfy, require_tokenizers

from ...test_tokenization_common import TokenizerTesterMixin


@require_tokenizers
class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CLIPTokenizer
    rust_tokenizer_class = CLIPTokenizerFast
    test_rust_tokenizer = True
    from_pretrained_kwargs = {}
    test_seq2seq = False

    def setUp(self):
        super().setUp()

        # fmt: off
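        # Toy BPE vocabulary: the "</w>" suffix marks the end-of-word form of a token,
        # following the convention used in CLIP's BPE vocab files.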
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
        # fmt: on
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
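        # Minimal merges file content: a version header followed by one merge rule per line,
        # each rule joining a pair of existing symbols.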
        merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
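        # With the toy merges defined in setUp, "lower" becomes "lo" + "w" + "er</w>"; "newer" has
        # no merge for its leading characters, so it falls back to "n" + "e" + "w" + "er</w>".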
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
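        # Ids are the tokens' positions in the toy vocab defined in setUp; 20 is "<unk>".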
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    @require_ftfy
    def test_check_encoding_slow_fast(self):
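        # The slow tokenizer relies on ftfy for text normalization when it is installed, so
        # slow/fast equivalence is only expected to hold with ftfy available (hence @require_ftfy).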
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
                text_tokenized_s = tokenizer_s.tokenize(text)
                text_tokenized_r = tokenizer_r.tokenize(text)

                self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on an example containing a character (Latin Small Letter A
                # with Tilde) encoded in 2 different ways
                text = "xa\u0303y" + " " + "x\xe3y"
                text_tokenized_s = tokenizer_s.tokenize(text)
                text_tokenized_r = tokenizer_r.tokenize(text)

                self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on unicode of space type
                spaces_unicodes = [
                    "\u0009",  # (horizontal tab, '\t')
                    "\u000B",  # (vertical tab)
                    "\u000C",  # (form feed)
                    "\u0020",  # (space, ' ')
                    "\u200E",  # (left-to-right mark):w
                    "\u200F",  # (right-to-left mark)
                ]
                for unicode_seq in spaces_unicodes:
                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)

                    self.assertListEqual(text_tokenized_s, text_tokenized_r)

                # Test that the tokenization is identical on unicode of line break type
                line_break_unicodes = [
                    "\u000A",  # (line feed, '\n')
                    "\r\n",  # (carriage return and line feed, '\r\n')
                    "\u000D",  # (carriage return, '\r')
                    "\r",  # (carriage return, '\r')
                    "\u000D",  # (carriage return, '\r')
                    "\u2028",  # (line separator)
                    "\u2029",  # (paragraph separator)
                    # "\u0085", # (next line)
                ]

                # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
                # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
                # space (and thus into an empty list).

                for unicode_seq in line_break_unicodes:
                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)

                    self.assertListEqual(text_tokenized_s, text_tokenized_r)

    def test_offsets_mapping_with_different_add_prefix_space_argument(self):
        # Check that the offset mappings correctly account for the `add_prefix_space` argument
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                text = f"{text_of_1_token} {text_of_1_token}"

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name,
                    use_fast=True,
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
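                # Without a leading space, the first token spans [0, len) and the second token
                # starts right after the single separating space.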
                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

                text = f" {text}"

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name,
                    use_fast=True,
                )
                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
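                # With the prepended space, both offsets shift one position to the right.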
                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
                self.assertEqual(
                    encoding.offset_mapping[1],
                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

    def test_log_warning(self):
        # Test related to the breaking change introduced in transformers v4.17.0
        # We need to check that an error is raised when the user tries to load a previous version of the tokenizer.
        with self.assertRaises(ValueError) as context:
            self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")

        self.assertTrue(
            context.exception.args[0].startswith(
                "The `backend_tokenizer` provided does not match the expected format."
            )
        )

    @require_ftfy
    def test_tokenization_python_rust_equals(self):
        super().test_tokenization_python_rust_equals()

    # overwrite common test
    def test_added_tokens_do_lower_case(self):
        # CLIP always lowercases letters
        pass