test_tokenization_speech_to_text_2.py
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import tempfile
import unittest

from transformers.models.speech_to_text_2 import Speech2Text2Tokenizer
from transformers.models.speech_to_text_2.tokenization_speech_to_text_2 import VOCAB_FILES_NAMES
from transformers.testing_utils import is_pt_tf_cross_test

from .test_tokenization_common import TokenizerTesterMixin


class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = Speech2Text2Tokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()

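        # note: "@@" marks a BPE continuation piece in this toy vocab; decode() is
        # expected to merge such a piece with the token that follows it
        # (see test_tokenizer_decode below)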
        vocab = "<s> <pad> </s> <unk> here@@ a couple of@@ words for the vocab".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

    def test_get_vocab(self):
        vocab_keys = list(self.get_tokenizer().get_vocab().keys())

        self.assertEqual(vocab_keys[0], "<s>")
        self.assertEqual(vocab_keys[1], "<pad>")
        self.assertEqual(vocab_keys[-1], "vocab")
        self.assertEqual(len(vocab_keys), 12)

    def test_vocab_size(self):
        self.assertEqual(self.get_tokenizer().vocab_size, 12)

    def test_tokenizer_decode(self):
        tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)

        # make sure "@@" continuation markers are correctly merged during decoding
        token_ids = [4, 6, 8, 7, 10]  # ["here@@", "couple", "words", "of@@", "the"]
        output_string = tokenizer.decode(token_ids)

        self.assertEqual(output_string, "herecouple words ofthe")
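
    # additional decode sanity check: a minimal sketch assuming the tokenizer's
    # default special tokens (<s>, <pad>, </s>, <unk>) and the standard
    # decode()/batch_decode() API inherited from PreTrainedTokenizerBase
    def test_tokenizer_decode_skip_special_tokens(self):
        tokenizer = Speech2Text2Tokenizer.from_pretrained(self.tmpdirname)

        # [0, 4, 6, 2] -> ["<s>", "here@@", "couple", "</s>"]; skipping the special
        # tokens leaves "here@@" + "couple", which should merge into "herecouple"
        self.assertEqual(tokenizer.decode([0, 4, 6, 2], skip_special_tokens=True), "herecouple")

        # batch_decode should decode each sequence independently
        # [4, 6] -> "herecouple"; [8, 10] -> ["words", "the"] -> "words the"
        self.assertEqual(tokenizer.batch_decode([[4, 6], [8, 10]]), ["herecouple", "words the"])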

    # currently the tokenizer can only do decoding, not encoding
    def test_add_special_tokens(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_add_tokens_tokenizer(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_added_tokens_do_lower_case(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_batch_encode_plus_batch_sequence_length(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_batch_encode_plus_overflowing_tokens(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_batch_encode_plus_padding(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_call(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_encode_plus_with_padding(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_internal_consistency(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_maximum_encoding_length_pair_input(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_maximum_encoding_length_single_input(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_number_of_added_tokens(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_padding_to_max_length(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_padding_to_multiple_of(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_pickle_tokenizer(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_prepare_for_model(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_pretokenized_inputs(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_right_and_left_padding(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_save_and_load_tokenizer(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_special_tokens_mask(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_special_tokens_mask_input_pairs(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_token_type_ids(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    def test_added_token_are_matched_longest_first(self):
        pass

    # currently the tokenizer can only do decoding, not encoding
    @is_pt_tf_cross_test
    def test_batch_encode_plus_tensors(self):
        pass