# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from typing import Optional, Sequence, Union

import torch


class SentencePieceTokenizer:
    """Tokenizer of sentencepiece.

    Args:
        model_file (str): the path of the tokenizer model
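
    Example:
        >>> # Illustrative only: point model_file at a real sentencepiece
        >>> # model, e.g. <model_dir>/tokenizer.model.
        >>> tokenizer = SentencePieceTokenizer('./tokenizer.model')
        >>> token_ids = tokenizer.encode('hello, world')
        >>> text = tokenizer.decode(token_ids)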
    """

    def __init__(self, model_file: str):
        from sentencepiece import SentencePieceProcessor
        self.model = SentencePieceProcessor(model_file=model_file)
        self._no_prefix_space_tokens = None

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size()

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_id()

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_id()

    @property
    def no_prefix_space_tokens(self):
        """tokens without prefix space."""
        if self._no_prefix_space_tokens is None:
            vocab = self.model.IdToPiece(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab) if not tok.startswith('▁')
            }
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding."""
        if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        add_bos = False
        add_eos = False
        if s.find('<BOS>') != -1:
            s = s.replace('<BOS>', '')
            add_bos = True
        if s == '<EOS>':
            s = ''
            add_eos = True
        return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): offset for incremental decoding. Defaults to None,
                which means it is not applied.
        Returns:
            str: the decoded text
        """
        if isinstance(t, torch.Tensor):
            t = t.tolist()
        t = t[offset:]
        out_string = self.model.Decode(t)
        if offset:
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str or Sequence[str]): prompts
        Returns:
            an attribute-style dict whose input_ids field holds the token ids
        """
        import addict
        add_bos = False
        add_eos = False

        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
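        # Wrap the ids in an attribute-style dict so that callers can use
        # `output.input_ids`, mirroring the huggingface tokenizer output.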
        return addict.Dict(input_ids=input_ids)


class HuggingFaceTokenizer:
    """Tokenizer of sentencepiece.

    Args:
        model_dir (str): the directory of the tokenizer model
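
    Example:
        >>> # Illustrative only: model_dir is a hypothetical local
        >>> # directory containing the huggingface tokenizer files.
        >>> tokenizer = HuggingFaceTokenizer('./llama-7b')
        >>> token_ids = tokenizer.encode('hello, world')
        >>> text = tokenizer.decode(token_ids)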
    """

    def __init__(self, model_dir: str):
        from transformers import AutoTokenizer, LlamaTokenizerFast
        model_file = osp.join(model_dir, 'tokenizer.model')
        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
        model_file_exists = osp.exists(model_file)
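        # Without a prebuilt tokenizer.json, transformers converts the slow
        # sentencepiece tokenizer to the fast format on the fly, which can
        # take a while for large vocabularies.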
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            print('WARNING: Can not find tokenizer.json. '
                  'It may take long time to initialize the tokenizer.')
        self.model = AutoTokenizer.from_pretrained(model_dir,
                                                   trust_remote_code=True)
        self.need_padding = isinstance(self.model, LlamaTokenizerFast)
        self._no_prefix_space_tokens = None
        # save tokenizer.json to reuse
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            if hasattr(self.model, 'backend_tokenizer'):
                self.model.backend_tokenizer.save(backend_tokenizer_file)

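        # Some models leave eos_token_id unset in the tokenizer config;
        # fall back to the value recorded in generation_config.json.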
        if self.model.eos_token_id is None:
            generation_config_file = osp.join(model_dir,
                                              'generation_config.json')
            with open(generation_config_file, 'r') as f:
                cfg = json.load(f)
                self.model.eos_token_id = cfg['eos_token_id']

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    @property
    def no_prefix_space_tokens(self):
        """tokens without prefix space."""
        if self._no_prefix_space_tokens is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab) if not tok.startswith('▁')
            }
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding."""
        if self.need_padding and len(
                tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        add_special_tokens = False
        if s.find('<BOS>') != -1:
            s = s.replace('<BOS>', '<s>')
        if s == '<EOS>':
            s = '</s>'
        if len(s) == 0:
            add_special_tokens = True
        return self.model.encode(s, add_special_tokens=add_special_tokens)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): offset for incremental decoding. Defaults to None,
                which means it is not applied.
        Returns:
            str: the decoded text
        """
        skip_special_tokens = True
        t = t[offset:]
        out_string = self.model.decode(t,
                                       skip_special_tokens=skip_special_tokens)
        if offset:
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str or Sequence[str]): prompts
        Returns:
            a BatchEncoding whose input_ids field holds the token ids
        """
        add_special_tokens = False
        return self.model(s, add_special_tokens=add_special_tokens)


class Tokenizer:
    """Tokenize prompts or de-tokenize tokens into texts.

    Args:
        model_file (str): the path of the tokenizer model, or the directory
            that contains it
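
    Example:
        >>> # Illustrative only: the path below is a placeholder for a
        >>> # local model directory or a sentencepiece tokenizer.model.
        >>> tokenizer = Tokenizer('./llama-7b')
        >>> token_ids = tokenizer.encode('hello, world')
        >>> text = tokenizer.decode(token_ids)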
    """

    def __init__(self, model_file: str):
        if model_file.endswith('.model'):
            model_folder = osp.split(model_file)[0]
        else:
            model_folder = model_file
            model_file = osp.join(model_folder, 'tokenizer.model')
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')

        model_file_exists = osp.exists(model_file)
        config_exists = osp.exists(tokenizer_config_file)
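        # Prefer the huggingface tokenizer whenever a tokenizer_config.json
        # is present or the raw sentencepiece model is missing; otherwise
        # fall back to sentencepiece directly.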
        use_hf_model = config_exists or not model_file_exists

        if not use_hf_model:
            self.model = SentencePieceTokenizer(model_file)
        else:
            self.model = HuggingFaceTokenizer(model_folder)

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        return self.model.encode(s)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): offset for incremental decoding. Defaults to None,
                which means it is not applied.
        Returns:
            str: the decoded text
        """
        return self.model.decode(t, offset)

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (str or Sequence[str]): prompts
        Returns:
            an object whose input_ids field holds the token ids
        """
        return self.model(s)
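

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): './llama-7b' is a
    # placeholder for a local directory that holds either huggingface
    # tokenizer files or a sentencepiece `tokenizer.model`.
    tokenizer = Tokenizer('./llama-7b')
    token_ids = tokenizer.encode('hello, world')
    print(token_ids)
    print(tokenizer.decode(token_ids))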