# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from typing import Optional, Sequence, Union

import torch


class SentencePieceTokenizer:
    """Tokenizer of sentencepiece.

    Args:
        model_file (str): the path of the tokenizer model
    """

    def __init__(self, model_file: str):
        from sentencepiece import SentencePieceProcessor
        self.model = SentencePieceProcessor(model_file=model_file)
        self._no_prefix_space_tokens = None

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size()

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_id()

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_id()

    @property
    def no_prefix_space_tokens(self):
        """tokens without prefix space."""
        if self._no_prefix_space_tokens is None:
            vocab = self.model.IdToPiece(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab) if not tok.startswith('▁')
            }
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding."""
        if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
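
        Examples:
            >>> # an illustrative sketch; the model path is hypothetical
            >>> tokenizer = SentencePieceTokenizer('./tokenizer.model')
            >>> tokenizer.encode('<BOS>hi')  # the bos id is prepended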
        """
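        # '<BOS>' and '<EOS>' are sentinel strings in the prompt; strip them
        # and let sentencepiece insert the corresponding special token ids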
        add_bos = False
        add_eos = False
        if '<BOS>' in s:
            s = s.replace('<BOS>', '')
            add_bos = True
        if s == '<EOS>':
            s = ''
            add_eos = True
        return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): the offset into ``t`` from which decoding starts,
                used for incremental decoding. Defaults to None, which means
                the whole sequence is decoded.
        Returns:
            str: the decoded text
        """
        if isinstance(t, torch.Tensor):
            t = t.tolist()
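        # keep only the tokens that have not been decoded in previous calls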
        t = t[offset:]
        out_string = self.model.Decode(t)
        if offset:
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (Union[str, Sequence[str]]): a prompt or a list of prompts
        Returns:
            addict.Addict: the result, whose ``input_ids`` field holds the
                token ids
        """
        import addict
        add_bos = False
        add_eos = False

        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
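        # wrap the result in an attribute dict to mirror the return structure
        # of the huggingface tokenizer's __call__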
        return addict.Addict(input_ids=input_ids)


class HuggingFaceTokenizer:
    """Tokenizer of sentencepiece.

    Args:
        model_dir (str): the directory of the tokenizer model
    """

    def __init__(self, model_dir: str):
        from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
                                  LlamaTokenizerFast)
        model_file = osp.join(model_dir, 'tokenizer.model')
        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
        model_file_exists = osp.exists(model_file)
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            print('WARNING: Cannot find tokenizer.json. '
                  'It may take a long time to initialize the tokenizer.')
        self.model = AutoTokenizer.from_pretrained(model_dir,
                                                   trust_remote_code=True)
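        # llama-family fast tokenizers drop the leading space when a
        # sub-sequence is decoded alone; flag them so decode() can restore it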
        self.need_padding = isinstance(
            self.model, (LlamaTokenizerFast, CodeLlamaTokenizerFast))
        self._no_prefix_space_tokens = None
        # save tokenizer.json to reuse
        if not osp.exists(backend_tokenizer_file) and model_file_exists:
            if hasattr(self.model, 'backend_tokenizer'):
                self.model.backend_tokenizer.save(backend_tokenizer_file)

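        # some models define eos_token_id only in generation_config.json;
        # fall back to it when the tokenizer itself does not provide one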
        if self.model.eos_token_id is None:
            generation_config_file = osp.join(model_dir,
                                              'generation_config.json')
            with open(generation_config_file, 'r') as f:
                cfg = json.load(f)
                self.model.eos_token_id = cfg['eos_token_id']

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    @property
    def no_prefix_space_tokens(self):
        """tokens without prefix space."""
        if self._no_prefix_space_tokens is None:
            vocab = self.model.convert_ids_to_tokens(
                list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {
                i
                for i, tok in enumerate(vocab) if not tok.startswith('▁')
            }
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        """maybe add prefix space for incremental decoding."""
        if self.need_padding and len(
                tokens) and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
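        # map the '<BOS>'/'<EOS>' sentinels to the literal llama-style
        # special tokens; request special tokens only for an empty prompt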
        add_special_tokens = False
        if '<BOS>' in s:
            s = s.replace('<BOS>', '<s>')
        if s == '<EOS>':
            s = '</s>'
        if len(s) == 0:
            add_special_tokens = True
        return self.model.encode(s, add_special_tokens=add_special_tokens)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): the offset into ``t`` from which decoding starts,
                used for incremental decoding. Defaults to None, which means
                the whole sequence is decoded.
        Returns:
            str: the decoded text
        """
        skip_special_tokens = True
        t = t[offset:]
        out_string = self.model.decode(t,
                                       skip_special_tokens=skip_special_tokens)
        if offset:
            out_string = self._maybe_add_prefix_space(t, out_string)
        return out_string

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (Union[str, Sequence[str]]): a prompt or a list of prompts
        Returns:
            BatchEncoding: the encoding, whose ``input_ids`` field holds the
                token ids
        """
        add_special_tokens = False
        return self.model(s, add_special_tokens=add_special_tokens)


class Tokenizer:
    """Tokenize prompts or de-tokenize tokens into texts.

    Args:
        model_file (str): the path of the tokenizer model, or the model
            directory that contains it
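
    Examples:
        >>> # a minimal usage sketch; the path below is hypothetical
        >>> tokenizer = Tokenizer('./llama/tokenizer.model')
        >>> token_ids = tokenizer.encode('hello, world')
        >>> text = tokenizer.decode(token_ids)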
    """

    def __init__(self, model_file: str):
        if model_file.endswith('.model'):
            model_folder = osp.split(model_file)[0]
        else:
            model_folder = model_file
            model_file = osp.join(model_folder, 'tokenizer.model')
        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')

        model_file_exists = osp.exists(model_file)
        config_exists = osp.exists(tokenizer_config_file)
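        # prefer the huggingface tokenizer when a tokenizer_config.json is
        # present or when no sentencepiece model file can be found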
        use_hf_model = config_exists or not model_file_exists

        if not use_hf_model:
            self.model = SentencePieceTokenizer(model_file)
        else:
            self.model = HuggingFaceTokenizer(model_folder)

    @property
    def vocab_size(self):
        """vocabulary size."""
        return self.model.vocab_size

    @property
    def bos_token_id(self):
        """begine of the sentence token id."""
        return self.model.bos_token_id

    @property
    def eos_token_id(self):
        """end of the sentence token id."""
        return self.model.eos_token_id

    def encode(self, s: str):
        """Tokenize a prompt.

        Args:
            s (str): a prompt
        Returns:
            list[int]: token ids
        """
        return self.model.encode(s)

    def decode(self, t: Sequence[int], offset: Optional[int] = None):
        """De-tokenize.

        Args:
            t (List[int]): a list of token ids
            offset (int): the offset into ``t`` from which decoding starts,
                used for incremental decoding. Defaults to None, which means
                the whole sequence is decoded.
        Returns:
            str: the decoded text
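
        Examples:
            >>> # incremental decoding sketch, continuing the class-level
            >>> # example above; decodes only the tokens after ``offset``
            >>> token_ids = tokenizer.encode('hello, world')
            >>> tokenizer.decode(token_ids)            # the full text
            >>> tokenizer.decode(token_ids, offset=2)  # only the new part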
        """
        return self.model.decode(t, offset)

    def __call__(self, s: Union[str, Sequence[str]]):
        """Tokenize prompts.

        Args:
            s (Union[str, Sequence[str]]): a prompt or a list of prompts
        Returns:
            the tokenization result, whose ``input_ids`` field holds the
                token ids
        """
        return self.model(s)