llm.py 25.1 KB
Newer Older
zhouxiang's avatar
zhouxiang committed
1
2
3
4
5
import ctypes
import json
import math
import os
import threading
from copy import deepcopy
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
7
8
9
10
11
12
13
14
15
16

import platform

# Load the native fastllm shared library that sits next to this file; only the
# file name differs between Windows and every other platform.
if platform.system() == 'Windows':
    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
else:
    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))

# ---------------------------------------------------------------------------
# ctypes signatures for the native entry points used by this module.
#
# ctypes only honours the attribute spelled ``argtypes``. Assigning to
# ``argtype`` just creates an unused attribute and leaves the call unchecked,
# so the declarations below use ``argtypes`` wherever the calls made in this
# file agree with the declared list.
#
# Functions returning text use ``POINTER(c_char)`` instead of ``c_char_p``:
# ``c_char_p`` results are converted to ``bytes`` automatically, which would
# drop the raw pointer that has to be handed back to ``freeChars``.
# ---------------------------------------------------------------------------

fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int

fastllm_lib.token_decode.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.token_decode.restype = ctypes.c_int

fastllm_lib.token_encode_string.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.token_encode_string.restype = ctypes.c_int

fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
                                                  ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                  ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int

fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int

fastllm_lib.fetch_response_logits_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_float)]
fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int

fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
                                               ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                               ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)

fastllm_lib.launch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
                                                      ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                      ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int

fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)

fastllm_lib.make_history_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.POINTER(ctypes.c_char)

fastllm_lib.make_input_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.POINTER(ctypes.c_char)

# NOTE(review): this declaration is still spelled ``argtype`` and is therefore
# inert. No call to this function is visible in this file, so the list could
# not be checked against real call sites or the native prototype; confirm the
# parameter order before renaming it to ``argtypes``.
fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]

fastllm_lib.set_device_map.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]

fastllm_lib.get_llm_model_type.argtypes = [ctypes.c_int]
fastllm_lib.get_llm_model_type.restype = ctypes.POINTER(ctypes.c_char)

fastllm_lib.response_batch_str_llm_model.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,
                                                     ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)

fastllm_lib.response_batch_tokens_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int),
                                                        ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                        ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_batch_tokens_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)

fastllm_lib.freeChars.argtypes = [ctypes.POINTER(ctypes.c_char)]

# Callers in this file pass an extra element count after the pointer. ctypes
# accepts surplus arguments for cdecl functions, so only the pointer is typed.
fastllm_lib.freeCharArray.argtypes = [ctypes.POINTER(ctypes.c_char_p)]

80
def set_cpu_threads(threads: int):
    """Forward ``threads`` to the native ``set_cpu_threads`` entry point."""
    fastllm_lib.set_cpu_threads(threads)
82
83

def get_cpu_threads() -> int:
    """Return whatever the native ``get_cpu_threads`` entry point reports."""
    thread_count = fastllm_lib.get_cpu_threads()
    return thread_count
85
86

def print_ins_info():
    """Call the native ``print_cpu_ins`` entry point; nothing is returned."""
    fastllm_lib.print_cpu_ins()
88
89

def set_cpu_kvcache(cpu_kvcache):
    """Pass ``cpu_kvcache`` (coerced to a C bool) to ``set_kvcache_in_cpu``."""
    flag = ctypes.c_bool(cpu_kvcache)
    fastllm_lib.set_kvcache_in_cpu(flag)
91
92

def get_cpu_kvcache():
    """Return the value reported by the native ``get_kvcache_in_cpu`` call."""
    current = fastllm_lib.get_kvcache_in_cpu()
    return current
94
95

def set_cpu_low_mem(low_mem):
    """Pass ``low_mem`` (coerced to a C bool) to the native ``set_cpu_low_mem``."""
    flag = ctypes.c_bool(low_mem)
    fastllm_lib.set_cpu_low_mem(flag)
97
98

def get_cpu_low_mem():
    """Return the value reported by the native ``get_cpu_low_mem`` call."""
    current = fastllm_lib.get_cpu_low_mem()
    return current
100
101

def set_device_map(device_map):
    """Hand a device map to the native ``set_device_map`` entry point.

    Args:
        device_map: one of
            * ``str``  - a single device name, given the value 1;
            * ``list`` - device names (each passed through ``str``), all
              given the value 1;
            * ``dict`` - device name -> value (each value passed through
              ``int``).
            Any other type prints ``set_device_map error.`` and returns
            without calling the native library.

    The names are sent as one concatenated byte string together with an array
    holding the length of each name, followed by the array of values.
    """
    if isinstance(device_map, str):
        names = [device_map]
        weights = [1]
    elif isinstance(device_map, list):
        names = [str(entry) for entry in device_map]
        weights = [1] * len(device_map)
    elif isinstance(device_map, dict):
        names = [str(key) for key in device_map.keys()]
        weights = [int(device_map[key]) for key in device_map.keys()]
    else:
        print("set_device_map error.")
        return

    name_lengths = [len(name) for name in names]
    count = len(name_lengths)
    fastllm_lib.set_device_map(count,
                               (ctypes.c_int * count)(*name_lengths),
                               ''.join(names).encode(),
                               (ctypes.c_int * len(weights))(*weights))
122
123
124
def from_hf(model,
            tokenizer = None,
            dtype = "float16"):
    """Build a model via ``fastllm_pytools.hf_model.create``.

    Args:
        model: forwarded unchanged as the first argument of ``create``.
        tokenizer: forwarded unchanged as the second argument.
        dtype: forwarded as the ``dtype`` keyword (default ``"float16"``).

    Returns:
        Whatever ``hf_model.create`` returns.
    """
    # Function-scope import: hf_model is only loaded when this is called.
    from fastllm_pytools import hf_model as _hf_model
    return _hf_model.create(model, tokenizer, dtype=dtype)
127
128
129
130
131

class model:
    def __init__(self, path: str,
                 id: int = -99999):
        """Wrap a native model handle.

        Args:
            path: passed (UTF-8 encoded) to ``create_llm_model`` when no
                ``id`` is supplied.
            id: an existing native handle to adopt; the default ``-99999``
                means "create a new model from ``path``".
        """
        no_handle_given = (id == -99999)
        self.model = fastllm_lib.create_llm_model(path.encode()) if no_handle_given else id
        self.direct_query = False

        # Thread-local pool of output buffers, kept so the tokenizer helpers
        # do not have to allocate and release a buffer on every call.
        # NOTE: attributes assigned to a threading.local here exist only for
        # the thread that runs this constructor.
        tls = threading.local()
        tls.tokenizer_encode_string__output_buffer = None
        tls.tokenizer_decode_token__output_buffer = None
        self.thread_local_obj = tls

        # Static cache of tokenizer_decode_token results, built on demand by
        # build_tokenizer_decode_token_cache(). The vocabulary is finite and
        # fairly small, so caching it cuts down on native calls. It is not
        # filled automatically so that no lock is needed around the dict under
        # multi-threaded use, and so that callers can choose per scenario.
        self.tokenizer_decode_token_cache = None

        # The type name comes back as a native string that must be released
        # with freeChars once it has been copied into a Python str.
        type_ptr = fastllm_lib.get_llm_model_type(self.model)
        self.model_type = ctypes.string_at(type_ptr).decode()
        fastllm_lib.freeChars(type_ptr)
150
151
        # print("model_type:", self.model_type)

152
153
154
155
    def get_prompt(self,
                   query: str,
                   history: List[Tuple[str, str]] = None) -> str:
        """Build the full prompt text for ``query`` given earlier rounds.

        Each ``(query, response)`` pair in ``history`` is folded into the
        running prompt by ``make_history_llm_model``; the new ``query`` is then
        appended by ``make_input_llm_model``. A falsy ``history`` is treated
        as no history.

        Returns:
            The prompt string produced by the native library.
        """
        rounds = history if history else []
        text = ""
        for round_idx, (past_query, past_reply) in enumerate(rounds):
            ptr = fastllm_lib.make_history_llm_model(self.model, text.encode(), round_idx,
                                                     past_query.encode(), past_reply.encode())
            # Copy the native string out before handing the pointer back.
            text = ctypes.string_at(ptr).decode()
            fastllm_lib.freeChars(ptr)

        ptr = fastllm_lib.make_input_llm_model(self.model, text.encode(), len(rounds), query.encode())
        text = ctypes.string_at(ptr).decode()
        fastllm_lib.freeChars(ptr)
        return text
167
168

    def save(self, path: str):
        """Call the native ``save_llm_model`` with this handle and ``path`` (UTF-8 encoded)."""
        target = path.encode()
        fastllm_lib.save_llm_model(self.model, target)
170
171

    def eval(self):
zhouxiang's avatar
zhouxiang committed
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
        pass;

    def build_tokenizer_decode_token_cache(self):
        if self.tokenizer_decode_token_cache is not None:
            return

        cache_dict = dict()
        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
        for token_id in range(vocab_size):
            cache_dict[token_id] = self.tokenizer_decode_token(token_id)

        self.tokenizer_decode_token_cache = cache_dict

    def tokenizer_encode_string(self, content: str) -> List[int]:
        """Encode ``content`` into token ids with the native tokenizer.

        A per-thread ``c_int`` buffer is reused between calls. The native
        call's return value is used as the number of ids; when it exceeds the
        buffer length the call is repeated with a larger buffer.

        Args:
            content: text to encode (sent to the library UTF-8 encoded).

        Returns:
            The token ids as a list of ints.
        """
        output_buffer_init_len = 1024
        encoded = content.encode()

        # Attributes set on a threading.local in __init__ exist only for the
        # constructing thread; getattr with a default lets every other thread
        # lazily create its own buffer instead of raising AttributeError.
        tls = self.thread_local_obj
        buffer = getattr(tls, "tokenizer_encode_string__output_buffer", None)
        if buffer is None:
            buffer = (ctypes.c_int * output_buffer_init_len)()
            tls.tokenizer_encode_string__output_buffer = buffer

        buffer_len = len(buffer)
        result_len = fastllm_lib.token_encode_string(self.model, encoded, buffer_len, buffer)
        if result_len > buffer_len:
            if result_len > 10240:
                # Input is very long: use a one-off buffer rather than keeping
                # a huge one alive in the thread-local pool.
                temp_buffer = (ctypes.c_int * result_len)()
                fastllm_lib.token_encode_string(self.model, encoded, result_len, temp_buffer)
                return list(temp_buffer)

            # Grow the pooled buffer to the next multiple of 1024 and retry.
            new_buffer_len = math.ceil(result_len / 1024.0) * 1024
            buffer = (ctypes.c_int * new_buffer_len)()
            tls.tokenizer_encode_string__output_buffer = buffer
            result_len = fastllm_lib.token_encode_string(self.model, encoded, new_buffer_len, buffer)

        return [buffer[i] for i in range(result_len)]

    def tokenizer_decode_token(self, token_id: int) -> bytes:
        if self.tokenizer_decode_token_cache is not None:
            cache_result = self.tokenizer_decode_token_cache.get(token_id)
            if cache_result is not None:
                return cache_result

        output_buffer_init_len = 256
        if self.thread_local_obj.tokenizer_decode_token__output_buffer is None:
            self.thread_local_obj.tokenizer_decode_token__output_buffer = ctypes.create_string_buffer(output_buffer_init_len)

        buffer = self.thread_local_obj.tokenizer_decode_token__output_buffer
        ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
        if ret > 0:
            # buffer长度不够,扩展buffer大小
            new_buffer_len = round(math.ceil(ret / 16.0)) * 16
            buffer = ctypes.create_string_buffer(new_buffer_len)
            self.thread_local_obj.tokenizer_decode_token__output_buffer = buffer
            ret = fastllm_lib.token_decode(self.model, token_id, len(buffer), buffer)
            assert ret == 0

        buffer_bytes = buffer.raw
        result_len = len(buffer_bytes)
        for i in range(len(buffer_bytes)):
            if buffer_bytes[i] == 0:
                result_len = i
                break
        return buffer_bytes[:result_len]
235
236
237
238
239

    def response_logits(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        tokenizer = None) -> List[float]:
        """Run one generation step and return the first batch of logits.

        The prompt is ``query`` itself when ``direct_query`` is set, otherwise
        the result of ``get_prompt``. Without a ``tokenizer`` the prompt is
        sent as text; with one, it is encoded in Python and sent as token ids.

        Args:
            query: user input.
            history: earlier ``(query, response)`` rounds for ``get_prompt``.
            tokenizer: optional object with an ``encode(str)`` method.

        Returns:
            The first ``vocab_size`` floats written by the first
            ``fetch_response_logits_llm_model`` call.
        """
        prompt = query if self.direct_query else self.get_prompt(query, history)
        # NOTE(review): the final flag is True only in this method; presumably
        # it asks the backend to expose logits - confirm against the library.
        if tokenizer is None:
            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                               ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
        else:
            input_ids = tokenizer.encode(prompt)
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
                                                           1, False, 1, 1, 1, 1, True)

        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
        # The buffer holds 4 * vocab_size floats; its first vocab_size slots
        # start as 0..vocab_size-1, exactly as in the original code.
        logits = list(range(vocab_size))
        array = (ctypes.c_float * (vocab_size * 4))(*logits)
        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        out = list(array)[:vocab_size]
        # Keep fetching until the library reports -1 so the request is drained.
        while ret != -1:
            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
        return out
257
258
259
260

    def response(self,
                 query: str,
                 history: List[Tuple[str, str]] = None,
261
                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1) -> str:
zhouxiang's avatar
zhouxiang committed
262
        ret = "";
263
264
265
266
267
268
269
270
        for i in self.stream_response(query = query,
                                      history = history,
                                      max_length = max_length,
                                      do_sample = do_sample,
                                      top_p = top_p, top_k = top_k,
                                      temperature = temperature,
                                      repeat_penalty = repeat_penalty,
                                      one_by_one = True):
zhouxiang's avatar
zhouxiang committed
271
272
            ret += i;
        return ret;
273
274
275
276

    def stream_response(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1,
                        one_by_one = True):
        """Generate a reply to ``query`` and yield it incrementally as text.

        Args:
            query: user input; used verbatim as the prompt when
                ``direct_query`` is set, otherwise passed through
                ``get_prompt`` together with ``history``.
            history: earlier ``(query, response)`` rounds.
            max_length, do_sample, top_p, top_k, temperature, repeat_penalty:
                forwarded to ``launch_response_str_llm_model``.
            one_by_one: when true, yield each newly decoded piece; otherwise
                yield the accumulated text so far.

        Yields:
            ``str`` pieces (or the running total) until the library sends the
            ``<flmeos>`` marker.
        """
        prompt = query if self.direct_query else self.get_prompt(query, history)
        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                           ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
        res = ""
        pending = b''
        fail_cnt = 0
        while True:
            chunk_ptr = fastllm_lib.fetch_response_str_llm_model(self.model, handle)
            pending += ctypes.string_at(chunk_ptr)
            fastllm_lib.freeChars(chunk_ptr)

            # A chunk may end in the middle of a multi-byte character. Keep
            # the bytes and retry with the next chunk appended, giving up
            # after 20 consecutive failures. Only decode errors are handled
            # here so that unrelated exceptions are not swallowed.
            try:
                cur = pending.decode()
            except UnicodeDecodeError:
                fail_cnt += 1
                if fail_cnt == 20:
                    break
                continue
            pending = b''
            fail_cnt = 0

            if cur == "<flmeos>":
                break
            if one_by_one:
                yield cur
            else:
                res += cur
                yield res

    def stream_response_raw(self,
                            input_tokens: List[int],
                            max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1,
                            one_by_one = True
                            ):
        """Generate from already-tokenized input and yield raw bytes.

        Args:
            input_tokens: prompt token ids.
            max_length, do_sample, top_p, top_k, temperature, repeat_penalty:
                forwarded to ``launch_response_llm_model``.
            one_by_one: when true, yield the bytes of each new token;
                otherwise yield all bytes accumulated so far.

        Yields:
            ``bytes`` for every token fetched until the library returns -1.
        """
        prompt_len = len(input_tokens)
        handle = fastllm_lib.launch_response_llm_model(self.model, prompt_len,
                                                       (ctypes.c_int * prompt_len)(*input_tokens),
                                                       ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
                                                       ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))

        # A single character may need several tokens before it is complete, so
        # only bytes are returned and decoding is left to the caller. This also
        # makes it easy to count output tokens and to decide how incomplete
        # UTF-8 sequences should be handled.
        accumulated = b''
        fetch_next = lambda: fastllm_lib.fetch_response_llm_model(self.model, handle)
        for token in iter(fetch_next, -1):
            piece = self.tokenizer_decode_token(token)
            if one_by_one:
                yield piece
            else:
                accumulated += piece
                yield accumulated
336
337

    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1, **kwargs):
        """Generate one complete reply using an external tokenizer.

        Args:
            tokenizer: object providing ``encode``/``decode`` (and, for
                chatglm3 models, whatever ``build_chatglm3_input`` needs).
            query: user input.
            history: earlier rounds. For non-chatglm3 models a list of
                ``(query, response)`` tuples; for chatglm3 a list of message
                dicts, which is appended to in place with the user message.
            max_length, do_sample, top_p, top_k, temperature, repeat_penalty:
                forwarded to ``launch_response_llm_model``.

        Returns:
            ``(response, history)``. This is returned on every path; earlier
            the chatglm3 branch returned ``None`` when the decoded text was
            empty or ended in U+FFFD, which broke tuple unpacking in callers.
        """
        if self.model_type != "chatglm3":
            if not history:
                history = []
            prompt = query if self.direct_query else self.get_prompt(query, history)
            input_ids = tokenizer.encode(prompt)
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False)

            result = []
            while True:
                cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
                if cur == -1:
                    break
                result.append(cur)
            response = tokenizer.decode(result)
            history = history + [(query, response)]
            return response, history

        if history is None:
            history = []
        role = "user"
        input_ids = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
        history.append({"role": role, "content": query})

        handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        tokens = []
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if cur == -1:
                break
            tokens.append(cur)
        response = tokenizer.decode(tokens)
        if response and response[-1] != "�":
            return self.process_chatglm3_response(response, history)
        # Empty output, or output ending in an undecodable fragment: it cannot
        # be split into metadata/content, so hand it back unprocessed together
        # with the history (which already holds the user message).
        return response, history
377
378

    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1,
                    return_past_key_values = False, **kwargs) -> str:
        """Generate a reply with an external tokenizer, yielding partial results.

        After every fetched token the full token list is decoded again and
        the text so far is yielded with an updated history.

        Yields:
            ``(response, new_history)``, or a 3-tuple with a trailing element
            when ``return_past_key_values`` is true: ``None`` for
            non-chatglm3 models, the ``past_key_values`` argument for
            chatglm3. (The ``-> str`` annotation is kept for interface
            compatibility; the function is a generator of tuples.)

        For chatglm3 models ``history`` is a list of message dicts and is
        appended to in place with the user message; partial texts that are
        empty or end in U+FFFD are skipped.
        """
        if self.model_type == "chatglm3":
            if history is None:
                history = []
            role = "user"
            input_ids = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
            history.append({"role": role, "content": query})

            handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False)
            generated = []
            while True:
                token = fastllm_lib.fetch_response_llm_model(self.model, handle)
                if token == -1:
                    break
                generated.append(token)
                response = tokenizer.decode(generated)
                # Skip states where nothing decodable has been produced yet.
                if not response or response[-1] == "�":
                    continue
                response, new_history = self.process_chatglm3_response(response, history)
                if return_past_key_values:
                    yield response, new_history, past_key_values
                else:
                    yield response, new_history
            return

        if not history:
            history = []
        prompt = query if self.direct_query else self.get_prompt(query, history)
        input_ids = tokenizer.encode(prompt)
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False)
        generated = []
        while True:
            token = fastllm_lib.fetch_response_llm_model(self.model, handle)
            if token == -1:
                break
            generated.append(token)
            response = tokenizer.decode(generated)
            new_history = history + [(query, response)]
            if return_past_key_values:
                yield response, new_history, None
            else:
                yield response, new_history

425
426
427

    def set_adapter(self, name: str):
        """Call the native ``set_adapter`` with this handle and ``str(name)`` UTF-8 encoded."""
        adapter_name = str(name).encode()
        fastllm_lib.set_adapter(self.model, adapter_name)
428

429
430
    def disable_adapter(self):
        """Call the native ``disable_adapter`` for this model handle."""
        handle = self.model
        fastllm_lib.disable_adapter(handle)
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467

    def process_chatglm3_response(self, output, history):
        content = ""
        history = deepcopy(history)
        for response in output.split("<|assistant|>"):
            metadata, content = response.split("\n", maxsplit=1)
            if not metadata.strip():
                content = content.strip()
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                content = content.replace("[[训练时间]]", "2023年")
            else:
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                if history[0]["role"] == "system" and "tools" in history[0]:
                    content = "\n".join(content.split("\n")[1:-1])
                    def tool_call(**kwargs):
                        return kwargs
                    parameters = eval(content)
                    content = {"name": metadata.strip(), "parameters": parameters}
                else:
                    content = {"name": metadata.strip(), "content": content}
        return content, history

    def build_chatglm3_input(self, tokenizer, query, history=None, role="user"):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(tokenizer.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(tokenizer.build_single_message(role, "", query))
        input_ids.extend([tokenizer.get_command("<|assistant|>")])
        return input_ids

    def response_batch(self, querys: List[str],
                       historys: List[List[Tuple[str, str]]] = None,
                       max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1,
                       **kwargs) -> Tuple[List[str], List[List[Tuple[str, str]]]]:
        """Generate replies for several queries in one native call.

        Args:
            querys: the user inputs.
            historys: one history per query; a falsy value means "no history
                for any query". Its entries are replaced in place with the
                extended histories.
            max_length, do_sample, top_p, top_k, temperature, repeat_penalty:
                forwarded to ``response_batch_str_llm_model``.

        Returns:
            ``(responses, historys)`` - the reply strings and, per query, the
            history extended with ``(query, response)``.
        """
        query_size = len(querys)
        if not historys:
            historys = [[] for _ in range(query_size)]
        inputs = (ctypes.c_char_p * query_size)()
        for i, query in enumerate(querys):
            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
            inputs[i] = ctypes.c_char_p(prompt.encode())

        outputs = fastllm_lib.response_batch_str_llm_model(self.model, inputs, query_size,
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False)

        responses = []
        try:
            for i in range(query_size):
                response = ctypes.string_at(outputs[i]).decode()
                responses.append(response)
                historys[i] = historys[i] + [(querys[i], response)]
        finally:
            # Release the native array even if decoding a reply fails.
            fastllm_lib.freeCharArray(outputs, query_size)
        return responses, historys

    def chat_batch(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None, max_length: int = 1024,
                   do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.1, **kwargs):
        """Generate replies for several queries, tokenizing in Python.

        Args:
            tokenizer: object providing ``encode(str)``.
            querys: the user inputs.
            historys: one history per query; a falsy value means "no history
                for any query". Its entries are replaced in place with the
                extended histories.
            max_length, do_sample, top_p, top_k, temperature, repeat_penalty:
                forwarded to ``response_batch_tokens_llm_model``.

        Returns:
            ``(responses, historys)`` - the reply strings and, per query, the
            history extended with ``(query, response)``.
        """
        query_size = len(querys)
        if not historys:
            historys = [[] for _ in range(query_size)]

        # Everything below runs whether or not histories were supplied.
        # (It used to be nested under the ``if`` above, so a call that passed
        # histories did nothing and returned None.)
        inputs = []
        inputs_len = []
        for i, query in enumerate(querys):
            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
            input_ids = tokenizer.encode(prompt)
            # All prompts go into one flat id array plus a per-prompt length.
            inputs.extend(input_ids)
            inputs_len.append(len(input_ids))

        outputs = fastllm_lib.response_batch_tokens_llm_model(self.model, query_size,
                                                              (ctypes.c_int * len(inputs_len))(*inputs_len),
                                                              (ctypes.c_int * len(inputs))(*inputs),
                                                              max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                              False)

        responses = []
        try:
            for i in range(query_size):
                response = ctypes.string_at(outputs[i]).decode()
                responses.append(response)
                historys[i] = historys[i] + [(querys[i], response)]
        finally:
            # Release the native array even if decoding a reply fails.
            fastllm_lib.freeCharArray(outputs, query_size)
        return responses, historys