ModelZoo / Qwen-7B_fastllm / Commits / 597fe8a4

Commit 597fe8a4, authored Nov 08, 2023 by zhouxiang
Add response_batch and chat_batch interfaces to support batch input
Parent: 69cac0e1

Changes: 6
Showing 6 changed files with 236 additions and 45 deletions (+236, -45)
README.md                                     +7   -7
cli_demo_batch.py                             +39  -0
package/fastllm_pytools/hf_model.py           +2   -0
package/fastllm_pytools/libfastllm_tools.so   +0   -0
package/fastllm_pytools/llm.py                +186 -38
package/fastllm_pytools/torch2flm.py          +2   -0
README.md  (+7, -7)  View file @ 597fe8a4

@@ -56,7 +56,7 @@ source /opt/dtk-23.04/cuda/env.sh
 ```
 # Enter this project's directory
 cd package
-python setup install
+python setup.py install
 ```
 ## Dataset
@@ -69,8 +69,6 @@ python setup install
 [Download the original Qwen model](https://huggingface.co/Qwen/Qwen-7B-Chat/tree/main)
-## Model inference
 ### Model conversion
 ```
@@ -90,14 +88,16 @@ python3 qwen2flm.py qwen-7b-int8.bin int8 # export the int8 model, the argument is the…
 # Command-line chat demo, showing model creation and streaming dialogue
 python cli_demo.py -p qwen-7b-int8.bin
-# Simple web UI; streamlit-chat must be installed first
+# Batch inference example
+python cli_demo_batch.py -p qwen-7b-fp16.bin
+# Simple web UI; streamlit-chat must be installed first, and the streamlit port must be mapped to the external network when the container is started
 streamlit run web_demo.py qwen-7b-int8.bin
 # The benchmark program can be used for performance testing; configure and run it as described by ./benchmark -h
 # Example tests:
 ./benchmark -p qwen-7b-int8.bin -f prompts/beijing.txt -b 1
-./benchmark -p qwen-7b-int8.bin -f prompts/beijing.txt -b 1
+./benchmark -p qwen-7b-int8.bin -f prompts/beijing.txt -b 8
+./benchmark -p qwen-7b-int8.bin -f prompts/hello.txt -b 512 -l 18
 # If benchmark is not executable, add execute permission manually before running the performance test
 chmod +x benchmark
 ```
@@ -126,6 +126,6 @@ chmod +x benchmark
 https://developer.hpccube.com/codes/modelzoo/qwen-7b_fastllm
-## References
+## Reference materials
 https://github.com/QwenLM/Qwen-7B
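
For orientation, the batch demo added in this commit (`python cli_demo_batch.py -p qwen-7b-fp16.bin`) reduces to a single `response_batch` call on an exported model. A minimal sketch, assuming a `qwen-7b-fp16.bin` file exported as described above:

```
# Minimal sketch of the batch call behind cli_demo_batch.py.
# Assumes qwen-7b-fp16.bin was exported with qwen2flm.py as in this README.
from fastllm_pytools import llm

model = llm.model("qwen-7b-fp16.bin")
prompts = ["深圳有什么好玩的", "上海有什么好玩的"]
responses, historys = model.response_batch(prompts)
for prompt, response in zip(prompts, responses):
    print(prompt, "->", response)
```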
cli_demo_batch.py
0 → 100644
View file @
597fe8a4
import
argparse
from
fastllm_pytools
import
llm
import
time
def
args_parser
():
parser
=
argparse
.
ArgumentParser
(
description
=
'fastllm_chat_demo'
)
parser
.
add_argument
(
'-p'
,
'--path'
,
type
=
str
,
required
=
True
,
default
=
''
,
help
=
'模型文件的路径'
)
args
=
parser
.
parse_args
()
return
args
if
__name__
==
"__main__"
:
args
=
args_parser
()
model_path
=
args
.
path
prompts
=
[
"深圳有什么好玩的"
,
"上海有什么好玩的"
,
"晚上睡不着怎么办"
,
"南京有什么好吃的"
]
print
(
prompts
)
responses
,
historys
=
[],
[]
model
=
llm
.
model
(
model_path
)
t0
=
time
.
time
()
responses
,
historys
=
model
.
response_batch
(
prompts
)
t1
=
time
.
time
()
token_output_count
=
0
word_len
=
0
for
i
,
res
in
enumerate
(
responses
):
tokens
=
model
.
tokenizer_encode_string
(
res
)
token_output_count
+=
len
(
tokens
)
word_len
+=
len
(
res
)
print
(
"batch index: "
,
i
)
print
(
res
)
print
(
""
)
print
(
"
\n
token/s: {:.2f}, character/s: {:.2f}"
.
format
(
token_output_count
/
(
t1
-
t0
),
word_len
/
(
t1
-
t0
)))
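
The demo above calls `response_batch` with its default settings; per the `llm.py` changes below, the same call also accepts per-prompt histories and the usual sampling arguments. A hedged sketch (the concrete values are illustrative assumptions, not taken from the commit):

```
# Sketch only: response_batch with explicit histories and sampling settings.
# Keyword names follow the new signature in package/fastllm_pytools/llm.py;
# the model file name and the values shown are illustrative assumptions.
from fastllm_pytools import llm

model = llm.model("qwen-7b-fp16.bin")
prompts = ["上海有什么好玩的", "南京有什么好吃的"]
historys = [[] for _ in prompts]        # one (query, response) history per prompt
responses, historys = model.response_batch(
    prompts,
    historys=historys,
    max_length=512,                     # default in the new signature is 1024
    do_sample=True,
    top_p=0.8,
    top_k=1,
    temperature=1.0,
    repeat_penalty=1.0)
```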
package/fastllm_pytools/hf_model.py  (+2, -0)  View file @ 597fe8a4

@@ -26,6 +26,8 @@ def create(model,
         exit(0);
     # 0.1 model info
+    if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+        model.config.model_type = "chatglm3"
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
...
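
These two lines make ChatGLM checkpoints that still report `model_type == "chatglm"` under transformers 4.30.2 be treated as chatglm3 when a HuggingFace model is converted in memory. A rough sketch of that conversion path, assuming fastllm's usual `llm.from_hf` entry point (which goes through `hf_model.create`); the checkpoint name and dtype are illustrative assumptions:

```
# Sketch: in-memory conversion of a HuggingFace model with fastllm_pytools.
# The checkpoint name and dtype are illustrative assumptions.
from transformers import AutoModel, AutoTokenizer
from fastllm_pytools import llm

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
hf_model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).float()
model = llm.from_hf(hf_model, tokenizer, dtype="float16")   # hf_model.create runs here
```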
package/fastllm_pytools/libfastllm_tools.so  (binary)  View file @ 597fe8a4
No preview for this file type
package/fastllm_pytools/llm.py  (+186, -38)  View file @ 597fe8a4

@@ -3,6 +3,7 @@ import math
 import os;
 import threading
 from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
+from copy import deepcopy
 import platform
 if platform.system() == 'Windows':
@@ -53,6 +54,19 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
 fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
+fastllm_lib.get_llm_model_type.argtype = [ctypes.c_int]
+fastllm_lib.get_llm_model_type.restype = ctypes.c_char_p
+
+fastllm_lib.response_batch_str_llm_model.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,
+                                                     ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+fastllm_lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
+
+fastllm_lib.response_batch_tokens_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int),
+                                                        ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_bool,
+                                                        ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float,
+                                                        ctypes.c_bool]
+fastllm_lib.response_batch_tokens_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
 def set_cpu_threads(threads: int):
     fastllm_lib.set_cpu_threads(threads);
@@ -120,6 +134,9 @@ class model:
         # Not made into an automatic cache, to avoid locking the cache dict in multi-threaded calls and to leave room for different usage scenarios
         self.tokenizer_decode_token_cache = None
+        self.model_type = fastllm_lib.get_llm_model_type(self.model).decode()
+        # print("model_type:", self.model_type)

     def get_prompt(self,
                    query: str,
                    history: List[Tuple[str, str]] = None) -> str:
@@ -206,8 +223,8 @@ class model:
         prompt = query if self.direct_query else self.get_prompt(query, history);
         if (tokenizer == None):
             handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                                ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
                                                                ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
         else:
             input = tokenizer.encode(prompt);
             handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
@@ -299,49 +316,180 @@ class model:
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
        if self.model_type != "chatglm3":
            if (not(history)):
                history = [];
            prompt = query if self.direct_query else self.get_prompt(query, history);
            input = tokenizer.encode(prompt);
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False);
            result = [];
            while True:
                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
                if (cur == -1):
                    break;
                result.append(cur);
            response = tokenizer.decode(result);
            history = history + [(query, response)];
            return response, history;
        else:
            if history is None:
                history = []
            role = "user"
            input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
            history.append({"role": role, "content": query})
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False);
            tokens = [];
            while True:
                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
                if (cur == -1):
                    break;
                tokens.append(cur);
            response = tokenizer.decode(tokens);
            if response and response[-1] != "�":
                response, new_history = self.process_chatglm3_response(response, history)
                return response, new_history

    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                    return_past_key_values = False, **kwargs) -> str:
        if self.model_type != "chatglm3":
            if (not(history)):
                history = [];
            prompt = query if self.direct_query else self.get_prompt(query, history);
            input = tokenizer.encode(prompt);
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False);
            tokens = [];
            while True:
                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
                if (cur == -1):
                    break;
                tokens.append(cur);
                response = tokenizer.decode(tokens);
                new_history = history + [(query, response)];
                if return_past_key_values:
                    yield response, new_history, None;
                else:
                    yield response, new_history;
        else:
            if history is None:
                history = []
            role = "user"
            input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
            history.append({"role": role, "content": query})
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False);
            tokens = [];
            while True:
                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
                if (cur == -1):
                    break;
                tokens.append(cur);
                response = tokenizer.decode(tokens);
                if response and response[-1] != "�":
                    response, new_history = self.process_chatglm3_response(response, history)
                    if return_past_key_values:
                        yield response, new_history, past_key_values
                    else:
                        yield response, new_history

    def set_adapter(self, name: str):
        fastllm_lib.set_adapter(self.model, str(name).encode())

    def disable_adapter(self):
        fastllm_lib.disable_adapter(self.model)

    def process_chatglm3_response(self, output, history):
        content = ""
        history = deepcopy(history)
        for response in output.split("<|assistant|>"):
            metadata, content = response.split("\n", maxsplit=1)
            if not metadata.strip():
                content = content.strip()
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                content = content.replace("[[训练时间]]", "2023年")
            else:
                history.append({"role": "assistant", "metadata": metadata, "content": content})
                if history[0]["role"] == "system" and "tools" in history[0]:
                    content = "\n".join(content.split("\n")[1:-1])
                    def tool_call(**kwargs):
                        return kwargs
                    parameters = eval(content)
                    content = {"name": metadata.strip(), "parameters": parameters}
                else:
                    content = {"name": metadata.strip(), "content": content}
        return content, history

    def build_chatglm3_input(self, tokenizer, query, history = None, role = "user"):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(tokenizer.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(tokenizer.build_single_message(role, "", query))
        input_ids.extend([tokenizer.get_command("<|assistant|>")])
        return input_ids

    def response_batch(self,
                       querys: List[str],
                       historys: List[List[Tuple[str, str]]] = None,
                       max_length: int = 1024,
                       do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                       **kwargs) -> List[str]:
        query_size = len(querys)
        if (not(historys)):
            historys = [[] for _ in range(query_size)]
        inputs = (ctypes.c_char_p * query_size)()
        for i, query in enumerate(querys):
            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
            inputs[i] = ctypes.c_char_p(prompt.encode())
        outputs = fastllm_lib.response_batch_str_llm_model(self.model, inputs, query_size,
                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                           False)
        responses = []
        for i in range(query_size):
            response = ctypes.string_at(outputs[i]).decode()
            responses.append(response)
            historys[i] = historys[i] + [(querys[i], response)]
        return responses, historys

    def chat_batch(self, tokenizer,
                   querys: List[str],
                   historys: List[List[Tuple[str, str]]] = None,
                   max_length: int = 1024,
                   do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                   **kwargs):
        query_size = len(querys)
        if (not(historys)):
            historys = [[] for _ in range(query_size)]
        inputs = []
        inputs_len = []
        for i, query in enumerate(querys):
            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
            input = tokenizer.encode(prompt);
            inputs.extend(input)
            inputs_len.append(len(input))
        outputs = fastllm_lib.response_batch_tokens_llm_model(self.model, query_size,
                                                              (ctypes.c_int * len(inputs_len))(*inputs_len),
                                                              (ctypes.c_int * len(inputs))(*inputs),
                                                              max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                              False)
        responses = []
        for i in range(query_size):
            response = ctypes.string_at(outputs[i]).decode()
            responses.append(response)
            historys[i] = historys[i] + [(querys[i], response)]
        return responses, historys
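
A hedged sketch of how the new token-level `chat_batch` interface might be used, complementing the string-level `response_batch` demonstrated in `cli_demo_batch.py`. The model path and tokenizer checkpoint are assumptions; the argument names follow the signature added above:

```
# Sketch: batched chat through the new chat_batch interface.
# Model path and tokenizer checkpoint are illustrative assumptions.
from transformers import AutoTokenizer
from fastllm_pytools import llm

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = llm.model("qwen-7b-int8.bin")

querys = ["深圳有什么好玩的", "南京有什么好吃的"]
responses, historys = model.chat_batch(tokenizer, querys, historys=None, max_length=1024)
for query, response in zip(querys, responses):
    print(query, "->", response)
```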
package/fastllm_pytools/torch2flm.py  (+2, -0)  View file @ 597fe8a4

@@ -80,6 +80,8 @@ def tofile(exportPath,
     fo.write(struct.pack('i', 2))
     # 0.1 model info
+    if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+        model.config.model_type = "chatglm3"
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
...
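
The same chatglm/chatglm3 detection is applied on the file-export path through `torch2flm.tofile`, which is what the `qwen2flm.py` conversion step in the README goes through. A rough usage sketch; the checkpoint name, output path, and the `dtype` keyword are assumptions based on how this helper is normally invoked, not part of this commit:

```
# Sketch: exporting a HuggingFace model to fastllm's flm format via torch2flm.
# Checkpoint name, output path and dtype value are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastllm_pytools import torch2flm

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True).float()
torch2flm.tofile("qwen-7b-int8.bin", model, tokenizer, dtype="int8")
```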