ModelZoo / Baichuan-13B_fastllm / Commits

Commit 704137ae, authored Jul 11, 2024 by chenzk

Merge http://developer.hpccube.com/codes/chenzk/baichuan-13b_fastllm

Parents: ecaf86cf, 268abe98
Showing 9 changed files with 673 additions and 67 deletions (+673, -67):

- README.md (+13, -4)
- api_server_demo/fastllm-openai.py (+225, -0)
- api_server_demo/openai-client.py (+76, -0)
- api_server_demo/requirements.txt (+5, -0)
- benchmark/benchmark (+0, -0)
- package/fastllm_pytools/hf_model.py (+9, -0)
- package/fastllm_pytools/libfastllm_tools.so (+0, -0)
- package/fastllm_pytools/llm.py (+336, -63)
- package/fastllm_pytools/torch2flm.py (+9, -0)
README.md (view file @ 704137ae)

````diff
-# Baichuan-13B_fastllm
+# Baichuan-13B
 ## Paper
@@ -33,7 +33,7 @@ The overall Baichuan model is based on the standard Transformer architecture and, like LLaMA, uses
 The inference docker image can be pulled from the SourceFind (光源) registry as follows:
 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
 ```
 ### Start the container
@@ -43,7 +43,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
 ```
 # <container_name>: custom container name
 # <project_path>: path to this project
-docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
+docker run -it --name=<container_name> -v <project_path>:/work -w /work --privileged -v /opt/hyhal:/opt/hyhal --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --ipc=host --network host --shm-size=16G --group-add video image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38 /bin/bash
 ```
 ### Load the environment
@@ -51,7 +51,7 @@ docker run -it --name=<container_name> -v <project_path>:/work -w /work --device
 After entering the container, run the following command to load the runtime environment variables:
 ```
-source /opt/dtk-23.04/cuda/env.sh
+source /opt/dtk/cuda/env.sh
 ```
 ### Installation
@@ -99,6 +99,15 @@ python cli_demo.py -p baichuan-13b-fp16.bin
 # Simple web UI; install streamlit-chat first, and map the streamlit port to the outside network when starting the container
 streamlit run web_demo.py baichuan-13b-fp16.bin
+
+# Example api_server implemented against the OpenAI interface.
+# First enter api_server_demo and install the required dependencies:
+cd api_server_demo
+pip install -r requirements.txt
+# Run the api_server service; use -p to point at the converted model file (client code can follow openai-client.py):
+python fastllm-openai.py -p ../baichuan-13b-fp16.bin
+# To test the concurrency of the service, use openai-client.py: adjust its prompt and concurrencys variables, then run:
+python openai-client.py
 ```
 ### Inference performance test
````
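For reference, a minimal client call against this server could look like the sketch below. It assumes the defaults hard-coded in fastllm-openai.py (host 127.0.0.1, port 8100, model name "baichuan2-fastllm") and the openai==0.28 client pinned in api_server_demo/requirements.txt; adapt it to your own deployment.

```python
import openai

# Point the 0.28-style client at the local fastllm api_server (assumed defaults).
openai.api_base = "http://127.0.0.1:8100/v1"
openai.api_key = "none"  # the demo server does not check the key

resp = openai.ChatCompletion.create(
    model="baichuan2-fastllm",                      # must match model_list in fastllm-openai.py
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.1,
    stream=False,                                   # non-streaming: one chat.completion object
)
print(resp.choices[0].message.content)
print(resp.usage)  # prompt/completion/total token counts reported by the server
```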
api_server_demo/fastllm-openai.py (new file, mode 100644; view file @ 704137ae)

```python
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8100/docs for documents.

import time
import json
import torch
import uvicorn
import argparse
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
# from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
from fastllm_pytools import llm


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    global device_map
    if torch.cuda.is_available():
        for device in device_map:
            with torch.cuda.device(device):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class Usage(BaseModel):
    prompt_tokens: int = None
    total_tokens: int = None
    completion_tokens: int = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    id: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    usage: Usage = None


@app.get("/v1/models", response_model=ModelList)
def list_models():
    global model_list
    # return one ModelCard per model name served by this process
    return ModelList(data=[ModelCard(id=name) for name in model_list])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
def create_chat_completion(request: ChatCompletionRequest):
    if request.model not in model_list:
        raise HTTPException(status_code=400, detail="Invalid Model Name")
    global model
    id = "chatcmpl-A"
    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    if request.max_length is not None:
        max_length = request.max_length
    else:
        max_length = 1024
    if request.temperature is not None:
        temperature = request.temperature
    else:
        temperature = 0.1
    if request.top_p is not None:
        top_p = request.top_p
    else:
        top_p = 0.8

    prev_messages = request.messages[:-1]
    # print(prev_messages)
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(id=id, query=query, history=history, max_length=max_length,
                           top_p=top_p, temperature=temperature, model_id=request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response = model.response(query=query, history=history, max_length=max_length,
                              top_p=top_p, temperature=temperature)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )
    prompt_tokens = len(model.tokenizer_encode_string(query))
    completion_tokens = len(model.tokenizer_encode_string(response))
    usage = Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )
    return ChatCompletionResponse(id=id, model=request.model, choices=[choice_data],
                                  object="chat.completion", usage=usage)


def predict(id: str, query: str, history: List[List[str]], model_id: str,
            max_length: int, top_p: float, temperature: float):
    global model
    creat_time = int(time.time())

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    # pydantic stopped supporting the dumps_kwargs arguments in 1.8.0; see https://github.com/THUDM/ChatGLM2-6B/issues/308
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    for new_response in model.stream_response(query=query, history=history, max_length=max_length,
                                              top_p=top_p, temperature=temperature):
        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_response),
            finish_reason=None
        )
        chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                       choices=[choice_data], object="chat.completion.chunk")
        # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
        yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
    yield '[DONE]'


def args_parser():
    parser = argparse.ArgumentParser(description='baichuan2_chat_demo')
    parser.add_argument('-p', '--path', type=str, default="/model", help='path to the converted model file')
    parser.add_argument('-g', '--gpus', type=str, default="0", help='GPU card(s) to run on, e.g. "0,1"')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = args_parser()
    global model_list
    model_list = ["baichuan2-fastllm"]
    global device_map
    device_map = ["cuda:" + num for num in args.gpus.split(',')]
    llm.set_device_map(device_map)
    model = llm.model(args.path)
    uvicorn.run(app, host='127.0.0.1', port=8100)
```
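If you prefer not to use the openai client, the streaming endpoint can also be read directly over SSE. The sketch below uses the third-party requests package (an assumption; it is not part of this repository) and relies on the server terminating the stream with a literal [DONE] event, as implemented in predict() above.

```python
import json
import requests

# Minimal streaming consumer; assumes the server above is running on its defaults.
payload = {
    "model": "baichuan2-fastllm",
    "messages": [{"role": "user", "content": "你好"}],
    "stream": True,
}
with requests.post("http://127.0.0.1:8100/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data:"):
            continue  # skip blank SSE separators and keep-alives
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break  # the server signals end-of-stream with a literal [DONE] event
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
print()
```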
api_server_demo/openai-client.py (new file, mode 100644; view file @ 704137ae)

```python
import openai
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed


def jls_extract_def(model, messages, temperature, max_length, stream, index):
    openai.api_base = "http://127.0.0.1:8100/v1"
    openai.api_key = "none"
    output_tokens = 0
    ret = ""
    t0 = time.time()
    result = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_length=max_length,
        stream=stream)
    for chunk in result:
        # print(chunk)
        output_tokens += 1
        if hasattr(chunk.choices[0].delta, "content"):
            if (index == 0):
                print(chunk.choices[0].delta.content, end="", flush=True)
            ret += chunk.choices[0].delta.content
    t1 = time.time()
    # print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens/(t1-t0), output_tokens))
    result = output_tokens, ret, output_tokens / (t1 - t0)
    return result


if __name__ == "__main__":
    prompt = "满江红全文"
    concurrencys = [1]
    temperature = 0.1
    max_length = 4096
    stream = True
    prompts = [prompt]
    model = "baichuan2-fastllm"
    messages = [{"role": "user", "content": "你好"}]

    pool = ThreadPoolExecutor(max_workers=32)
    for i in range(len(concurrencys)):
        cur_prompts = prompts * concurrencys[i]
        token_count = 0
        threads = []
        t0 = time.time()
        for index, prompt in enumerate(cur_prompts):
            messages[0]["content"] = prompt
            t = pool.submit(jls_extract_def, model, messages, temperature, max_length, stream, index)
            t.index = index
            threads.append(t)
        for future in as_completed(threads):
            result = future.result()
            print(future.index)
            print(result)
            print("\n")
            token_count += result[0]
        t1 = time.time()
        print("\n---------------------------------------------\n")
        print("\nconcurrency: {}".format(concurrencys[i]))
        print("\ntotal use: {:.2f}".format(t1 - t0))
        print("\ntoken/s: {:.2f}, token_count: {}".format(token_count / (t1 - t0), token_count))
        print("\n---------------------------------------------\n")
```
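As the README notes, the prompt and concurrencys variables are meant to be edited before a benchmark run. For example (the values below are placeholders, pick ones that match your hardware):

```python
# replay the same prompt at 1, 4 and 8 simultaneous requests in a single run
concurrencys = [1, 4, 8]
```

With this change, the outer loop above submits concurrencys[i] copies of the prompt per pass and prints the aggregate tokens/s for each concurrency level.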
api_server_demo/requirements.txt (new file, mode 100644; view file @ 704137ae)

```
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
```
benchmark/benchmark (view file @ 704137ae)

No preview for this file type (binary file changed).
package/fastllm_pytools/hf_model.py (view file @ 704137ae)

```diff
@@ -26,6 +26,9 @@ def create(model,
         exit(0);

     # 0.1 model info
+    # if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #     model.config.model_type = "chatglm3"
+    #     print("model.config.model_type: chatglm3!")
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
@@ -48,6 +51,12 @@ def create(model,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     weight_type_dict = {};
```
package/fastllm_pytools/libfastllm_tools.so (view file @ 704137ae)

No preview for this file type (binary file changed).
package/fastllm_pytools/llm.py (view file @ 704137ae)

```diff
@@ -3,10 +3,14 @@ import math
 import os;
 import threading
 from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
+from copy import deepcopy
+import json
 import platform
 if platform.system() == 'Windows':
-    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
+    fastllm_lib = ctypes.CDLL(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"), winmode=0)
 elif platform.system() == 'Darwin':
     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.dylib"))
 else:
     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
@@ -21,7 +25,8 @@ fastllm_lib.token_encode_string.restype = ctypes.c_int
 fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
-                                                  ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+                                                  ctypes.c_float, ctypes.c_float, ctypes.c_bool, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
 fastllm_lib.launch_response_llm_model.restype = ctypes.c_int

 fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
@@ -33,26 +38,49 @@ fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
 fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
-fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
+# fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
+fastllm_lib.response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)

 fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
-                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
 fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int

 fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
-fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
+# fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
+fastllm_lib.fetch_response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)

 fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
-fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
+# fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
+fastllm_lib.make_history_llm_model.restype = ctypes.POINTER(ctypes.c_char)

 fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
-fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
+# fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
+fastllm_lib.make_input_llm_model.restype = ctypes.POINTER(ctypes.c_char)

 fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]

 fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]

+fastllm_lib.get_llm_model_type.argtype = [ctypes.c_int]
+fastllm_lib.get_llm_model_type.restype = ctypes.POINTER(ctypes.c_char)
+
+fastllm_lib.response_batch_str_llm_model.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+fastllm_lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
+
+fastllm_lib.response_batch_tokens_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+fastllm_lib.response_batch_tokens_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
+
+fastllm_lib.freeChars.argtype = [ctypes.POINTER(ctypes.c_char)]
+# fastllm_lib.freeChars.restype = ctypes.c_char_p
+
+fastllm_lib.freeCharArray.argtype = [ctypes.POINTER(ctypes.c_char_p)]

 def set_cpu_threads(threads: int):
     fastllm_lib.set_cpu_threads(threads);
@@ -120,6 +148,11 @@ class model:
         # Not an automatic cache, to avoid locking the cache dict in multi-threaded calls and to leave the choice open for different scenarios
         self.tokenizer_decode_token_cache = None
+        model_type_ptr = fastllm_lib.get_llm_model_type(self.model)
+        self.model_type = ctypes.string_at(model_type_ptr).decode()
+        fastllm_lib.freeChars(model_type_ptr)
+        # print("model_type:", self.model_type)
+
     def get_prompt(self, query: str, history: List[Tuple[str, str]] = None) -> str:
@@ -127,8 +160,13 @@ class model:
             history = [];
         prompt = "";
         for i, (old_query, response) in enumerate(history):
-            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode();
-        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode();
+            history_ptr = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode())
+            prompt = ctypes.string_at(history_ptr).decode()
+            fastllm_lib.freeChars(history_ptr)
+        input_ptr = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode())
+        prompt = ctypes.string_at(input_ptr).decode()
+        fastllm_lib.freeChars(input_ptr)
         return prompt;

     def save(self, path: str):
@@ -150,7 +188,7 @@ class model:
     def tokenizer_encode_string(self, content: str) -> List[int]:
         output_buffer_init_len = 1024
-        if self.thread_local_obj.tokenizer_encode_string__output_buffer is None:
+        if not hasattr(self.thread_local_obj, 'tokenizer_encode_string__output_buffer') or self.thread_local_obj.tokenizer_encode_string__output_buffer is None:
             self.thread_local_obj.tokenizer_encode_string__output_buffer = (ctypes.c_int * output_buffer_init_len)()
         buffer = self.thread_local_obj.tokenizer_encode_string__output_buffer
@@ -178,7 +216,7 @@ class model:
             return cache_result

         output_buffer_init_len = 256
-        if self.thread_local_obj.tokenizer_decode_token__output_buffer is None:
+        if not hasattr(self.thread_local_obj, 'tokenizer_decode_token__output_buffer') or self.thread_local_obj.tokenizer_decode_token__output_buffer is None:
             self.thread_local_obj.tokenizer_decode_token__output_buffer = ctypes.create_string_buffer(output_buffer_init_len)
         buffer = self.thread_local_obj.tokenizer_decode_token__output_buffer
@@ -199,19 +237,29 @@ class model:
                 break
         return buffer_bytes[:result_len]

+    def stop_token_ctypes(self, stop_token_ids):
+        if stop_token_ids is None:
+            return 0, None
+        else:
+            return ctypes.c_int(len(stop_token_ids)), (ctypes.c_int * len(stop_token_ids))(*stop_token_ids)
+
     def response_logits(self,
                         query: str,
                         history: List[Tuple[str, str]] = None,
-                        tokenizer = None) -> str:
+                        tokenizer = None,
+                        stop_token_ids: List[int] = None,) -> str:
         prompt = query if self.direct_query else self.get_prompt(query, history);
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
         if (tokenizer == None):
             handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                                ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
-                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
+                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True), stop_token_len, stop_token_list);
         else:
             input = tokenizer.encode(prompt);
             handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                           1, False, 1, 1, 1, 1, True);
+                                                           1, False, 1, 1, 1, 1, True, stop_token_len, stop_token_list);
         vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
         logits = list(range(vocab_size))
         array = (ctypes.c_float * (vocab_size * 4))(*logits);
@@ -224,7 +272,8 @@ class model:
     def response(self,
                  query: str,
                  history: List[Tuple[str, str]] = None,
-                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
+                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                 stop_token_ids: List[int] = None) -> str:
         ret = "";
         for i in self.stream_response(query = query,
                                       history = history,
@@ -233,27 +282,33 @@ class model:
                                       top_p = top_p, top_k = top_k,
                                       temperature = temperature,
                                       repeat_penalty = repeat_penalty,
-                                      one_by_one = True):
+                                      one_by_one = True,
+                                      stop_token_ids = stop_token_ids):
             ret += i;
         return ret;

     def stream_response(self,
                         query: str,
                         history: List[Tuple[str, str]] = None,
-                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, one_by_one = True):
+                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, one_by_one = True, stop_token_ids: List[int] = None):
         prompt = query if self.direct_query else self.get_prompt(query, history);
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
         handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                            ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
-                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
+                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list);
         res = "";
         ret = b'';
         fail_cnt = 0;
         while True:
-            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+            # ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+            ret_chararry = fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+            ret += ctypes.string_at(ret_chararry)
+            fastllm_lib.freeChars(ret_chararry)
             cur = "";
             try:
-                cur = ret.decode();
+                cur = ret.decode()
+                ret = b'';
             except:
                 fail_cnt += 1;
@@ -272,13 +327,16 @@ class model:
     def stream_response_raw(self,
                             input_tokens: List[int],
-                            max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, one_by_one = True
+                            max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, one_by_one = True, stop_token_ids: List[int] = None
                             ):
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
         handle = fastllm_lib.launch_response_llm_model(self.model, len(input_tokens),
                                                        (ctypes.c_int * len(input_tokens))(*input_tokens),
                                                        ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
-                                                       ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
+                                                       ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list)
         # A long-tail character may need several tokens before it can be produced, so only bytes are returned; the string.decode strategy is left to the caller
         # This makes it easy to count output tokens and to control decoding when a utf-8 sequence is incomplete
@@ -298,50 +356,265 @@ class model:
             yield total_bytes

     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
-             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
-        if (not(history)):
-            history = [];
-        prompt = query if self.direct_query else self.get_prompt(query, history);
-        input = tokenizer.encode(prompt);
-        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
-        result = [];
-        while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
-            if (cur == -1):
-                break;
-            result.append(cur);
-        response = tokenizer.decode(result);
-        history = history + [(query, response)];
-        return response, history;
+             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, stop_token_ids: List[int] = None, **kwargs):
+        if self.model_type != "chatglm3":
+            if (not(history)):
+                history = [];
+            prompt = query if self.direct_query else self.get_prompt(query, history);
+            input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
+                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
+            result = [];
+            while True:
+                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+                if (cur == -1):
+                    break;
+                result.append(cur);
+            response = tokenizer.decode(result);
+            history = history + [(query, response)];
+            return response, history;
+        else:
+            if history is None:
+                history = []
+            role = "user"
+            input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
+            history.append({"role": role, "content": query})
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
+                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
+            tokens = [];
+            while True:
+                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+                if (cur == -1):
+                    break;
+                tokens.append(cur);
+            response = tokenizer.decode(tokens);
+            if response and response[-1] != "�":
+                response, new_history = self.process_chatglm3_response(response, history)
+                return response, new_history

     def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
-                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
-                    return_past_key_values = False, **kwargs) -> str:
-        if (not(history)):
-            history = [];
-        prompt = query if self.direct_query else self.get_prompt(query, history);
-        input = tokenizer.encode(prompt);
-        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
-                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
-        tokens = [];
-        while True:
-            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
-            if (cur == -1):
-                break;
-            tokens.append(cur);
-            response = tokenizer.decode(tokens);
-            new_history = history + [(query, response)];
-            if return_past_key_values:
-                yield response, new_history, None;
-            else:
-                yield response, new_history;
+                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                    return_past_key_values = False, stop_token_ids: List[int] = None, **kwargs) -> str:
+        if self.model_type != "chatglm3":
+            if (not(history)):
+                history = [];
+            prompt = query if self.direct_query else self.get_prompt(query, history);
+            input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
+                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
+            tokens = [];
+            while True:
+                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+                if (cur == -1):
+                    break;
+                tokens.append(cur);
+                response = tokenizer.decode(tokens);
+                new_history = history + [(query, response)];
+                if return_past_key_values:
+                    yield response, new_history, None;
+                else:
+                    yield response, new_history;
+        else:
+            if history is None:
+                history = []
+            role = "user"
+            input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
+            history.append({"role": role, "content": query})
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
+                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
+            tokens = [];
+            while True:
+                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+                if (cur == -1):
+                    break;
+                tokens.append(cur);
+                response = tokenizer.decode(tokens);
+                if response and response[-1] != "�":
+                    response, new_history = self.process_chatglm3_response(response, history)
+                    if return_past_key_values:
+                        yield response, new_history, past_key_values
+                    else:
+                        yield response, new_history

     def set_adapter(self, name: str):
         fastllm_lib.set_adapter(self.model, str(name).encode())

     def disable_adapter(self):
         fastllm_lib.disable_adapter(self.model)

+    def process_chatglm3_response(self, output, history):
+        content = ""
+        history = deepcopy(history)
+        for response in output.split("<|assistant|>"):
+            metadata, content = response.split("\n", maxsplit=1)
+            if not metadata.strip():
+                content = content.strip()
+                history.append({"role": "assistant", "metadata": metadata, "content": content})
+                content = content.replace("[[训练时间]]", "2023年")
+            else:
+                history.append({"role": "assistant", "metadata": metadata, "content": content})
+                if history[0]["role"] == "system" and "tools" in history[0]:
+                    content = "\n".join(content.split("\n")[1:-1])
+                    def tool_call(**kwargs):
+                        return kwargs
+                    parameters = eval(content)
+                    content = {"name": metadata.strip(), "parameters": parameters}
+                else:
+                    content = {"name": metadata.strip(), "content": content}
+        return content, history
+
+    def build_chatglm3_input(self, tokenizer, query, history = None, role = "user"):
+        if history is None:
+            history = []
+        input_ids = []
+        for item in history:
+            content = item["content"]
+            if item["role"] == "system" and "tools" in item:
+                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
+            input_ids.extend(tokenizer.build_single_message(item["role"], item.get("metadata", ""), content))
+        input_ids.extend(tokenizer.build_single_message(role, "", query))
+        input_ids.extend([tokenizer.get_command("<|assistant|>")])
+        return input_ids
+
+    def response_batch_raw(self, querys: List[str], historys: List[List[Tuple[str, str]]] = None,
+                           max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs) -> List[str]:
+        query_size = len(querys)
+        if (not(historys)):
+            historys = [[] for _ in range(query_size)]
+        inputs = (ctypes.c_char_p * query_size)()
+        for i, query in enumerate(querys):
+            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
+            inputs[i] = ctypes.c_char_p(prompt.encode())
+        outputs = fastllm_lib.response_batch_str_llm_model(self.model, inputs, query_size, max_length,
+                                                           do_sample, top_p, top_k, temperature, repeat_penalty, False)
+        responses = []
+        for i in range(query_size):
+            response = ctypes.string_at(outputs[i]).decode()
+            responses.append(response)
+            historys[i] = historys[i] + [(querys[i], response)]
+        fastllm_lib.freeCharArray(outputs, query_size)
+        return responses, historys
+
+    def chat_batch_raw(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None,
+                       max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs):
+        query_size = len(querys)
+        if (not(historys)):
+            historys = [[] for _ in range(query_size)]
+        inputs = []
+        inputs_len = []
+        for i, query in enumerate(querys):
+            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
+            input = tokenizer.encode(prompt);
+            inputs.extend(input)
+            inputs_len.append(len(input))
+        outputs = fastllm_lib.response_batch_tokens_llm_model(self.model, query_size,
+                                                              (ctypes.c_int * len(inputs_len))(*inputs_len),
+                                                              (ctypes.c_int * len(inputs))(*inputs),
+                                                              max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False)
+        responses = []
+        for i in range(query_size):
+            response = ctypes.string_at(outputs[i]).decode()
+            responses.append(response)
+            historys[i] = historys[i] + [(querys[i], response)]
+        fastllm_lib.freeCharArray(outputs, query_size)
+        return responses, historys
+
+    def response_batch(self, querys: List[str], historys: List[List[Tuple[str, str]]] = None,
+                       max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                       stop_token_ids: List[int] = None, **kwargs) -> List[str]:
+        query_size = len(querys)
+        if (not(historys)):
+            historys = [[] for _ in range(query_size)]
+        handles = []
+        for i, query in enumerate(querys):
+            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
+            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
+                                                               ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
+                                                               ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list)
+            handles.append(handle)
+        responses = []
+        for i, handle in enumerate(handles):
+            res = ""
+            ret = b''
+            fail_cnt = 0
+            while True:
+                # ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+                ret_chararry = fastllm_lib.fetch_response_str_llm_model(self.model, handle);
+                ret += ctypes.string_at(ret_chararry)
+                fastllm_lib.freeChars(ret_chararry)
+                cur = ""
+                try:
+                    cur = ret.decode()
+                    ret = b''
+                except:
+                    fail_cnt += 1
+                    if (fail_cnt == 20):
+                        break
+                    else:
+                        continue
+                fail_cnt = 0
+                if (cur == "<flmeos>"):
+                    break;
+                res += cur
+            responses.append(res)
+            historys[i] = historys[i] + [(querys[i], res)]
+        return responses, historys
+
+    def chat_batch(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None,
+                   max_length: int = 1024, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                   stop_token_ids: List[int] = None, **kwargs):
+        query_size = len(querys)
+        if (not(historys)):
+            historys = [[] for _ in range(query_size)]
+        handles = []
+        for i, query in enumerate(querys):
+            prompt = query if self.direct_query else self.get_prompt(query, historys[i])
+            input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
+                                                           max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
+            handles.append(handle)
+        responses = []
+        for i, handle in enumerate(handles):
+            result = [];
+            while True:
+                cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
+                if (cur == -1):
+                    break;
+                result.append(cur);
+            response = tokenizer.decode(result);
+            responses.append(response)
+            historys[i] = historys[i] + [(querys[i], response)]
+        return responses, historys
+
     def release_memory(self):
         fastllm_lib.release_memory(self.model)
```
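Taken together, the llm.py changes add a model_type probe, stop_token_ids support, and batch helpers to the Python bindings. A minimal sketch of how the updated API is meant to be called follows; the model path and the stop-token id are placeholders, not values taken from this repository.

```python
from fastllm_pytools import llm

llm.set_device_map(["cuda:0"])               # same call the api_server demo uses
model = llm.model("baichuan-13b-fp16.bin")   # placeholder path to a converted model file

# Single-prompt generation; stop_token_ids is the optional argument added in this commit.
print(model.response("你好", max_length=256, stop_token_ids=[2]))  # 2 is a placeholder stop id

# Batched generation via the new response_batch helper; it returns (responses, histories).
replies, histories = model.response_batch(["你好", "满江红全文"], max_length=256)
for reply in replies:
    print(reply)
```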
package/fastllm_pytools/torch2flm.py (view file @ 704137ae)

```diff
@@ -80,6 +80,8 @@ def tofile(exportPath,
     fo.write(struct.pack('i', 2))

     # 0.1 model info
+    #if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #    model.config.model_type = "chatglm3"
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
@@ -112,6 +114,13 @@ def tofile(exportPath,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        print("chatglm3")
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     modelInfo["tokenizer_use_score"] = "1" # tokenizer with scores
```