# Baichuan-13B_CPP
## Model Architecture
The Baichuan series are open-source, large-scale pretrained models developed by Baichuan Intelligence (百川智能), available in sizes including 7B and 13B. Baichuan-7B is a 7-billion-parameter model trained on about 1.2 trillion tokens; it supports both Chinese and English and has a context window of 4096 tokens. Baichuan-13B, developed after Baichuan-7B, contains 13 billion parameters and was trained on 1.4 trillion tokens of high-quality corpus, 40% more than LLaMA-13B, making it the open-source 13B-scale model trained on the most data at the time of release. Baichuan Intelligence has also released an aligned model, Baichuan-13B-Chat, with strong conversational ability.
Detailed model parameters:
| Model | Hidden size | Layers | Heads | Vocab size | Total parameters | Training data (tokens) | Position encoding | Max length |
| -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
| Baichuan-13B | 5,120 | 40 | 40 | 64,000 | 13,264,901,120 | 1.4万亿 | ALiBi | 4096 |
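As a quick sanity check on the table, the total parameter count can be reproduced from the architecture numbers. The sketch below assumes the published FFN intermediate size of 13,696 and untied input/output embeddings, neither of which is listed in the table:
```
# Rough parameter count for Baichuan-13B derived from the table above.
# Assumptions: FFN intermediate size 13696, separate (untied) embedding and lm_head matrices.
hidden, layers, vocab, ffn = 5120, 40, 64000, 13696

attn = hidden * (3 * hidden) + hidden * hidden       # fused QKV projection (W_pack) + output projection
mlp = 2 * hidden * ffn + ffn * hidden                # gate and up projections, then down projection
norms = 2 * hidden                                   # two RMSNorm weight vectors per layer
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * hidden + hidden  # + embedding, lm_head, final norm
print(total)  # 13264901120, matching the table
```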
## Algorithm
The Baichuan models use the standard Transformer architecture and follow the same overall design as LLaMA. Baichuan-13B uses ALiBi (Attention with Linear Biases) for positional encoding, which requires less computation than rotary embeddings and gives a noticeable boost to inference performance.
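For illustration, here is a minimal PyTorch sketch of how ALiBi builds its per-head linear bias; it is only a reference sketch of the technique, not the code used by the C++ inference engine:
```
import math
import torch

def alibi_slopes(n_heads):
    # Geometric slope sequence from the ALiBi paper, extended for head counts
    # that are not powers of two (Baichuan-13B uses 40 heads).
    def power_of_2(n):
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * (start ** i) for i in range(n)]
    if math.log2(n_heads).is_integer():
        return power_of_2(n_heads)
    closest = 2 ** math.floor(math.log2(n_heads))
    return power_of_2(closest) + alibi_slopes(2 * closest)[0::2][: n_heads - closest]

def alibi_bias(n_heads, seq_len):
    # bias[h, i, j] = slope[h] * (j - i): zero for the current token, increasingly
    # negative for tokens further in the past; future positions are removed by the causal mask.
    slopes = torch.tensor(alibi_slopes(n_heads))
    pos = torch.arange(seq_len)
    rel = pos[None, :] - pos[:, None]
    return slopes[:, None, None] * rel[None, :, :]   # added to attention scores before softmax
```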
## Model Download
[Original model download](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat) (baichuan-inc/Baichuan-13B-Chat on Hugging Face)
## Environment Setup
### Environment Preparation
The inference Docker image can be pulled from the SourceFind (光源) registry as follows:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
```
### Container Startup
A reference command for starting the inference container is given below; adjust it as needed:
```
# <container_name>: custom container name
# <project_path>: path to this project
docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
```
### Load Environment
After entering the container, run the following command to load the runtime environment variables:
```
source /opt/dtk-23.04/cuda/env.sh
```
### Installation
```
# enter the package directory of this project
cd package
python setup.py install
```
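To confirm the package was installed into the current environment, an optional quick check is to import it and query a setting exposed by its API:
```
# optional sanity check: importing fastllm_pytools.llm loads libfastllm_tools.so
from fastllm_pytools import llm
print(llm.get_cpu_threads())   # number of CPU threads fastllm will use
```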
### Model Conversion
```
# To keep the image small, the SourceFind image does not include the dependencies needed to run
# the original Baichuan-13B-Chat model, since they are not required for inference itself.
# If you already have an environment that runs the original Baichuan-13B-Chat, you can move the
# conversion script baichuan2flm.py into that environment, or install the conversion dependencies
# here by running pip install -r requirements.txt.
# If you use an already-downloaded model or your own finetuned model, change the model path used
# when creating the tokenizer and model in baichuan2flm.py.
# Run:
python3 baichuan2flm.py baichuan-13b-fp16.bin float16 # export the fp16 model; the argument is the output model path
```
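If the checkpoint is already on local disk (for example your own finetuned model), the conversion can also be scripted directly with torch2flm, as baichuan2flm.py does. A minimal sketch, where the local path is a placeholder you need to replace:
```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastllm_pytools import torch2flm

local_path = "/path/to/Baichuan-13B-Chat"   # placeholder: local or finetuned checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(local_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype=torch.float16,
                                             trust_remote_code=True).cpu()
# export to the fastllm format; dtype may be "float16", "int8" or "int4"
torch2flm.tofile("baichuan-13b-fp16.bin", model, tokenizer, dtype="float16")
```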
### Model Inference
```
# command-line chat program, demonstrating model creation and streaming responses
python cli_demo.py -p baichuan-13b-fp16.bin
# simple web UI; install streamlit-chat first
streamlit run web_demo.py baichuan-13b-fp16.bin
```
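Beyond the two demos, the converted model can be driven from your own Python code through the fastllm_pytools API bundled with this project. A minimal sketch, assuming the model file exported above:
```
from fastllm_pytools import llm

model = llm.model("baichuan-13b-fp16.bin")            # load the converted model
# blocking call: returns the full answer at once
print(model.response("Hello, please introduce yourself."))
# streaming call: yields the answer piece by piece, as cli_demo.py does
for piece in model.stream_response("What sights are there in Beijing?", history = []):
    print(piece, end = "", flush = True)
```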
### Inference Performance Test
The benchmark program can be used to measure inference speed. Configure and run it according to the description shown by ./benchmark -h; speed varies somewhat with configuration and input.
```
# enter the benchmark directory
cd benchmark
# make benchmark executable
chmod +x benchmark
# test examples
./benchmark -p ../baichuan-13b-fp16.bin -f prompts/beijing.txt
./benchmark -p ../baichuan-13b-fp16.bin -f prompts/hello.txt -b 512 -l 18
```
## Demo
![Baichuan inference](baichuan推理.gif)
## Application Scenarios
### Algorithm Category
`Natural Language Processing`
### Key Application Industries
`NLP, intelligent chat assistants, research`
## Source Repository and Issue Feedback
- https://developer.hpccube.com/codes/modelzoo/baichuan-13b_cpp
## References
- [https://github.com/baichuan-inc/Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B)
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
    modelpath = "baichuan-inc/Baichuan-13B-Chat"
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(modelpath, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.to("cpu")
try:
model.generation_config = GenerationConfig.from_pretrained(modelpath)
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
#!/bin/bash
# define the program command to run
program="./benchmark -p ../baichuan-13b-fp16.bin -f prompts/beijing.txt --loop"
# number of benchmark instances to run
num_instances=2
# launch the instances in the background
for ((i=1; i<=num_instances; i++)); do
$program &
done
# stress-test duration (seconds)
test_duration=120
# wait while the stress test runs
sleep $test_duration
# kill all background benchmark processes
pkill -f benchmark
北京有什么景点?
Hello!
import argparse
from fastllm_pytools import llm
import time
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
    parser.add_argument('-p', '--path', type = str, required = True, default = '', help = 'path to the model file')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
model = llm.model(args.path)
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
        query = input("\nUser: ")
if query.strip() == "stop":
break
if query.strip() == "clear":
history = []
print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("AI:", end = "")
curResponse = ""
token_count = 0
t0 = time.time()
for response in model.stream_response(query, history = history):
curResponse += response
print(response, flush = True, end = "")
token_count += 1
t1 = time.time()
word_len = len(curResponse)
print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_count/(t1-t0), word_len/(t1-t0)))
history.append((query, curResponse))
# Unique model identifier
modelCode = 416
# Model name
modelName=Baichuan-13B_CPP
# Model description
modelDescription=Baichuan-13B is an open-source, commercially usable large language model with 13 billion parameters developed by Baichuan Intelligence; it achieves the best results among models of its size on authoritative Chinese and English benchmarks
# Application scenarios
appScenario=Inference,NLP,intelligent chat assistant,finance,education
# Framework type
frameType=cpp
__all__ = ["llm"]
from fastllm_pytools import llm
import torch
import ctypes
import numpy as np
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2,
"QuantizedLinear": 111
}
def create(model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "qwen"):
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
weight_type_dict = {}
module_dict = {}
weight_bits = {}
for key, m in model.named_modules():
if (str(type(m)).find("QuantizedLinear") != -1):
weight_type_dict[key + ".weight"] = "QuantizedLinear"
weight_bits[key + ".weight"] = m.weight_bit_width
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
peft_config = {}
active_adapter = ""
if hasattr(model, "peft_config"):
peft_config = model.peft_config
if hasattr(model, "active_adapter"):
active_adapter = model.active_adapter
model = model.cpu()
dict = model.state_dict()
model_type = model.config.__dict__["model_type"]
model = llm.fastllm_lib.create_empty_llm_model(model_type.encode())
for it in modelInfo.keys():
llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode())
for adapter_name in peft_config.keys():
adapter_dict = peft_config[adapter_name].__dict__
for it in adapter_dict.keys():
llm.fastllm_lib.add_adapter_dict_llm_model(model, str(adapter_name).encode(), str(it).encode(), str(adapter_dict[it]).encode())
if len(active_adapter) != 0:
llm.fastllm_lib.set_adapter(model, str(active_adapter).encode())
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if modelInfo["model_type"] == "qwen":
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
for i in range(piece_size):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
i, ctypes.c_float(tokenizer.sp_model.get_score(i)))
else:
vocab = tokenizer.get_vocab()
for v in vocab.keys():
if (modelInfo["model_type"] == "moss"):
vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0))
elif (modelInfo["model_type"] == "qwen"):
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0))
else:
llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
elif (cur_weight_type == 2):
# TODO bfloat
to_data_type = 0
weight_name = key
if peft_config is not None:
weight_name = weight_name.replace('base_model.model.', '')
if (cur_weight_type == 111):
llm.fastllm_lib.add_qlinear_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
weight_bits[key],
dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
dict[key].numpy().ctypes.data_as(ctypes.c_void_p))
else:
llm.fastllm_lib.add_weight_llm_model(model, weight_name.encode(),
len(dict[key].shape),
(ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
to_data_type, cur_weight_type, ori_data_type,
dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p))
tot += 1
print("convert (", tot, "/", len(dict), end = " )\r")
print("")
llm.fastllm_lib.init_params_llm_model(model)
llm.fastllm_lib.warmup_llm_model(model)
ret = llm.model("", id = model)
return ret
import ctypes
import os
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
import platform
if platform.system() == 'Windows':
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
else:
fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int
fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_logits_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_float)]
fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
fastllm_lib.set_cpu_threads(threads)
def get_cpu_threads() -> int:
return fastllm_lib.get_cpu_threads()
def print_ins_info():
fastllm_lib.print_cpu_ins()
def set_cpu_kvcache(cpu_kvcache):
fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))
def get_cpu_kvcache():
return fastllm_lib.get_kvcache_in_cpu()
def set_cpu_low_mem(low_mem):
fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))
def get_cpu_low_mem():
return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
devices = []
values = []
if (isinstance(device_map, str)):
devices.append(device_map)
values.append(1)
elif (isinstance(device_map, list)):
devices = [str(x) for x in device_map]
values = [1 for x in device_map]
elif (isinstance(device_map, dict)):
devices = [str(x) for x in device_map.keys()]
values = [int(device_map[x]) for x in device_map.keys()]
else:
print("set_device_map error.")
return
device_str = ''.join(devices)
device_len = [len(x) for x in devices]
fastllm_lib.set_device_map(len(device_len),
(ctypes.c_int * len(device_len))(*device_len),
device_str.encode(),
(ctypes.c_int * len(values))(*values))
def from_hf(model,
tokenizer = None,
dtype = "float16"):
from fastllm_pytools import hf_model
return hf_model.create(model, tokenizer, dtype = dtype)
class model:
def __init__ (self, path : str,
id : int = -99999):
if (id != -99999):
self.model = id
else:
self.model = fastllm_lib.create_llm_model(path.encode())
self.direct_query = False
def get_prompt(self,
query: str,
history: List[Tuple[str, str]] = None) -> str:
if (not(history)):
history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode()
prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode()
return prompt
def save(self, path : str):
fastllm_lib.save_llm_model(self.model, path.encode())
def eval(self):
pass
def response_logits(self,
query: str,
history: List[Tuple[str, str]] = None,
tokenizer = None) -> str:
prompt = query if self.direct_query else self.get_prompt(query, history)
if (tokenizer == None):
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True))
else:
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
1, False, 1, 1, 1, 1, True)
vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model)
logits = list(range(vocab_size))
array = (ctypes.c_float * (vocab_size * 4))(*logits)
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
out = list(array)[:vocab_size]
while (ret != -1):
ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array)
return out
def response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
ret = ""
for i in self.stream_response(query = query,
history = history,
max_length = max_length,
do_sample = do_sample,
top_p = top_p, top_k = top_k,
temperature = temperature,
repeat_penalty = repeat_penalty,
one_by_one = True):
ret += i
return ret
def stream_response(self,
query: str,
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True):
prompt = query if self.direct_query else self.get_prompt(query, history)
handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
res = ""
ret = b''
fail_cnt = 0
while True:
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
try:
cur = ret.decode()
ret = b''
except:
fail_cnt += 1
if (fail_cnt == 20):
break
else:
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break
if one_by_one:
yield cur
else:
res += cur
yield res
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
result = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
result.append(cur)
response = tokenizer.decode(result)
history = history + [(query, response)]
return response, history
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
return_past_key_values = False, **kwargs) -> str:
if (not(history)):
history = []
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False)
tokens = []
while True:
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur == -1):
break
tokens.append(cur)
response = tokenizer.decode(tokens)
new_history = history + [(query, response)]
if return_past_key_values:
yield response, new_history, None
else:
yield response, new_history
def set_adapter(self, name: str):
fastllm_lib.set_adapter(self.model, str(name).encode())
def disable_adapter(self):
fastllm_lib.disable_adapter(self.model)
import struct
import numpy as np
import torch
def writeString(fo, s):
fo.write(struct.pack('i', len(s)))
fo.write(s.encode())
def writeKeyValue(fo, key, value):
writeString(fo, key)
writeString(fo, value)
fastllm_data_type_dict = {
"int4": 8,
"int8": 3,
"float16": 7,
"float32": 0,
}
fastllm_weight_type_dict = {
"linear": 1,
"embedding": 2
}
def write_int8(fo, v):
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def tofile(exportPath,
model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None,
dtype = "float16"):
if (dtype not in fastllm_data_type_dict):
print("dtype should in ", list(fastllm_data_type_dict.keys()))
exit(0)
dict = model.state_dict()
fo = open(exportPath, "wb")
# 0. version id
fo.write(struct.pack('i', 2))
# 0.1 model info
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
if ("model_type" not in modelInfo):
print("unknown model_type.")
exit(0)
if (pre_prompt):
modelInfo["pre_prompt"] = pre_prompt
if (user_role):
modelInfo["user_role"] = user_role
if (bot_role):
modelInfo["bot_role"] = bot_role
if (history_sep):
modelInfo["history_sep"] = history_sep
if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan 2 (second generation)
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
if hasattr(model, "peft_config"):
adapter_size = len(model.peft_config)
modelInfo["peft_size"] = adapter_size
fo.write(struct.pack('i', len(modelInfo)))
for it in modelInfo.keys():
writeKeyValue(fo, str(it), str(modelInfo[it]))
if hasattr(model, "peft_config"):
for adapter_name in model.peft_config.keys():
adapter_dict = model.peft_config[adapter_name].__dict__
writeString(fo, adapter_name)
fo.write(struct.pack('i', len(adapter_dict)))
for it in adapter_dict.keys():
writeKeyValue(fo, str(it), str(adapter_dict[it]))
# 1. vocab
if (tokenizer):
if (hasattr(tokenizer, "tokenizer")):
if (modelInfo['model_type'] == "qwen"):
pass
else:
tokenizer = tokenizer.tokenizer
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fo.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', i))
fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
else:
vocab = tokenizer.get_vocab()
fo.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
else:
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
fo.write(struct.pack('i', vocab[v]))
fo.write(struct.pack('f', 1.0))
else:
fo.write(struct.pack('i', 0))
weight_type_dict = {}
module_dict = {}
for key, m in model.named_modules():
if (isinstance(m, torch.nn.Linear)):
weight_type_dict[key + ".weight"] = "linear"
module_dict[key + ".weight"] = m
if (isinstance(m, torch.nn.Embedding)):
weight_type_dict[key] = "embedding"
# 2. weight
fo.write(struct.pack('i', len(dict)))
tot = 0
for key in dict:
ori_data_type = 0
ori_np_data_type = np.float32
cur_weight_type = 0
if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
to_data_type = 0
if (cur_weight_type == 1):
to_data_type = fastllm_data_type_dict[dtype]
if (to_data_type == 7):
ori_data_type = 7
ori_np_data_type = np.float16
cur = dict[key].numpy().astype(ori_np_data_type)
if hasattr(model, "peft_config"):
weight_name = key.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
else:
fo.write(struct.pack('i', len(key)))
fo.write(key.encode())
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
if (to_data_type == 3):
write_int8(fo, cur)
elif (to_data_type == 8):
write_int4(fo, cur)
else:
fo.write(struct.pack('i', to_data_type))
fo.write(cur.data)
tot += 1
print("output (", tot, "/", len(dict), end = " )\r")
print("\nfinish.")
fo.close()
from setuptools import setup, find_packages
setup (
name = "fastllm_pytools",
version = "0.0.1",
description = "Fastllm pytools",
packages = ['fastllm_pytools'],
url = "https://developer.hpccube.com/codes/aicomponent/fastllm",
package_data = {
'': ['*.dll', '*.so']
}
)
import streamlit as st
from streamlit_chat import message
from fastllm_pytools import llm
import sys
st.set_page_config(
page_title="fastllm web demo",
page_icon=":robot:"
)
@st.cache_resource
def get_model():
model = llm.model(sys.argv[1])
return model
if "messages" not in st.session_state:
st.session_state.messages = []
for i, (prompt, response) in enumerate(st.session_state.messages):
with st.chat_message("user"):
st.markdown(prompt)
with st.chat_message("assistant"):
st.markdown(response)
if prompt := st.chat_input("Start chatting"):
model = get_model()
with st.chat_message("user"):
st.markdown(prompt)
with st.chat_message("assistant"):
message_placeholder = st.empty()
full_response = ""
for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True):
full_response += chunk
message_placeholder.markdown(full_response + "▌")
message_placeholder.markdown(full_response)
st.session_state.messages.append((prompt, full_response))