Commit 17de4239 authored by zhouxiang's avatar zhouxiang
Browse files

提交chatglm2-6b推理的示例代码

parents
# ChatGLM2-6B_CPP
## 模型介绍
ChatGLM**2**-6B 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) 的第二代版本,在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上,ChatGLM**2**-6B 引入了如下新特性:
1. **更强大的性能**:基于 ChatGLM 初代模型的开发经验,我们全面升级了 ChatGLM2-6B 的基座模型。ChatGLM2-6B 使用了 [GLM](https://github.com/THUDM/GLM) 的混合目标函数,经过了 1.4T 中英标识符的预训练与人类偏好对齐训练,[评测结果](https://github.com/THUDM/ChatGLM2-6B#评测结果)显示,相比于初代模型,ChatGLM2-6B 在 MMLU(+23%)、CEval(+33%)、GSM8K(+571%) 、BBH(+60%)等数据集上的性能取得了大幅度的提升,在同尺寸开源模型中具有较强的竞争力。
2. **更长的上下文**:基于 [FlashAttention](https://github.com/HazyResearch/flash-attention) 技术,我们将基座模型的上下文长度(Context Length)由 ChatGLM-6B 的 2K 扩展到了 32K,并在对话阶段使用 8K 的上下文长度训练。对于更长的上下文,我们发布了 [ChatGLM2-6B-32K](https://huggingface.co/THUDM/chatglm2-6b-32k) 模型。[LongBench](https://github.com/THUDM/LongBench) 的测评结果表明,在等量级的开源模型中,ChatGLM2-6B-32K 有着较为明显的竞争优势。
3. **更高效的推理**:基于 [Multi-Query Attention](http://arxiv.org/abs/1911.02150) 技术,ChatGLM2-6B 有更高效的推理速度和更低的显存占用:在官方的模型实现下,推理速度相比初代提升了 42%,INT4 量化下,6G 显存支持的对话长度由 1K 提升到了 8K。
4. **更开放的协议**:ChatGLM2-6B 权重对学术研究**完全开放**,在填写[问卷](https://open.bigmodel.cn/mla/form)进行登记后**亦允许免费商业使用**
本项目主要针对ChatGLM2-6B在DCU平台的推理性能优化,达到DCU平台较快的对话效果。
## 模型推理
### 下载镜像
在光源可拉取推理的docker镜像,拉取方式如下:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:glm-ft-v1.0
```
### 安装方法
```
#进入本工程目录
cd package
python setup.py install
```
### ChatGLM2原版模型转换
```
# 将模型转换脚本chatglm_export.py移动到原版ChatGLM2-6B环境中,也可以根据工程自带的requirements.txt安装相关依赖
# 如果使用自己finetune的模型需要修改chatglm_export.py文件中创建tokenizer, model时的模型存放路径
# 执行:
python3 chatglm_export.py chatglm2-6b-fp16.bin float16 # 导出fp16模型,参数为导出的模型路径
python3 chatglm_export.py chatglm2-6b-fp16.bin int8 # 导出int8模型,参数为导出的模型路径
```
### 运行ChatGLM2-6B模型实例
```
# 命令行聊天程序,使用了模型创建以及流式对话效果
python cli_demo.py -p chatglm2-6b-fp16.bin
# 简易webui,需要先安装streamlit-chat
streamlit run web_demo.py chatglm2-6b-fp16.bin
```
## 源码仓库及问题反馈
https://developer.hpccube.com/codes/modelzoo/chatglm2-6b_cpp
## 参考
https://github.com/THUDM/ChatGLM2-6B
import sys
from transformers import AutoTokenizer, AutoModel
import struct
import numpy as np
import torch
def writeString(fo, s):
    """Write *s* to binary stream *fo* as a length-prefixed string.

    Note: the prefix is the character count (len(s)), while the payload is
    the UTF-8 encoding — these differ for non-ASCII text.
    """
    length_prefix = struct.pack('i', len(s))
    fo.write(length_prefix)
    fo.write(s.encode())
def writeKeyValue(fo, key, value):
    """Write a key/value pair as two consecutive length-prefixed strings."""
    for item in (key, value):
        writeString(fo, item)
# Integer codes written into the export file to tag each tensor's storage
# dtype; presumably these must match the fastllm loader's DataType enum —
# TODO confirm against the native reader.
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7,
    "float32": 0,
}
# Integer codes for weight categories; only "linear" weights (code 1) are
# quantized to the requested dtype by tofile below.
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2
}
# NOTE(review): the following five lines appear to be leftover int8
# quantization scratch/debug code — they quantize a random matrix at import
# time and none of the results (v, temp, c_max, c_scale) are used afterwards.
# Candidate for deletion; kept byte-for-byte to avoid changing module-level
# side effects in this review.
v = np.random.randint(-127, 127, [10, 20]);
temp = v;
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
    """Quantize a 2-D float array to per-row symmetric int8 and write to *fo*.

    Stream layout: dtype tag (3), a zero flag, one (min, max) float pair per
    row, then the raw uint8 payload.
    """
    # Per-row absolute maximum, clipped away from zero so the scale is never 0.
    row_abs_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    scale = row_abs_max / 127.0
    quantized = (v / scale + 128.5).clip(1, 255).astype(np.uint8)
    fo.write(struct.pack('i', 3))
    fo.write(struct.pack('i', 0))
    for row in range(row_abs_max.shape[0]):
        bound = row_abs_max[row][0]
        fo.write(struct.pack('f', -bound))
        fo.write(struct.pack('f', bound))
    fo.write(quantized.data)
def write_int4(fo, v):
    """Quantize a 2-D float array to per-row 4-bit values and write to *fo*.

    Each row is mapped to the integer range [0, 15] with a scale derived from
    the row's absolute maximum; two 4-bit codes are packed per output byte
    (even columns in the high nibble), so *v* must have an even column count.

    Stream layout: dtype tag (8), a zero flag, one (min, max) float pair per
    row, then the packed uint8 payload.

    Fixes vs. original: removed a dead first assignment to c_min (it was
    immediately overwritten), and clipped the per-row maximum away from zero
    so an all-zero row cannot yield a zero scale (division by zero / NaN
    payload) — the same guard write_int8 already uses.
    """
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    c_scale = c_max / 7.0
    # Asymmetric range: codes 0..15 represent [-8*scale, +7*scale].
    c_min = c_scale * -8.0
    v = (v - c_min) / c_scale
    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
    # Pack two 4-bit codes per byte.
    v = v[:, 0::2] * 16 + v[:, 1::2]
    fo.write(struct.pack('i', 8))
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def tofile(exportPath,
           model,
           tokenizer = None,
           pre_prompt = None,
           user_role = None,
           bot_role = None,
           history_sep = None,
           dtype = "float16"):
    """Export a HuggingFace transformers model to a fastllm binary file.

    Args:
        exportPath: destination path for the binary export.
        model: torch model whose config and state_dict are serialized.
        tokenizer: optional tokenizer; its vocabulary is embedded when given.
        pre_prompt, user_role, bot_role, history_sep: optional prompt-template
            overrides stored in the model-info section.
        dtype: storage dtype for Linear weights; one of the keys of
            fastllm_data_type_dict ("int4" / "int8" / "float16" / "float32").

    File layout: version id, model-info key/value table, vocabulary, then
    every tensor from the state_dict (Linear weights quantized per *dtype*).
    """
    if (dtype not in fastllm_data_type_dict):
        print("dtype should in ", list(fastllm_data_type_dict.keys()))
        exit(0)
    # NOTE(review): `dict` shadows the builtin; it holds the model state_dict.
    dict = model.state_dict()
    fo = open(exportPath, "wb")
    # 0. version id
    fo.write(struct.pack('i', 2))
    # 0.1 model info — merge config and generation_config into one table.
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if ("model_type" not in modelInfo):
        print("unknown model_type.")
        exit(0)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    modelInfo["tokenizer_use_score"] = "1" # vocabulary entries carry scores
    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
            if (modelInfo['model_type'] == "qwen"):
                pass
            else:
                # Unwrap the wrapper to reach the inner tokenizer object.
                tokenizer = tokenizer.tokenizer
        if (hasattr(tokenizer, "sp_model")):
            # SentencePiece path: per piece write its bytes, id and score.
            piece_size = tokenizer.sp_model.piece_size()
            fo.write(struct.pack('i', piece_size))
            for i in range(piece_size):
                s = tokenizer.sp_model.id_to_piece(i).encode()
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', i))
                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
        else:
            # Generic vocab path: constant score of 1.0 per entry.
            vocab = tokenizer.get_vocab()
            fo.write(struct.pack('i', len(vocab)))
            for v in vocab.keys():
                if (modelInfo['model_type'] == "qwen"):
                    s = v
                else:
                    # presumably vocab keys are bytes here — TODO confirm
                    s = v.decode()
                if (modelInfo["model_type"] == "moss"):
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', vocab[v]))
                fo.write(struct.pack('f', 1.0))
    else:
        # No tokenizer supplied: write an empty vocabulary.
        fo.write(struct.pack('i', 0))
    # Classify parameters so Linear weights can be quantized below.
    weight_type_dict = {}
    module_dict = {}
    for key, m in model.named_modules():
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"
    # 2. weight
    fo.write(struct.pack('i', len(dict)))
    tot = 0
    for key in dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if (cur_weight_type == 1):
            # Only Linear weights are converted to the requested dtype.
            to_data_type = fastllm_data_type_dict[dtype]
            if (to_data_type == 7):
                # float16 export: keep the numpy copy in float16 directly.
                ori_data_type = 7
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)
        # Per-tensor header: name, rank, shape.
        fo.write(struct.pack('i', len(key)))
        fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
        if (to_data_type == 3):
            write_int8(fo, cur)
        elif (to_data_type == 8):
            write_int4(fo, cur)
        else:
            fo.write(struct.pack('i', to_data_type))
            fo.write(cur.data)
        tot += 1
        print("output (", tot, "/", len(dict), end = " )\r")
    print("\nfinish.")
    fo.close()
if __name__ == "__main__":
    # Usage: python chatglm_export.py [exportPath] [dtype]
    # NOTE(review): tokenizer and model load from two different local
    # directories — confirm both point at the same ChatGLM2-6B checkout.
    tokenizer = AutoTokenizer.from_pretrained("./chatglm2-6b/", trust_remote_code=True)
    model = AutoModel.from_pretrained("./chatglm2_model/chatglm2-6b/", trust_remote_code=True)
    model = model.eval()
    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    # Bug fix: the fallback path was the literal string
    # "chatglm-6b-' + dtype + '.bin" (mismatched quotes) instead of an
    # actual concatenation with the chosen dtype.
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "chatglm-6b-" + dtype + ".bin"
    tofile(exportPath, model, tokenizer, dtype = dtype)
import argparse
from fastllm_pytools import llm
def args_parser():
    """Parse command-line arguments for the chat demo.

    Returns:
        argparse.Namespace with a single attribute ``path`` — the model file
        to load (required, given via -p/--path).
    """
    parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
    # Dropped the redundant `default=''`: argparse ignores a default when
    # the argument is required=True.
    parser.add_argument('-p', '--path', type = str, required = True,
                        help = '模型文件的路径')
    args = parser.parse_args()
    return args
if __name__ == "__main__":
    args = args_parser()
    # Load the fastllm model from the path given via -p/--path.
    model = llm.model(args.path)
    # Dialogue history as (query, response) pairs, fed back for context.
    history = []
    print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
    while True:
        query = input("\n用户:")
        # "stop" terminates the program.
        if query.strip() == "stop":
            break
        # "clear" resets the dialogue context and re-prints the banner.
        if query.strip() == "clear":
            history = []
            print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
            continue
        print("AI:", end = "");
        curResponse = "";
        # Stream chunks as they are generated for a typewriter effect.
        for response in model.stream_response(query, history = history):
            curResponse += response;
            print(response, flush = True, end = "")
        history.append((query, curResponse))
\ No newline at end of file
# Public API of the fastllm_pytools package: only the `llm` module is exported.
__all__ = ["llm"]
\ No newline at end of file
from fastllm_pytools import llm;
import torch;
import ctypes;
import numpy as np;
# Integer codes for the target storage dtype passed to the native API;
# presumably mirrors fastllm's DataType enum — TODO confirm. Unlike the
# exporter script, float32 is not offered here.
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7
}
# Weight-category codes; 111 marks ChatGLM-style pre-quantized
# QuantizedLinear layers, which are passed through with their own scales.
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2,
    "QuantizedLinear": 111
}
def create(model,
           tokenizer = None,
           pre_prompt = None,
           user_role = None,
           bot_role = None,
           history_sep = None,
           dtype = "float16"):
    """Convert an in-memory HuggingFace model into a native fastllm model.

    Streams config, vocabulary and weights through the fastllm C API
    (instead of writing a file) and returns an `llm.model` wrapper around
    the resulting native handle.

    Args:
        model: torch model to convert (moved to CPU during conversion).
        tokenizer: optional tokenizer whose vocabulary is registered.
        pre_prompt, user_role, bot_role, history_sep: optional prompt-template
            overrides stored in the model info.
        dtype: target dtype for Linear weights ("int4" / "int8" / "float16").
    """
    if (dtype not in fastllm_data_type_dict):
        print("dtype should in ", list(fastllm_data_type_dict.keys()));
        exit(0);
    # 0.1 model info
    modelInfo = model.config.__dict__
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt;
    if (user_role):
        modelInfo["user_role"] = user_role;
    if (bot_role):
        modelInfo["bot_role"] = bot_role;
    if (history_sep):
        modelInfo["history_sep"] = history_sep;
    if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan generation 2: uses ALiBi attention and fixed role tokens
        # taken from generation_config instead of a textual prompt template.
        modelInfo["use_alibi"] = "1";
        modelInfo["pre_prompt"] = "";
        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
        modelInfo["history_sep"] = "";
    if (modelInfo["model_type"] == "qwen"):
        # Qwen needs its chat-delimiter token ids in the model info.
        modelInfo["im_end_id"] = tokenizer.im_end_id
        modelInfo["im_start_id"] = tokenizer.im_start_id
    # Classify parameters: plain Linear weights get quantized; ChatGLM
    # QuantizedLinear weights are already quantized and keep their bit width.
    weight_type_dict = {};
    module_dict = {};
    weight_bits = {};
    for key, m in model.named_modules():
        if (str(type(m)).find("QuantizedLinear") != -1):
            weight_type_dict[key + ".weight"] = "QuantizedLinear";
            weight_bits[key + ".weight"] = m.weight_bit_width;
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear";
            module_dict[key + ".weight"] = m;
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding";
    model = model.cpu();
    dict = model.state_dict();
    model_type = model.config.__dict__["model_type"];
    # From here on, `model` is the NATIVE handle, not the torch model.
    model = llm.fastllm_lib.create_empty_llm_model(model_type.encode());
    for it in modelInfo.keys():
        llm.fastllm_lib.add_dict_llm_model(model, str(it).encode(), str(modelInfo[it]).encode());
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
            if modelInfo["model_type"] == "qwen":
                pass
            else:
                # Unwrap the wrapper to reach the inner tokenizer object.
                tokenizer = tokenizer.tokenizer;
        if (hasattr(tokenizer, "sp_model")):
            # SentencePiece path: register piece, id and score per entry.
            piece_size = tokenizer.sp_model.piece_size();
            for i in range(piece_size):
                llm.fastllm_lib.add_tokenizer_word_llm_model(model, tokenizer.sp_model.id_to_piece(i).encode(),
                                                             i, ctypes.c_float(tokenizer.sp_model.get_score(i)));
        else:
            # Generic vocab path: constant score of 1.0 per entry.
            vocab = tokenizer.get_vocab();
            for v in vocab.keys():
                if (modelInfo["model_type"] == "moss"):
                    vv = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v];
                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, vv, vocab[v], ctypes.c_float(1.0));
                elif (modelInfo["model_type"] == "qwen"):
                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v, vocab[v], ctypes.c_float(1.0));
                else:
                    llm.fastllm_lib.add_tokenizer_word_llm_model(model, v.encode(), vocab[v], ctypes.c_float(1.0));
    # 2. weights
    tot = 0;
    for key in dict:
        ori_data_type = 0;
        ori_np_data_type = np.float32;
        cur_weight_type = 0;
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]];
        to_data_type = 0;
        if (cur_weight_type == 1):
            # Only Linear weights are converted to the requested dtype.
            to_data_type = fastllm_data_type_dict[dtype];
            if (to_data_type == 7):
                ori_data_type = 7;
                ori_np_data_type = np.float16;
        elif (cur_weight_type == 2):
            # TODO bfloat
            to_data_type = 0;
        if (cur_weight_type == 111):
            # Pre-quantized layer: pass weights plus their per-channel scales.
            llm.fastllm_lib.add_qlinear_weight_llm_model(model, key.encode(),
                                                         len(dict[key].shape),
                                                         (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                         weight_bits[key],
                                                         dict[key + "_scale"].numpy().astype(np.float32).ctypes.data_as(ctypes.c_void_p),
                                                         dict[key].numpy().ctypes.data_as(ctypes.c_void_p));
        else:
            llm.fastllm_lib.add_weight_llm_model(model, key.encode(),
                                                 len(dict[key].shape),
                                                 (ctypes.c_int * len(dict[key].shape))(*list(dict[key].shape)),
                                                 to_data_type, cur_weight_type, ori_data_type,
                                                 dict[key].numpy().astype(ori_np_data_type).ctypes.data_as(ctypes.c_void_p));
        tot += 1;
        print("convert (", tot, "/", len(dict), end = " )\r");
    print("");
    llm.fastllm_lib.init_params_llm_model(model);
    llm.fastllm_lib.warmup_llm_model(model);
    ret = llm.model("", id = model);
    return ret;
import ctypes;
import os;
from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
import platform
# Load the prebuilt native fastllm library that ships next to this file;
# only the file name differs per platform.
if platform.system() == 'Windows':
    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
else:
    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
# --- ctypes signatures for the native fastllm API ---------------------------
# NOTE(review): several declarations below assign `.argtype` (no trailing
# "s"), which ctypes silently ignores — only `.argtypes` takes effect, so
# those functions currently run without argument conversion/checking.
# Deliberately NOT corrected here: at least add_tokenizer_word_llm_model's
# declared order (…, c_float, c_int) disagrees with how it is called in
# hf_model.create (…, int id, c_float score), so enabling enforcement would
# break call sites. Fix the typos together with an audit of the call sites.
fastllm_lib.create_llm_model.argtypes = [ctypes.c_char_p]
fastllm_lib.create_llm_model.restype = ctypes.c_int
fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
                                                  ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                  ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_logits_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_float)]
fastllm_lib.fetch_response_logits_llm_model.restype = ctypes.c_int
fastllm_lib.response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_char_p,
                                               ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                               ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.response_str_llm_model.restype = ctypes.c_char_p
# NOTE(review): `.argtype` typo — ignored by ctypes.
fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p,
                                                     ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool]
fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
fastllm_lib.fetch_response_str_llm_model.restype = ctypes.c_char_p
# NOTE(review): `.argtype` typo — ignored by ctypes.
fastllm_lib.make_history_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
fastllm_lib.make_history_llm_model.restype = ctypes.c_char_p
# NOTE(review): `.argtype` typo — ignored by ctypes.
fastllm_lib.make_input_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p]
fastllm_lib.make_input_llm_model.restype = ctypes.c_char_p
# NOTE(review): `.argtype` typo AND the (c_float, c_int) tail looks swapped
# relative to call sites — verify against the native header before fixing.
fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_float, ctypes.c_int]
# NOTE(review): `.argtype` typo — ignored by ctypes.
fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
def set_cpu_threads(threads: int):
    """Set the number of CPU threads used by the native library."""
    fastllm_lib.set_cpu_threads(threads)

def get_cpu_threads() -> int:
    """Return the native library's current CPU thread count."""
    return fastllm_lib.get_cpu_threads()

def print_ins_info():
    """Have the native library print its CPU instruction-set information."""
    fastllm_lib.print_cpu_ins()

def set_cpu_kvcache(cpu_kvcache):
    """Toggle keeping the KV cache in CPU memory."""
    fastllm_lib.set_kvcache_in_cpu(ctypes.c_bool(cpu_kvcache))

def get_cpu_kvcache():
    """Return whether the KV cache is kept in CPU memory."""
    return fastllm_lib.get_kvcache_in_cpu()

def set_cpu_low_mem(low_mem):
    """Toggle the native library's low-memory CPU mode."""
    fastllm_lib.set_cpu_low_mem(ctypes.c_bool(low_mem))

def get_cpu_low_mem():
    """Return whether low-memory CPU mode is enabled."""
    return fastllm_lib.get_cpu_low_mem()
def set_device_map(device_map):
    """Tell the native library how to distribute the model across devices.

    Accepts a single device name (str), a list of names (all weight 1),
    or a dict mapping device name -> integer weight. Anything else prints
    an error and returns without calling the native API.
    """
    if (isinstance(device_map, str)):
        devices = [device_map]
        values = [1]
    elif (isinstance(device_map, list)):
        devices = [str(entry) for entry in device_map]
        values = [1 for entry in device_map]
    elif (isinstance(device_map, dict)):
        devices = [str(entry) for entry in device_map.keys()]
        values = [int(device_map[entry]) for entry in device_map.keys()]
    else:
        print("set_device_map error.")
        return
    # The native call takes all names concatenated plus a per-name length
    # array, alongside the parallel weight array.
    joined_names = ''.join(devices)
    name_lengths = [len(name) for name in devices]
    fastllm_lib.set_device_map(len(name_lengths),
                               (ctypes.c_int * len(name_lengths))(*name_lengths),
                               joined_names.encode(),
                               (ctypes.c_int * len(values))(*values))
def from_hf(model,
            tokenizer = None,
            dtype = "float16"):
    """Convert a HuggingFace model into a native fastllm model.

    Thin wrapper that delegates to fastllm_pytools.hf_model.create;
    imported lazily to avoid a hard torch/numpy dependency at import time.
    """
    from fastllm_pytools import hf_model;
    return hf_model.create(model, tokenizer, dtype = dtype);
class model:
    """Python wrapper around a native fastllm model handle.

    All inference runs inside the shared library (`fastllm_lib`); this class
    converts between Python strings/lists and the ctypes call interface.
    """
    def __init__ (self, path : str,
                  id : int = -99999):
        # `id` is a sentinel-based escape hatch: any value other than -99999
        # is adopted directly as an existing native handle (used by
        # hf_model.create); otherwise a model is loaded from `path`.
        if (id != -99999):
            self.model = id;
        else:
            self.model = fastllm_lib.create_llm_model(path.encode());
        # When True, queries are sent verbatim instead of being wrapped in
        # the model's prompt template.
        self.direct_query = False;
    def get_prompt(self,
                   query: str,
                   history: List[Tuple[str, str]] = None) -> str:
        """Build the full templated prompt for `query`, replaying `history`
        ((old_query, response) pairs) through the native template helpers."""
        if (not(history)):
            history = [];
        prompt = "";
        for i, (old_query, response) in enumerate(history):
            prompt = fastllm_lib.make_history_llm_model(self.model, prompt.encode(), i, old_query.encode(), response.encode()).decode();
        prompt = fastllm_lib.make_input_llm_model(self.model, prompt.encode(), len(history), query.encode()).decode();
        return prompt;
    def save(self, path : str):
        """Persist the native model to `path`."""
        fastllm_lib.save_llm_model(self.model, path.encode());
    def eval(self):
        # No-op; kept for drop-in API compatibility with torch modules.
        pass;
    def response_logits(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        tokenizer = None) -> str:
        """Run a single forced 1-token generation and return the first step's
        raw logits as a list of floats (length = tokenizer vocab size)."""
        prompt = query if self.direct_query else self.get_prompt(query, history);
        if (tokenizer == None):
            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                               ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1),
                                                               ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
        else:
            input = tokenizer.encode(prompt);
            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                           1, False, 1, 1, 1, 1, True);
        vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
        logits = list(range(vocab_size))
        # NOTE(review): buffer is sized vocab_size * 4 ELEMENTS — this looks
        # like a byte count (4 bytes per float) used as an element count; only
        # the first vocab_size entries are read below. Confirm against the
        # native API before shrinking.
        array = (ctypes.c_float * (vocab_size * 4))(*logits);
        ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
        out = list(array)[:vocab_size];
        # Drain remaining fetches so the native handle finishes cleanly.
        while (ret != -1):
            ret = fastllm_lib.fetch_response_logits_llm_model(self.model, handle, array);
        return out;
    def response(self,
                 query: str,
                 history: List[Tuple[str, str]] = None,
                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0) -> str:
        """Blocking variant of stream_response: returns the whole reply."""
        ret = "";
        for i in self.stream_response(query = query,
                                      history = history,
                                      max_length = max_length,
                                      do_sample = do_sample,
                                      top_p = top_p, top_k = top_k,
                                      temperature = temperature,
                                      repeat_penalty = repeat_penalty,
                                      one_by_one = True):
            ret += i;
        return ret;
    def stream_response(self,
                        query: str,
                        history: List[Tuple[str, str]] = None,
                        max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                        one_by_one = True):
        """Generator yielding the reply incrementally: each new chunk when
        `one_by_one`, otherwise the accumulated text so far."""
        prompt = query if self.direct_query else self.get_prompt(query, history);
        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(),
                                                           ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k),
                                                           ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
        res = "";
        ret = b'';
        fail_cnt = 0;
        while True:
            ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle);
            cur = "";
            # Accumulate raw bytes until they decode cleanly — a multi-byte
            # UTF-8 character may be split across fetches. Give up after 20
            # consecutive undecodable fetches.
            try:
                cur = ret.decode();
                ret = b'';
            except:
                fail_cnt += 1;
                if (fail_cnt == 20):
                    break;
                else:
                    continue;
            fail_cnt = 0;
            # "<flmeos>" is the native library's end-of-stream sentinel.
            if (cur == "<flmeos>"):
                break;
            if one_by_one:
                yield cur;
            else:
                res += cur;
                yield res;
    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0, **kwargs):
        """Token-level blocking chat using an external `tokenizer`; returns
        (response, updated_history)."""
        if (not(history)):
            history = [];
        prompt = query if self.direct_query else self.get_prompt(query, history);
        input = tokenizer.encode(prompt);
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False);
        result = [];
        # -1 from the native fetch marks end of generation.
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
            if (cur == -1):
                break;
            result.append(cur);
        response = tokenizer.decode(result);
        history = history + [(query, response)];
        return response, history;
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                    max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
                    return_past_key_values = False, **kwargs) -> str:
        """Generator variant of chat(); yields (response, history[, None]).
        `past_key_values` is accepted for API compatibility but unused —
        the third yielded element is always None."""
        if (not(history)):
            history = [];
        prompt = query if self.direct_query else self.get_prompt(query, history);
        input = tokenizer.encode(prompt);
        handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input),
                                                       max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
                                                       False);
        tokens = [];
        while True:
            cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
            if (cur == -1):
                break;
            tokens.append(cur);
        response = tokenizer.decode(tokens);
        new_history = history + [(query, response)];
        if return_past_key_values:
            yield response, new_history, None;
        else:
            yield response, new_history;
import struct
import numpy as np
import torch
def writeString(fo, s):
    """Write *s* to binary stream *fo* as a length-prefixed string.

    Note: the prefix is the character count (len(s)), while the payload is
    the UTF-8 encoding — these differ for non-ASCII text.
    """
    length_prefix = struct.pack('i', len(s))
    fo.write(length_prefix)
    fo.write(s.encode())
def writeKeyValue(fo, key, value):
    """Write a key/value pair as two consecutive length-prefixed strings."""
    for item in (key, value):
        writeString(fo, item)
# Integer codes written into the export file to tag each tensor's storage
# dtype; presumably these must match the fastllm loader's DataType enum —
# TODO confirm against the native reader.
fastllm_data_type_dict = {
    "int4": 8,
    "int8": 3,
    "float16": 7,
    "float32": 0,
}
# Integer codes for weight categories; only "linear" weights (code 1) are
# quantized to the requested dtype by tofile below.
fastllm_weight_type_dict = {
    "linear": 1,
    "embedding": 2
}
# NOTE(review): the following five lines appear to be leftover int8
# quantization scratch/debug code — they quantize a random matrix at import
# time and none of the results (v, temp, c_max, c_scale) are used afterwards.
# Candidate for deletion; kept byte-for-byte to avoid changing module-level
# side effects in this review.
v = np.random.randint(-127, 127, [10, 20]);
temp = v;
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
def write_int8(fo, v):
    """Quantize a 2-D float array to per-row symmetric int8 and write to *fo*.

    Stream layout: dtype tag (3), a zero flag, one (min, max) float pair per
    row, then the raw uint8 payload.
    """
    # Per-row absolute maximum, clipped away from zero so the scale is never 0.
    row_abs_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    scale = row_abs_max / 127.0
    quantized = (v / scale + 128.5).clip(1, 255).astype(np.uint8)
    fo.write(struct.pack('i', 3))
    fo.write(struct.pack('i', 0))
    for row in range(row_abs_max.shape[0]):
        bound = row_abs_max[row][0]
        fo.write(struct.pack('f', -bound))
        fo.write(struct.pack('f', bound))
    fo.write(quantized.data)
def write_int4(fo, v):
    """Quantize a 2-D float array to per-row 4-bit values and write to *fo*.

    Each row is mapped to the integer range [0, 15] with a scale derived from
    the row's absolute maximum; two 4-bit codes are packed per output byte
    (even columns in the high nibble), so *v* must have an even column count.

    Stream layout: dtype tag (8), a zero flag, one (min, max) float pair per
    row, then the packed uint8 payload.

    Fixes vs. original: removed a dead first assignment to c_min (it was
    immediately overwritten), and clipped the per-row maximum away from zero
    so an all-zero row cannot yield a zero scale (division by zero / NaN
    payload) — the same guard write_int8 already uses.
    """
    c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
    c_scale = c_max / 7.0
    # Asymmetric range: codes 0..15 represent [-8*scale, +7*scale].
    c_min = c_scale * -8.0
    v = (v - c_min) / c_scale
    v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
    # Pack two 4-bit codes per byte.
    v = v[:, 0::2] * 16 + v[:, 1::2]
    fo.write(struct.pack('i', 8))
    fo.write(struct.pack('i', 0))
    for i in range(c_min.shape[0]):
        fo.write(struct.pack('f', c_min[i][0]))
        fo.write(struct.pack('f', c_max[i][0]))
    fo.write(v.data)
def tofile(exportPath,
           model,
           tokenizer = None,
           pre_prompt = None,
           user_role = None,
           bot_role = None,
           history_sep = None,
           dtype = "float16"):
    """Export a HuggingFace transformers model to a fastllm binary file.

    This variant additionally handles Baichuan-2 (ALiBi, fixed role tokens)
    and Qwen (chat-delimiter token ids) model-info entries.

    Args:
        exportPath: destination path for the binary export.
        model: torch model whose config and state_dict are serialized.
        tokenizer: optional tokenizer; its vocabulary is embedded when given.
        pre_prompt, user_role, bot_role, history_sep: optional prompt-template
            overrides stored in the model-info section.
        dtype: storage dtype for Linear weights; one of the keys of
            fastllm_data_type_dict ("int4" / "int8" / "float16" / "float32").
    """
    if (dtype not in fastllm_data_type_dict):
        print("dtype should in ", list(fastllm_data_type_dict.keys()))
        exit(0)
    # NOTE(review): `dict` shadows the builtin; it holds the model state_dict.
    dict = model.state_dict()
    fo = open(exportPath, "wb")
    # 0. version id
    fo.write(struct.pack('i', 2))
    # 0.1 model info — merge config and generation_config into one table.
    modelInfo = model.config.__dict__
    if model.generation_config is not None:
        modelInfo.update(model.generation_config.__dict__)
    if ("model_type" not in modelInfo):
        print("unknown model_type.")
        exit(0)
    if (pre_prompt):
        modelInfo["pre_prompt"] = pre_prompt
    if (user_role):
        modelInfo["user_role"] = user_role
    if (bot_role):
        modelInfo["bot_role"] = bot_role
    if (history_sep):
        modelInfo["history_sep"] = history_sep
    if (modelInfo["model_type"] == "baichuan" and hasattr(model, "model") and hasattr(model.model, "get_alibi_mask")):
        # Baichuan generation 2: uses ALiBi attention and fixed role tokens
        # taken from generation_config instead of a textual prompt template.
        modelInfo["use_alibi"] = "1"
        modelInfo["pre_prompt"] = ""
        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else "";
        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
        modelInfo["history_sep"] = ""
    if modelInfo["model_type"] == "qwen":
        # Qwen needs its chat-delimiter token ids in the model info.
        modelInfo["im_end_id"] = tokenizer.im_end_id
        modelInfo["im_start_id"] = tokenizer.im_start_id
    modelInfo["tokenizer_use_score"] = "1" # vocabulary entries carry scores
    fo.write(struct.pack('i', len(modelInfo)))
    for it in modelInfo.keys():
        writeKeyValue(fo, str(it), str(modelInfo[it]))
    # 1. vocab
    if (tokenizer):
        if (hasattr(tokenizer, "tokenizer")):
            if (modelInfo['model_type'] == "qwen"):
                pass
            else:
                # Unwrap the wrapper to reach the inner tokenizer object.
                tokenizer = tokenizer.tokenizer
        if (hasattr(tokenizer, "sp_model")):
            # SentencePiece path: per piece write its bytes, id and score.
            piece_size = tokenizer.sp_model.piece_size()
            fo.write(struct.pack('i', piece_size))
            for i in range(piece_size):
                s = tokenizer.sp_model.id_to_piece(i).encode()
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', i))
                fo.write(struct.pack('f', float(tokenizer.sp_model.get_score(i))))
        else:
            # Generic vocab path: constant score of 1.0 per entry.
            vocab = tokenizer.get_vocab()
            fo.write(struct.pack('i', len(vocab)))
            for v in vocab.keys():
                if (modelInfo['model_type'] == "qwen"):
                    s = v
                else:
                    # presumably vocab keys are bytes here — TODO confirm
                    s = v.decode()
                if (modelInfo["model_type"] == "moss"):
                    s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
                fo.write(struct.pack('i', len(s)))
                for c in s:
                    fo.write(struct.pack('i', c))
                fo.write(struct.pack('i', vocab[v]))
                fo.write(struct.pack('f', 1.0))
    else:
        # No tokenizer supplied: write an empty vocabulary.
        fo.write(struct.pack('i', 0))
    # Classify parameters so Linear weights can be quantized below.
    weight_type_dict = {}
    module_dict = {}
    for key, m in model.named_modules():
        if (isinstance(m, torch.nn.Linear)):
            weight_type_dict[key + ".weight"] = "linear"
            module_dict[key + ".weight"] = m
        if (isinstance(m, torch.nn.Embedding)):
            weight_type_dict[key] = "embedding"
    # 2. weight
    fo.write(struct.pack('i', len(dict)))
    tot = 0
    for key in dict:
        ori_data_type = 0
        ori_np_data_type = np.float32
        cur_weight_type = 0
        if (key in weight_type_dict and weight_type_dict[key] in fastllm_weight_type_dict):
            cur_weight_type = fastllm_weight_type_dict[weight_type_dict[key]]
        to_data_type = 0
        if (cur_weight_type == 1):
            # Only Linear weights are converted to the requested dtype.
            to_data_type = fastllm_data_type_dict[dtype]
            if (to_data_type == 7):
                # float16 export: keep the numpy copy in float16 directly.
                ori_data_type = 7
                ori_np_data_type = np.float16
        cur = dict[key].numpy().astype(ori_np_data_type)
        # Per-tensor header: name, rank, shape.
        fo.write(struct.pack('i', len(key)))
        fo.write(key.encode())
        fo.write(struct.pack('i', len(cur.shape)))
        for i in cur.shape:
            fo.write(struct.pack('i', i))
        if (to_data_type == 3):
            write_int8(fo, cur)
        elif (to_data_type == 8):
            write_int4(fo, cur)
        else:
            fo.write(struct.pack('i', to_data_type))
            fo.write(cur.data)
        tot += 1
        print("output (", tot, "/", len(dict), end = " )\r")
    print("\nfinish.")
    fo.close()
\ No newline at end of file
from setuptools import setup, find_packages
# Packaging for fastllm_pytools: ships the Python sources plus the prebuilt
# native libraries (*.dll / *.so) found inside the package directory.
setup (
    name = "fastllm_pytools",
    version = "0.0.1",
    author = "huangyuyang",
    author_email = "ztxz16@foxmail.com",
    description = "Fastllm pytools",
    url = "https://github.com/ztxz16/fastllm",
    packages = ['fastllm_pytools'],
    # Include the platform-specific native libraries in the wheel/sdist.
    package_data = {
        '': ['*.dll', '*.so']
    }
)
import streamlit as st
from streamlit_chat import message
from fastllm_pytools import llm
import sys
# Streamlit chat UI for a fastllm model; model path is given as argv[1].
st.set_page_config(
    page_title="fastllm web demo",
    page_icon=":robot:"
)
@st.cache_resource
def get_model():
    """Load the fastllm model once and reuse it across Streamlit reruns."""
    model = llm.model(sys.argv[1])
    return model
# Conversation history lives in session state as (prompt, response) pairs.
if "messages" not in st.session_state:
    st.session_state.messages = []
# Replay the conversation so far on each rerun.
for i, (prompt, response) in enumerate(st.session_state.messages):
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        st.markdown(response)
if prompt := st.chat_input("请开始对话"):
    model = get_model()
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""
        # Stream chunks, live-updating the placeholder with a cursor marker.
        for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True):
            full_response += chunk
            message_placeholder.markdown(full_response + "▌")
        message_placeholder.markdown(full_response)
    st.session_state.messages.append((prompt, full_response))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment