Commit 56215723 authored by zhouxiang

1. Sync with the latest upstream version; 2. Add a batch inference interface; 3. Fix a memory leak; 4. Fix choppy streaming output in the llama-family models.

parent 44be91d3
......@@ -207,8 +207,8 @@ namespace fastllm {
RuntimeResult retCb,
const GenerationConfig &generationConfig) {
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos)? input.substr(0, pos-10):input;
size_t pos = input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input;
size_t hash_id = std::hash<std::string>{}(input);
Data inputIds = this->weight.tokenizer.Encode(prompt);
#else
......
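This hunk replaces std::string::find_last_of with rfind when stripping the trailing "time_stamp:" marker from PY_API inputs. find_last_of("time_stamp:") returns the position of the last occurrence of any single character from that set (t, i, m, e, s, a, p, _, :), not of the substring, so the old "pos - 10" arithmetic could cut the prompt at an arbitrary point; rfind locates the last occurrence of the whole marker, and comparing against -1 (std::string::npos for size_t) covers inputs without a marker. A minimal Python sketch of the intended behaviour (the helper name is illustrative, not part of fastllm):

# Mirrors the C++ change above: drop an appended "time_stamp:..." suffix
# before tokenization, but only when hash ids are enabled.
def strip_time_stamp(text, enable_hash_id):
    pos = text.rfind("time_stamp:")   # start of the whole marker, or -1 if absent
    if enable_hash_id and pos != -1:
        return text[:pos]
    return text

assert strip_time_stamp("hello time_stamp:1699999999", True) == "hello "
assert strip_time_stamp("hello", True) == "hello"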
......@@ -467,24 +467,36 @@ namespace fastllm {
positionIds.ToDevice(DataDevice::CPU);
if (index == 0) {
int seqLen = inputTokens[0].size();
std::vector<float> ids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
std::vector<int> seqLens;
seqLens.resize(batch);
int maxLen = 0;
for (int i = 0; i < batch; i++) {
maxLen = std::max(maxLen, (int) inputTokens[i].size());
seqLens[i] = (int) inputTokens[i].size();
}
int seqLen = maxLen;
std::vector<float> ids = std::vector<float>(batch * seqLen, 151643);
std::vector<float> vpids = std::vector<float>(batch * seqLen, 0);
std::vector <float> vmask = std::vector <float> (batch * seqLen * seqLen, 0);
for (int b = 0; b < batch; b++) {
for (int i = 0; i < seqLen; i++) {
ids[b * seqLen + i] = inputTokens[b][i];
auto &tokens = inputTokens[b];
int len = tokens.size(), base = maxLen - len;
for (int i = 0; i < len; i++) {
ids[b * seqLen + base + i] = inputTokens[b][i];
vpids[b * seqLen + base + i] = i;
}
}
for (int i = 0; i < seqLen; i++) {
vpids[i] = i;
for (int j = i + 1; j < seqLen; j++) {
vmask[i * seqLen + j] = 1;
std::fill(vmask.data() + b * maxLen * maxLen,
vmask.data() + b * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
for (int j = maxLen - len; j < maxLen; j++) {
std::fill(vmask.data() + b * maxLen * maxLen + j * maxLen,
vmask.data() + b * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
}
for (int j = 0; j < len; j++) {
for (int k = j + 1; k < len; k++) {
vmask[b * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
}
}
}
for (int b = 1; b < batch; b++) {
memcpy(vmask.data() + b * seqLen * seqLen, vmask.data(), seqLen * seqLen * sizeof(float));
memcpy(vpids.data() + b * seqLen, vpids.data(), seqLen * sizeof(float));
}
inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen}, ids));
attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, seqLen, seqLen}, vmask));
......
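The rewritten batch prefill no longer assumes every prompt has the same length: it left-pads each sequence to the longest one in the batch (pad id 151643), gives each sample position ids that start at 0 on its first real token, and builds one mask per sample in which 1 marks positions that must not be attended to: padded query rows, the padding columns of real rows, and the causal upper triangle. A numpy sketch of the same padding and masking scheme, for illustration only (not the fastllm implementation):

import numpy as np

def build_batch_inputs(input_tokens, pad_id=151643):
    # Left-pad every sequence to the batch maximum and build, per sample,
    # position ids and a mask where 1 means "not attendable".
    batch = len(input_tokens)
    max_len = max(len(t) for t in input_tokens)
    ids = np.full((batch, max_len), pad_id, dtype=np.float32)
    pids = np.zeros((batch, max_len), dtype=np.float32)
    mask = np.zeros((batch, max_len, max_len), dtype=np.float32)
    for b, tokens in enumerate(input_tokens):
        length = len(tokens)
        base = max_len - length                      # number of left-padding slots
        ids[b, base:] = tokens
        pids[b, base:] = np.arange(length)
        mask[b, :base, :] = 1.0                      # padded rows attend to nothing
        mask[b, base:, :base] = 1.0                  # real rows never attend to padding
        mask[b, base:, base:] = np.triu(np.ones((length, length), dtype=np.float32), k=1)
    return ids, pids, mask

For example, build_batch_inputs([[1, 2, 3], [4, 5]]) pads the second prompt with one leading 151643 and masks out its first row and first column.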
......@@ -21,8 +21,8 @@ fastllm_weight_type_dict = {
"embedding": 2
}
v = np.random.randint(-127, 127, [10, 20])
temp = v
v = np.random.randint(-127, 127, [10, 20]);
temp = v;
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
......@@ -34,23 +34,31 @@ def write_int8(fo, v):
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(struct.pack('f', -c_max[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(v.data)
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
# c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
# c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
# c_scale = c_max / 7.0
# c_min = c_scale * -8.0
c_min = np.expand_dims(v.min(axis = -1), -1)
c_max = np.expand_dims(v.max(axis = -1), -1)
c_scale = (c_max - c_min) / 15.0
c_zero = np.round(0.0 - c_min / c_scale)
c_zero = c_zero.clip(0, 15)
c_min = -c_scale * c_zero
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(struct.pack('f', c_min[i][0]));
fo.write(struct.pack('f', c_max[i][0]));
fo.write(v.data)
def tofile(exportPath,
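write_int4 switches from symmetric 4-bit quantization (scale = max|v| / 7 with an implicit zero point) to asymmetric min/max quantization: per row, scale = (max - min) / 15, a zero point rounded and clipped into [0, 15], and the stored minimum recomputed as -scale * zero so that 0.0 maps onto an exact code; two 4-bit values are then packed into each byte, high nibble first. A hedged numpy sketch of the round trip (illustrative helpers, not the exporter itself; assumes rows with non-zero spread and an even column count):

import numpy as np

def quantize_uint4(v):
    # Per-row asymmetric 4-bit quantization, matching the scheme above.
    c_min = v.min(axis=-1, keepdims=True).astype(np.float32)
    c_max = v.max(axis=-1, keepdims=True).astype(np.float32)
    scale = (c_max - c_min) / 15.0
    zero = np.clip(np.round(-c_min / scale), 0, 15)
    c_min = -scale * zero                            # shift so 0.0 is exactly representable
    q = np.clip((v - c_min) / scale + 0.5, 0, 15).astype(np.uint8)
    packed = q[:, 0::2] * 16 + q[:, 1::2]            # two codes per byte, high nibble first
    return packed, c_min, scale

def dequantize_uint4(packed, c_min, scale):
    hi, lo = packed // 16, packed % 16
    q = np.empty((packed.shape[0], packed.shape[1] * 2), dtype=np.float32)
    q[:, 0::2], q[:, 1::2] = hi, lo
    return q * scale + c_min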
......@@ -72,6 +80,8 @@ def tofile(exportPath,
fo.write(struct.pack('i', 2))
# 0.1 model info
#if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
# model.config.model_type = "chatglm3"
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
......@@ -91,13 +101,26 @@ def tofile(exportPath,
# Baichuan 2
modelInfo["use_alibi"] = "1"
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + "> ") if hasattr(model.generation_config, "user_token_id") else ""
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = ""
if (modelInfo["model_type"] == "baichuan" and modelInfo["vocab_size"] == 125696):
# Baichuan 2 7B
modelInfo["pre_prompt"] = ""
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.user_token_id) + ">") if hasattr(model.generation_config, "user_token_id") else "";
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(model.generation_config.assistant_token_id) + ">") if hasattr(model.generation_config, "assistant_token_id") else "";
modelInfo["history_sep"] = ""
if modelInfo["model_type"] == "qwen":
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
print("chatglm3")
# chatglm3
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + "> \n");
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
modelInfo["history_sep"] = "";
modelInfo["tokenizer_use_score"] = "1" # 分词带分数
......@@ -140,10 +163,10 @@ def tofile(exportPath,
for v in vocab.keys():
if (modelInfo['model_type'] == "qwen"):
s = v
elif (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
else:
s = v.encode()
if (modelInfo["model_type"] == "moss"):
s = [(ord(c) if c not in tokenizer.byte_decoder else tokenizer.byte_decoder[c]) for c in v]
fo.write(struct.pack('i', len(s)))
for c in s:
fo.write(struct.pack('i', c))
......
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
modelpath = "baichuan-inc/Baichuan2-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(modelpath, device_map="auto", torch_dtype=torch.float32, trust_remote_code=True)
# normalize lm_head
state_dict = model.state_dict()
state_dict['lm_head.weight'] = torch.nn.functional.normalize(state_dict['lm_head.weight'])
model.load_state_dict(state_dict)
try:
model.generation_config = GenerationConfig.from_pretrained(modelpath)
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan2-7b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model.to('cpu'), tokenizer, dtype=dtype)
\ No newline at end of file
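For reference, the export script above reads the output path from argv[1] and the weight dtype from argv[2] (defaulting to "float16"), and bakes Baichuan2's lm_head normalization into the exported weights before calling torch2flm.tofile. A typical invocation might look like the following; the script's file name is not shown in this diff, so the name here is only illustrative:

python baichuan2_2flm.py baichuan2-7b-int8.flm int8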
......@@ -14,5 +14,5 @@ if __name__ == "__main__":
except:
pass
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-' + dtype + '.flm"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "baichuan-13b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)