Commit 56215723 authored by zhouxiang

1. Sync with the latest upstream version; 2. Add a batch inference interface; 3. Fix a memory leak; 4. Fix choppy streaming output for the llama-family models.

parent 44be91d3
@@ -4,9 +4,10 @@ from transformers.generation import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval()
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # generation length, top_p and other hyperparameters can be adjusted here
model_path = sys.argv[3] if len(sys.argv) >= 4 else "Qwen/Qwen-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, fp32=True).eval()
model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) # generation length, top_p and other hyperparameters can be adjusted here
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm"
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup (
name = "fastllm_pytools",
version = "0.0.1",
version = "0.1.0",
author = "huangyuyang",
author_email = "ztxz16@foxmail.com",
description = "Fastllm pytools",
@@ -10,6 +10,6 @@ setup (
packages = ['fastllm_pytools'],
package_data = {
'': ['*.dll', '*.so']
'': ['*.dll', '*.so', '*.dylib']
}
)
@@ -71,7 +71,7 @@ extern "C" {
DLL_EXPORT char *string_to_chars(const std::string &s) {
char *svalue = new char[s.size() + 1];
memcpy(svalue, s.data(), s.size());
svalue[s.size()] = 0;
return svalue;
}
@@ -117,6 +117,34 @@ extern "C" {
return;
}
DLL_EXPORT int token_decode(int modelId, int tokenId, int output_buffer_len, char *output_buffer) {
// Returns 0 on success; if the output buffer is too small, returns the number of bytes required, including the trailing \0
if(tokenId == -1) {
output_buffer[0] = '\0';
return 0;
}
auto model = models.GetModel(modelId);
std::string s = model->weight.tokenizer.DecodeTokens(std::vector <int> {tokenId});
if(s.length() + 1 > output_buffer_len) {
return (int)s.length() + 1;
}
memcpy(output_buffer, s.c_str(), s.length() + 1);
return 0;
}
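For reference, a minimal Python ctypes sketch of honoring the token_decode contract documented above. The library filename, model id, and initial buffer size are assumptions, not part of this commit: a return value of 0 means the buffer holds the decoded text, and a positive return value is the exact byte count to retry with.

import ctypes

lib = ctypes.CDLL("./libfastllm_tools.so")            # assumed library name
lib.token_decode.restype = ctypes.c_int
lib.token_decode.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int,
                             ctypes.POINTER(ctypes.c_char)]

def decode_token(model_id, token_id):
    size = 256                                        # initial guess; grown on demand
    while True:
        buf = ctypes.create_string_buffer(size)
        needed = lib.token_decode(model_id, token_id, size, buf)
        if needed == 0:                               # success: buffer holds the text
            return buf.value.decode("utf-8", errors="replace")
        size = needed                                 # retry with the reported size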
DLL_EXPORT int token_encode_string(int modelId, char *content, int output_buffer_len, int *output_buffer) {
// Returns the total number of encoded tokens. If the output buffer is too small, only the tokens that fit are written.
auto model = models.GetModel(modelId);
auto v = model->weight.tokenizer.Encode(content);
for (int i = 0; i < v.Count(0); i++) {
if(i >= output_buffer_len) {
break;
}
output_buffer[i] = (int)((float*)v.cpuData)[i];
}
return (int)v.Count(0);
}
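Since token_encode_string always returns the full token count, a two-pass call (size query, then fill) covers the buffer-too-small case. This sketch reuses the assumed lib handle from the previous snippet; names are illustrative only.

lib.token_encode_string.restype = ctypes.c_int
lib.token_encode_string.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int,
                                    ctypes.POINTER(ctypes.c_int)]

def encode_string(model_id, text):
    data = text.encode("utf-8")
    count = lib.token_encode_string(model_id, data, 0, None)   # pass 1: count only
    buf = (ctypes.c_int * count)()
    lib.token_encode_string(model_id, data, count, buf)        # pass 2: fill the buffer
    return list(buf)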
DLL_EXPORT void add_dict_llm_model(int modelId, char *key, char *value) {
auto model = models.GetModel(modelId);
model->weight.AddDict(key, value);
@@ -141,6 +169,11 @@ extern "C" {
return;
}
DLL_EXPORT void release_memory(int modelId) {
auto model = models.GetModel(modelId);
model->weight.ReleaseWeight();
return;
}
DLL_EXPORT void init_params_llm_model(int modelId) {
auto model = models.GetModel(modelId);
model->InitParams();
@@ -207,7 +240,8 @@ extern "C" {
DLL_EXPORT int launch_response_str_llm_model(int modelId, char *content,
int max_length, bool do_sample, float top_p, int top_k,
float temperature, float repeat_penalty, bool output_logits) {
float temperature, float repeat_penalty, bool output_logits,
int stop_token_len, int * stop_token_ids) {
auto model = models.GetModel(modelId);
std::vector <int> tokens;
auto v = model->weight.tokenizer.Encode(content);
@@ -215,6 +249,10 @@ extern "C" {
tokens.push_back((int)((float*)v.cpuData)[i]);
}
auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
for (int i = 0; i < stop_token_len; i++) {
config.stop_token_ids.insert(stop_token_ids[i]);
}
return model->LaunchResponseTokens(tokens, config);
}
@@ -227,12 +265,17 @@ extern "C" {
DLL_EXPORT int launch_response_llm_model(int modelId, int len, int *values,
int max_length, bool do_sample, float top_p, int top_k,
float temperature, float repeat_penalty, bool output_logits) {
float temperature, float repeat_penalty, bool output_logits,
int stop_token_len, int * stop_token_ids) {
std::vector <int> input;
for (int i = 0; i < len; i++) {
input.push_back(values[i]);
}
auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
for (int i = 0; i < stop_token_len; i++) {
config.stop_token_ids.insert(stop_token_ids[i]);
}
auto model = models.GetModel(modelId);
return model->LaunchResponseTokens(input, config);
}
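Both launch functions now accept an extra (stop_token_len, stop_token_ids) pair. Below is a hedged sketch of passing it from Python, reusing the assumed lib handle; the model id, token ids, and sampling values are placeholders, and the argument order simply mirrors the C signature above.

lib.launch_response_str_llm_model.restype = ctypes.c_int
lib.launch_response_str_llm_model.argtypes = [
    ctypes.c_int, ctypes.c_char_p,                               # modelId, content
    ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,   # max_length, do_sample, top_p, top_k
    ctypes.c_float, ctypes.c_float, ctypes.c_bool,               # temperature, repeat_penalty, output_logits
    ctypes.c_int, ctypes.POINTER(ctypes.c_int)]                  # stop_token_len, stop_token_ids

model_id = 0                                          # placeholder model handle
stop_ids = [151643, 151645]                           # placeholder stop token ids
stop_arr = (ctypes.c_int * len(stop_ids))(*stop_ids)
handle = lib.launch_response_str_llm_model(
    model_id, "你好".encode("utf-8"),
    2048, True, 0.8, 1, 1.0, 1.0, False,
    len(stop_ids), stop_arr)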
@@ -251,4 +294,82 @@ extern "C" {
}
return ret;
}
DLL_EXPORT char* get_llm_model_type(int modelId) {
auto model = models.GetModel(modelId);
return string_to_chars(model->model_type);
}
char** convertToCharArray(const std::vector<std::string>& strings) {
// Allocate the char** array
char** charArray = new char*[strings.size()];
// Iterate over the std::vector<std::string>
for (size_t i = 0; i < strings.size(); i++) {
// Get the current string
const std::string& str = strings[i];
// Allocate memory and copy the string contents
charArray[i] = new char[str.length() + 1];
std::strcpy(charArray[i], str.c_str());
}
return charArray;
}
DLL_EXPORT void freeCharArray(char** charArray, size_t size) {
// Free each string
for (size_t i = 0; i < size; i++) {
delete[] charArray[i];
}
// Free the char** array itself
delete[] charArray;
}
DLL_EXPORT char **response_batch_str_llm_model(int modelId, char **content, int content_size,
int max_length, bool do_sample, float top_p, int top_k,
float temperature, float repeat_penalty, bool output_logits) {
std::vector<std::string> inputs;
std::vector <std::string> outputs;
inputs.resize(content_size);
outputs.resize(content_size);
for (int i = 0; i < content_size; ++i) {
inputs[i] = content[i];
}
auto model = models.GetModel(modelId);
auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
model->ResponseBatch(inputs, outputs, NULL, config);
return convertToCharArray(outputs);
}
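A minimal sketch of calling the new batch string interface from Python, again with the assumed lib handle and placeholder model id. The returned char** owns newly allocated strings, so the matching freeCharArray must be called once the results have been copied out.

lib.response_batch_str_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
lib.response_batch_str_llm_model.argtypes = [
    ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,
    ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
    ctypes.c_float, ctypes.c_float, ctypes.c_bool]
lib.freeCharArray.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_size_t]
lib.freeCharArray.restype = None

prompts = ["你好", "介绍一下北京"]
arr = (ctypes.c_char_p * len(prompts))(*[p.encode("utf-8") for p in prompts])
out = lib.response_batch_str_llm_model(model_id, arr, len(prompts),
                                       2048, True, 0.8, 1, 1.0, 1.0, False)
answers = [out[i].decode("utf-8") for i in range(len(prompts))]   # copy before freeing
lib.freeCharArray(out, len(prompts))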
DLL_EXPORT char **response_batch_tokens_llm_model(int modelId, int batch, int* tokens_lens, int *tokens,
int max_length, bool do_sample, float top_p, int top_k,
float temperature, float repeat_penalty, bool output_logits) {
std::vector<std::vector<float>> inputTokens;
inputTokens.resize(batch);
int index = 0;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < tokens_lens[i]; j++) {
inputTokens[i].push_back(tokens[index++]);
}
}
std::vector <std::string> outputs;
auto model = models.GetModel(modelId);
auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
model->ResponseBatch(inputTokens, outputs, NULL, config);
return convertToCharArray(outputs);
}
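The token-level batch entry point expects a flattened layout: tokens_lens[i] holds the length of sample i and tokens concatenates all samples back to back. A sketch with placeholder token ids, under the same assumptions as above:

lib.response_batch_tokens_llm_model.restype = ctypes.POINTER(ctypes.c_char_p)
lib.response_batch_tokens_llm_model.argtypes = [
    ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int),
    ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
    ctypes.c_float, ctypes.c_float, ctypes.c_bool]

batch = [[1, 2, 3], [4, 5]]                           # placeholder token ids per sample
lens = (ctypes.c_int * len(batch))(*[len(b) for b in batch])
flat = [t for b in batch for t in b]
toks = (ctypes.c_int * len(flat))(*flat)
out = lib.response_batch_tokens_llm_model(model_id, len(batch), lens, toks,
                                          2048, True, 0.8, 1, 1.0, 1.0, False)
answers = [out[i].decode("utf-8") for i in range(len(batch))]
lib.freeCharArray(out, len(batch))                    # same ownership rule as above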
DLL_EXPORT void freeChars(char* charArray) {
if (charArray != nullptr) {
// Free the string memory
delete[] charArray;
}
}
};
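One ownership note worth making explicit: single strings returned by exports such as get_llm_model_type are allocated with new[] via string_to_chars, so the caller should release them with freeChars. A hedged sketch that keeps the raw pointer so it can actually be freed (a c_char_p restype would convert to bytes and drop the pointer):

lib.get_llm_model_type.restype = ctypes.c_void_p      # keep the raw address
lib.get_llm_model_type.argtypes = [ctypes.c_int]
lib.freeChars.argtypes = [ctypes.c_void_p]
lib.freeChars.restype = None

ptr = lib.get_llm_model_type(model_id)
model_type = ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8")
lib.freeChars(ptr)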