Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
from fastllm_pytools import torch2flm
if __name__ == "__main__":
    # Export the HuggingFace checkpoint to fastllm's .flm format.
    tokenizer = AutoTokenizer.from_pretrained("models/llama7b/nsql-llama-2-7b")
    model = AutoModelForCausalLM.from_pretrained("models/llama7b/nsql-llama-2-7b")
    model.config.model_type = "nsql-llama-2-7b"
    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "nsql-llama-2-7b-" + dtype + ".flm"
    torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval()
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
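Both export scripts above produce a fastllm .flm file that can be loaded back with the fastllm_pytools runtime. The following is a minimal sketch, assuming stream_response takes the same (prompt, history, one_by_one) arguments used by the web demo below; the file name is the default exportPath from the Qwen script:

# Minimal sketch: load an exported .flm model and stream a reply to stdout.
# "qwen-7b-float16.flm" is the default exportPath produced above.
from fastllm_pytools import llm

model = llm.model("qwen-7b-float16.flm")
history = []  # list of (prompt, response) pairs, same shape as in the web demo below
for chunk in model.stream_response("Hello", history, one_by_one = True):
    print(chunk, end = "", flush = True)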
from setuptools import setup, find_packages
setup(
    name = "fastllm_pytools",
    version = "0.0.1",
    author = "huangyuyang",
    author_email = "ztxz16@foxmail.com",
    description = "Fastllm pytools",
    url = "https://github.com/ztxz16/fastllm",
    packages = ['fastllm_pytools'],
    package_data = {
        '': ['*.dll', '*.so']
    }
)
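A brief, hedged packaging note: the package_data globs above bundle any compiled *.dll / *.so files placed inside the fastllm_pytools package directory (for example a library built from the C API file below), so after copying that library in, the package can be installed with a plain pip install . run next to this setup.py.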
import streamlit as st
from streamlit_chat import message
from fastllm_pytools import llm
import sys
st.set_page_config(
    page_title="fastllm web demo",
    page_icon=":robot:"
)

@st.cache_resource
def get_model():
    # The model path is taken from the command line (see the usage note below).
    model = llm.model(sys.argv[1])
    return model

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for i, (prompt, response) in enumerate(st.session_state.messages):
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        st.markdown(response)

if prompt := st.chat_input("Start the conversation"):
    model = get_model()
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""
        # Stream the reply chunk by chunk and render it incrementally.
        for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True):
            full_response += chunk
            message_placeholder.markdown(full_response + "▌")
        message_placeholder.markdown(full_response)
    st.session_state.messages.append((prompt, full_response))
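A short usage note (the file name web_demo.py is an assumption; use whatever this script is saved as): the demo reads the model path from sys.argv[1], so it is typically launched as streamlit run web_demo.py -- qwen-7b-float16.flm, with the arguments after -- forwarded to the script instead of being consumed by streamlit itself.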
//
// Created by huangyuyang on 6/27/23.
//
#include "model.h"
#include <cstring>
#ifdef WIN32
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT
#endif
extern "C" {
DLL_EXPORT void print_cpu_ins() {
    fastllm::PrintInstructionInfo();
}

DLL_EXPORT void set_cpu_threads(int threads) {
    fastllm::SetThreads(threads);
}

DLL_EXPORT int get_cpu_threads() {
    return fastllm::GetThreads();
}

DLL_EXPORT void set_cpu_low_mem(bool low) {
    fastllm::SetLowMemMode(low);
}

DLL_EXPORT bool get_cpu_low_mem(bool low) {
    return fastllm::GetLowMemMode();
}

DLL_EXPORT void set_kvcache_in_cpu(bool in) {
    fastllm::SetKVCacheInCPU(in);
}

DLL_EXPORT bool get_kvcache_in_cpu() {
    return fastllm::GetKVCacheInCPU();
}

DLL_EXPORT void set_device_map(int device_cnt, int *lens, char *devices, int *values) {
    std::map <std::string, int> deviceMap;
    int cur = 0;
    for (int i = 0; i < device_cnt; i++) {
        std::string key = "";
        for (int j = 0; j < lens[i]; j++) {
            key += devices[cur++];
        }
        deviceMap[key] = values[i];
    }
    fastllm::SetDeviceMap(deviceMap);
}
// Global registry that owns created models; callers address them by integer handle.
DLL_EXPORT struct ModelManager {
    std::mutex locker;
    std::map <int, std::unique_ptr<fastllm::basellm> > models;

    fastllm::basellm *GetModel(int handle) {
        locker.lock();
        auto ret = models[handle].get();
        locker.unlock();
        return ret;
    }
};

static ModelManager models;

// Copy a std::string into a heap buffer that can be handed back across the C ABI.
DLL_EXPORT char *string_to_chars(const std::string &s) {
    char *svalue = new char[s.size() + 1];
    memcpy(svalue, s.data(), s.size());
    svalue[s.size()] = 0;
    return svalue;
}
// Build a GenerationConfig from the flat C arguments; top_p / top_k only take effect when sampling is requested.
DLL_EXPORT fastllm::GenerationConfig make_config(int max_length, bool do_sample, float top_p, int top_k,
                                                 float temperature, float repeat_penalty, bool output_logits) {
    fastllm::GenerationConfig config;
    config.output_token_limit = max_length;
    config.temperature = temperature;
    config.repeat_penalty = repeat_penalty;
    if (do_sample) {
        config.top_p = top_p;
        config.top_k = top_k;
    }
    config.output_logits = output_logits;
    return config;
}

DLL_EXPORT int create_llm_model(char *path) {
    models.locker.lock();
    int id = models.models.size();
    models.models[id] = fastllm::CreateLLMModelFromFile(path);
    models.locker.unlock();
    return id;
}

DLL_EXPORT int create_empty_llm_model(char *type) {
    models.locker.lock();
    int id = models.models.size();
    models.models[id] = fastllm::CreateEmptyLLMModel(type);
    models.locker.unlock();
    return id;
}
DLL_EXPORT int get_tokenizer_vocab_size(int modelId) {
    auto model = models.GetModel(modelId);
    int ret = model->weight.tokenizer.tokenToStringDict.size();
    return ret;
}

DLL_EXPORT void add_tokenizer_word_llm_model(int modelId, char *key, int tokenId, float score) {
    auto model = models.GetModel(modelId);
    model->weight.AddTokenizerWord(key, tokenId, score);
    return;
}

DLL_EXPORT void add_dict_llm_model(int modelId, char *key, char *value) {
    auto model = models.GetModel(modelId);
    model->weight.AddDict(key, value);
    return;
}

DLL_EXPORT void add_adapter_dict_llm_model(int modelId, char *adapterName, char *key, char *value) {
    auto model = models.GetModel(modelId);
    model->weight.AddAdapterDict(adapterName, key, value);
    return;
}

DLL_EXPORT void set_adapter(int modelId, char *name) {
    auto model = models.GetModel(modelId);
    model->SetAdapter(name);
    return;
}

DLL_EXPORT void disable_adapter(int modelId, char *name) {
    auto model = models.GetModel(modelId);
    model->DisableAdapter();
    return;
}

DLL_EXPORT void init_params_llm_model(int modelId) {
    auto model = models.GetModel(modelId);
    model->InitParams();
    return;
}

DLL_EXPORT void warmup_llm_model(int modelId) {
    auto model = models.GetModel(modelId);
    model->WarmUp();
    return;
}

DLL_EXPORT void save_llm_model(int modelId, char *path) {
    auto model = models.GetModel(modelId);
    model->SaveModel(path);
    return;
}
DLL_EXPORT void add_weight_llm_model(int modelId, char *key, int dimsLen, void *dimsData,
                                     int dataType, int weightType, int oriDataType, void *oriData) {
    auto model = models.GetModel(modelId);
    std::vector <int> dims = std::vector <int> (dimsLen);
    for (int i = 0; i < dims.size(); i++) {
        dims[i] = ((int*)dimsData)[i];
    }
    model->weight.AddWeight(key, dims,
                            (fastllm::DataType)dataType,
                            (fastllm::WeightType)weightType,
                            (fastllm::DataType)oriDataType,
                            (uint8_t*)oriData);
    return;
}

DLL_EXPORT void add_qlinear_weight_llm_model(int modelId, char *key, int dimsLen, void *dimsData,
                                             int bit, void *scales, void *oriData) {
    auto model = models.GetModel(modelId);
    std::vector <int> dims = std::vector <int> (dimsLen);
    for (int i = 0; i < dims.size(); i++) {
        dims[i] = ((int*)dimsData)[i];
    }
    model->weight.AddQLinearWeight(key, dims, bit, (float*)scales, (uint8_t*)oriData);
    return;
}

DLL_EXPORT char *make_input_llm_model(int modelId, char *history, int round, char *input) {
    auto model = models.GetModel(modelId);
    char *ret = string_to_chars(model->MakeInput(history, round, input));
    return ret;
}

DLL_EXPORT char *make_history_llm_model(int modelId, char *history, int round, char *input, char *output) {
    auto model = models.GetModel(modelId);
    return string_to_chars(model->MakeHistory(history, round, input, output));
}
DLL_EXPORT char *response_str_llm_model(int modelId, char *content,
                                        int max_length, bool do_sample, float top_p, int top_k,
                                        float temperature, float repeat_penalty, bool output_logits) {
    auto model = models.GetModel(modelId);
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    std::string s = model->Response(content, nullptr, config);
    return string_to_chars(s);
}

DLL_EXPORT int launch_response_str_llm_model(int modelId, char *content,
                                             int max_length, bool do_sample, float top_p, int top_k,
                                             float temperature, float repeat_penalty, bool output_logits) {
    auto model = models.GetModel(modelId);
    std::vector <int> tokens;
    auto v = model->weight.tokenizer.Encode(content);
    for (int i = 0; i < v.Count(0); i++) {
        tokens.push_back((int)((float*)v.cpuData)[i]);
    }
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    return model->LaunchResponseTokens(tokens, config);
}

// Fetch the next decoded piece of an asynchronous response; FetchResponseTokens
// returns -1 when generation has finished, which is mapped to the "<flmeos>" sentinel.
DLL_EXPORT char *fetch_response_str_llm_model(int modelId, int handleId) {
    auto model = models.GetModel(modelId);
    int ret = model->FetchResponseTokens(handleId);
    std::string s = (ret == -1 ? "<flmeos>" : model->weight.tokenizer.DecodeTokens(std::vector <int> {ret}));
    return string_to_chars(s);
}

DLL_EXPORT int launch_response_llm_model(int modelId, int len, int *values,
                                         int max_length, bool do_sample, float top_p, int top_k,
                                         float temperature, float repeat_penalty, bool output_logits) {
    std::vector <int> input;
    for (int i = 0; i < len; i++) {
        input.push_back(values[i]);
    }
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    auto model = models.GetModel(modelId);
    return model->LaunchResponseTokens(input, config);
}

DLL_EXPORT int fetch_response_llm_model(int modelId, int handleId) {
    auto model = models.GetModel(modelId);
    return model->FetchResponseTokens(handleId);
}

DLL_EXPORT int fetch_response_logits_llm_model(int modelId, int handleId, float *logits) {
    auto model = models.GetModel(modelId);
    std::vector <float> retLogits;
    int ret = model->FetchResponseLogits(handleId, retLogits);
    if (ret != -1) {
        memcpy(logits, retLogits.data(), retLogits.size() * sizeof(float));
    }
    return ret;
}
};
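These exports are the C surface that the fastllm_pytools Python wrapper talks to. A minimal ctypes sketch of that calling pattern, using only the signatures declared above; the shared-library file name, the model path and all generation parameter values here are assumptions:

# Hypothetical sketch: drive the C API above directly via ctypes.
# The exported symbols and the "<flmeos>" end-of-stream sentinel come from the C code above.
import ctypes

lib = ctypes.cdll.LoadLibrary("./libfastllm_tools.so")  # assumed library name
lib.create_llm_model.argtypes = [ctypes.c_char_p]
lib.launch_response_str_llm_model.argtypes = [
    ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float,
    ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
lib.fetch_response_str_llm_model.restype = ctypes.c_char_p

lib.set_cpu_threads(4)
model_id = lib.create_llm_model("qwen-7b-float16.flm".encode())
handle = lib.launch_response_str_llm_model(
    model_id, "Hello".encode(), 256, True, 0.8, 1, 1.0, 1.0, False)
while True:
    piece = lib.fetch_response_str_llm_model(model_id, handle).decode(errors = "ignore")
    if piece == "<flmeos>":  # end-of-stream sentinel emitted by fetch_response_str_llm_model
        break
    print(piece, end = "", flush = True)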
//
// Created by huangyuyang on 5/13/23.
//
#include <iostream>
#include <cstdlib>
#include "model.h"
struct QuantConfig {
    std::string path;   // input model file path
    std::string output; // output file path
    int bits;           // quantization bit width
};

void Usage() {
    std::cout << "Usage:" << std::endl;
    std::cout << "[-h|--help]: show this help message" << std::endl;
    std::cout << "<-p|--path> <args>: path of the input model file" << std::endl;
    std::cout << "<-b|--bits> <args>: quantization bit width, 4 = int4, 8 = int8, 16 = fp16" << std::endl;
    std::cout << "<-o|--output> <args>: path of the output file" << std::endl;
}
void ParseArgs(int argc, char **argv, QuantConfig &config) {
    std::vector <std::string> sargv;
    for (int i = 0; i < argc; i++) {
        sargv.push_back(std::string(argv[i]));
    }
    for (int i = 1; i < argc; i++) {
        if (sargv[i] == "-h" || sargv[i] == "--help") {
            Usage();
            exit(0);
        } else if (sargv[i] == "-p" || sargv[i] == "--path") {
            config.path = sargv[++i];
        } else if (sargv[i] == "-b" || sargv[i] == "--bits") {
            config.bits = atoi(sargv[++i].c_str());
        } else if (sargv[i] == "-o" || sargv[i] == "--output") {
            config.output = sargv[++i];
        } else if (sargv[i] == "-m" || sargv[i] == "--model") {
            i++; // the model-type argument is accepted but its value is ignored
        } else {
            Usage();
            exit(-1);
        }
    }
}

int main(int argc, char **argv) {
    QuantConfig config;
    ParseArgs(argc, argv, config);
    auto model = fastllm::CreateLLMModelFromFile(config.path);
    model->SaveLowBitModel(config.output, config.bits);
    return 0;
}
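For reference, the quantizer above is driven entirely by the flags printed in Usage(); an invocation would look something like ./quant -p model-fp16.flm -b 8 -o model-int8.flm, where the binary name and both file names are placeholders and -b selects int4, int8 or fp16 output as listed above.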