Commit aefd9f11 authored by zhouxiang

fastllm inference framework for the DCU platform

import sys
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
from fastllm_pytools import torch2flm
if __name__ == "__main__":
    # Export the HuggingFace checkpoint to fastllm's .flm format.
    tokenizer = AutoTokenizer.from_pretrained("models/llama7b/nsql-llama-2-7b")
    model = AutoModelForCausalLM.from_pretrained("models/llama7b/nsql-llama-2-7b")
    model.config.model_type = "nsql-llama-2-7b"
    dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
    exportPath = sys.argv[1] if len(sys.argv) >= 2 else "nsql-llama-2-7b-" + dtype + ".flm"
    torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from fastllm_pytools import torch2flm
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True, fp32=True).eval()
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "qwen-7b-" + dtype + ".flm"
torch2flm.tofile(exportPath, model, tokenizer, dtype = dtype)
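Both export scripts above produce a fastllm .flm file that can be loaded back with the fastllm_pytools runtime. The following is a minimal sketch, assuming stream_response takes the same (prompt, history, one_by_one) arguments used by the web demo below; the file name is the default exportPath from the Qwen script:

# Minimal sketch: load an exported .flm model and stream a reply to stdout.
# "qwen-7b-float16.flm" is the default exportPath produced above.
from fastllm_pytools import llm

model = llm.model("qwen-7b-float16.flm")
history = []  # list of (prompt, response) pairs, same shape as in the web demo below
for chunk in model.stream_response("Hello", history, one_by_one = True):
    print(chunk, end = "", flush = True)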
from setuptools import setup, find_packages
setup(
    name = "fastllm_pytools",
    version = "0.0.1",
    author = "huangyuyang",
    author_email = "ztxz16@foxmail.com",
    description = "Fastllm pytools",
    url = "https://github.com/ztxz16/fastllm",
    packages = ['fastllm_pytools'],
    package_data = {
        '': ['*.dll', '*.so']
    }
)
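A brief, hedged packaging note: the package_data globs above bundle any compiled *.dll / *.so files placed inside the fastllm_pytools package directory (for example a library built from the C API file below), so after copying that library in, the package can be installed with a plain pip install . run next to this setup.py.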
import streamlit as st
from streamlit_chat import message
from fastllm_pytools import llm
import sys
st.set_page_config(
    page_title="fastllm web demo",
    page_icon=":robot:"
)

@st.cache_resource
def get_model():
    # The model path is taken from the command line (see the usage note below).
    model = llm.model(sys.argv[1])
    return model

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for i, (prompt, response) in enumerate(st.session_state.messages):
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        st.markdown(response)

if prompt := st.chat_input("Start the conversation"):
    model = get_model()
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""
        # Stream the reply chunk by chunk and render it incrementally.
        for chunk in model.stream_response(prompt, st.session_state.messages, one_by_one = True):
            full_response += chunk
            message_placeholder.markdown(full_response + "▌")
        message_placeholder.markdown(full_response)
    st.session_state.messages.append((prompt, full_response))
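A short usage note (the file name web_demo.py is an assumption; use whatever this script is saved as): the demo reads the model path from sys.argv[1], so it is typically launched as streamlit run web_demo.py -- qwen-7b-float16.flm, with the arguments after -- forwarded to the script instead of being consumed by streamlit itself.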
//
// Created by huangyuyang on 6/27/23.
//
#include "model.h"
#include <cstring>
#ifdef WIN32
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT
#endif
extern "C" {
DLL_EXPORT void print_cpu_ins() {
    fastllm::PrintInstructionInfo();
}

DLL_EXPORT void set_cpu_threads(int threads) {
    fastllm::SetThreads(threads);
}

DLL_EXPORT int get_cpu_threads() {
    return fastllm::GetThreads();
}

DLL_EXPORT void set_cpu_low_mem(bool low) {
    fastllm::SetLowMemMode(low);
}

DLL_EXPORT bool get_cpu_low_mem(bool low) {
    return fastllm::GetLowMemMode();
}

DLL_EXPORT void set_kvcache_in_cpu(bool in) {
    fastllm::SetKVCacheInCPU(in);
}

DLL_EXPORT bool get_kvcache_in_cpu() {
    return fastllm::GetKVCacheInCPU();
}

DLL_EXPORT void set_device_map(int device_cnt, int *lens, char *devices, int *values) {
    std::map <std::string, int> deviceMap;
    int cur = 0;
    for (int i = 0; i < device_cnt; i++) {
        std::string key = "";
        for (int j = 0; j < lens[i]; j++) {
            key += devices[cur++];
        }
        deviceMap[key] = values[i];
    }
    fastllm::SetDeviceMap(deviceMap);
}
// Global registry that owns created models; callers address them by integer handle.
DLL_EXPORT struct ModelManager {
    std::mutex locker;
    std::map <int, std::unique_ptr<fastllm::basellm> > models;

    fastllm::basellm *GetModel(int handle) {
        locker.lock();
        auto ret = models[handle].get();
        locker.unlock();
        return ret;
    }
};

static ModelManager models;

// Copy a std::string into a heap buffer that can be handed back across the C ABI.
DLL_EXPORT char *string_to_chars(const std::string &s) {
    char *svalue = new char[s.size() + 1];
    memcpy(svalue, s.data(), s.size());
    svalue[s.size()] = 0;
    return svalue;
}
// Build a GenerationConfig from the flat C arguments; top_p / top_k only take effect when sampling is requested.
DLL_EXPORT fastllm::GenerationConfig make_config(int max_length, bool do_sample, float top_p, int top_k,
                                                 float temperature, float repeat_penalty, bool output_logits) {
    fastllm::GenerationConfig config;
    config.output_token_limit = max_length;
    config.temperature = temperature;
    config.repeat_penalty = repeat_penalty;
    if (do_sample) {
        config.top_p = top_p;
        config.top_k = top_k;
    }
    config.output_logits = output_logits;
    return config;
}

DLL_EXPORT int create_llm_model(char *path) {
    models.locker.lock();
    int id = models.models.size();
    models.models[id] = fastllm::CreateLLMModelFromFile(path);
    models.locker.unlock();
    return id;
}

DLL_EXPORT int create_empty_llm_model(char *type) {
    models.locker.lock();
    int id = models.models.size();
    models.models[id] = fastllm::CreateEmptyLLMModel(type);
    models.locker.unlock();
    return id;
}
DLL_EXPORT int get_tokenizer_vocab_size(int modelId) {
    auto model = models.GetModel(modelId);
    int ret = model->weight.tokenizer.tokenToStringDict.size();
    return ret;
}

DLL_EXPORT void add_tokenizer_word_llm_model(int modelId, char *key, int tokenId, float score) {
    auto model = models.GetModel(modelId);
    model->weight.AddTokenizerWord(key, tokenId, score);
    return;
}

DLL_EXPORT void add_dict_llm_model(int modelId, char *key, char *value) {
    auto model = models.GetModel(modelId);
    model->weight.AddDict(key, value);
    return;
}

DLL_EXPORT void add_adapter_dict_llm_model(int modelId, char *adapterName, char *key, char *value) {
    auto model = models.GetModel(modelId);
    model->weight.AddAdapterDict(adapterName, key, value);
    return;
}

DLL_EXPORT void set_adapter(int modelId, char *name) {
    auto model = models.GetModel(modelId);
    model->SetAdapter(name);
    return;
}

DLL_EXPORT void disable_adapter(int modelId, char *name) {
    auto model = models.GetModel(modelId);
    model->DisableAdapter();
    return;
}

DLL_EXPORT void init_params_llm_model(int modelId) {
    auto model = models.GetModel(modelId);
    model->InitParams();
    return;
}

DLL_EXPORT void warmup_llm_model(int modelId) {
    auto model = models.GetModel(modelId);
    model->WarmUp();
    return;
}

DLL_EXPORT void save_llm_model(int modelId, char *path) {
    auto model = models.GetModel(modelId);
    model->SaveModel(path);
    return;
}
DLL_EXPORT void add_weight_llm_model(int modelId, char *key, int dimsLen, void *dimsData,
                                     int dataType, int weightType, int oriDataType, void *oriData) {
    auto model = models.GetModel(modelId);
    std::vector <int> dims = std::vector <int> (dimsLen);
    for (int i = 0; i < dims.size(); i++) {
        dims[i] = ((int*)dimsData)[i];
    }
    model->weight.AddWeight(key, dims,
                            (fastllm::DataType)dataType,
                            (fastllm::WeightType)weightType,
                            (fastllm::DataType)oriDataType,
                            (uint8_t*)oriData);
    return;
}

DLL_EXPORT void add_qlinear_weight_llm_model(int modelId, char *key, int dimsLen, void *dimsData,
                                             int bit, void *scales, void *oriData) {
    auto model = models.GetModel(modelId);
    std::vector <int> dims = std::vector <int> (dimsLen);
    for (int i = 0; i < dims.size(); i++) {
        dims[i] = ((int*)dimsData)[i];
    }
    model->weight.AddQLinearWeight(key, dims, bit, (float*)scales, (uint8_t*)oriData);
    return;
}

DLL_EXPORT char *make_input_llm_model(int modelId, char *history, int round, char *input) {
    auto model = models.GetModel(modelId);
    char *ret = string_to_chars(model->MakeInput(history, round, input));
    return ret;
}

DLL_EXPORT char *make_history_llm_model(int modelId, char *history, int round, char *input, char *output) {
    auto model = models.GetModel(modelId);
    return string_to_chars(model->MakeHistory(history, round, input, output));
}
DLL_EXPORT char *response_str_llm_model(int modelId, char *content,
                                        int max_length, bool do_sample, float top_p, int top_k,
                                        float temperature, float repeat_penalty, bool output_logits) {
    auto model = models.GetModel(modelId);
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    std::string s = model->Response(content, nullptr, config);
    return string_to_chars(s);
}

DLL_EXPORT int launch_response_str_llm_model(int modelId, char *content,
                                             int max_length, bool do_sample, float top_p, int top_k,
                                             float temperature, float repeat_penalty, bool output_logits) {
    auto model = models.GetModel(modelId);
    std::vector <int> tokens;
    auto v = model->weight.tokenizer.Encode(content);
    for (int i = 0; i < v.Count(0); i++) {
        tokens.push_back((int)((float*)v.cpuData)[i]);
    }
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    return model->LaunchResponseTokens(tokens, config);
}

// Fetch the next decoded piece of an asynchronous response; FetchResponseTokens
// returns -1 when generation has finished, which is mapped to the "<flmeos>" sentinel.
DLL_EXPORT char *fetch_response_str_llm_model(int modelId, int handleId) {
    auto model = models.GetModel(modelId);
    int ret = model->FetchResponseTokens(handleId);
    std::string s = (ret == -1 ? "<flmeos>" : model->weight.tokenizer.DecodeTokens(std::vector <int> {ret}));
    return string_to_chars(s);
}

DLL_EXPORT int launch_response_llm_model(int modelId, int len, int *values,
                                         int max_length, bool do_sample, float top_p, int top_k,
                                         float temperature, float repeat_penalty, bool output_logits) {
    std::vector <int> input;
    for (int i = 0; i < len; i++) {
        input.push_back(values[i]);
    }
    auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits);
    auto model = models.GetModel(modelId);
    return model->LaunchResponseTokens(input, config);
}

DLL_EXPORT int fetch_response_llm_model(int modelId, int handleId) {
    auto model = models.GetModel(modelId);
    return model->FetchResponseTokens(handleId);
}

DLL_EXPORT int fetch_response_logits_llm_model(int modelId, int handleId, float *logits) {
    auto model = models.GetModel(modelId);
    std::vector <float> retLogits;
    int ret = model->FetchResponseLogits(handleId, retLogits);
    if (ret != -1) {
        memcpy(logits, retLogits.data(), retLogits.size() * sizeof(float));
    }
    return ret;
}
};
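These exports are the C surface that the fastllm_pytools Python wrapper talks to. A minimal ctypes sketch of that calling pattern, using only the signatures declared above; the shared-library file name, the model path and all generation parameter values here are assumptions:

# Hypothetical sketch: drive the C API above directly via ctypes.
# The exported symbols and the "<flmeos>" end-of-stream sentinel come from the C code above.
import ctypes

lib = ctypes.cdll.LoadLibrary("./libfastllm_tools.so")  # assumed library name
lib.create_llm_model.argtypes = [ctypes.c_char_p]
lib.launch_response_str_llm_model.argtypes = [
    ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float,
    ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
lib.fetch_response_str_llm_model.restype = ctypes.c_char_p

lib.set_cpu_threads(4)
model_id = lib.create_llm_model("qwen-7b-float16.flm".encode())
handle = lib.launch_response_str_llm_model(
    model_id, "Hello".encode(), 256, True, 0.8, 1, 1.0, 1.0, False)
while True:
    piece = lib.fetch_response_str_llm_model(model_id, handle).decode(errors = "ignore")
    if piece == "<flmeos>":  # end-of-stream sentinel emitted by fetch_response_str_llm_model
        break
    print(piece, end = "", flush = True)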
//
// Created by huangyuyang on 5/13/23.
//
#include <iostream>
#include <cstdlib>
#include "model.h"
struct QuantConfig {
    std::string path;   // input model file path
    std::string output; // output file path
    int bits;           // quantization bit width
};

void Usage() {
    std::cout << "Usage:" << std::endl;
    std::cout << "[-h|--help]: show this help message" << std::endl;
    std::cout << "<-p|--path> <args>: path of the input model file" << std::endl;
    std::cout << "<-b|--bits> <args>: quantization bit width, 4 = int4, 8 = int8, 16 = fp16" << std::endl;
    std::cout << "<-o|--output> <args>: path of the output file" << std::endl;
}
void ParseArgs(int argc, char **argv, QuantConfig &config) {
    std::vector <std::string> sargv;
    for (int i = 0; i < argc; i++) {
        sargv.push_back(std::string(argv[i]));
    }
    for (int i = 1; i < argc; i++) {
        if (sargv[i] == "-h" || sargv[i] == "--help") {
            Usage();
            exit(0);
        } else if (sargv[i] == "-p" || sargv[i] == "--path") {
            config.path = sargv[++i];
        } else if (sargv[i] == "-b" || sargv[i] == "--bits") {
            config.bits = atoi(sargv[++i].c_str());
        } else if (sargv[i] == "-o" || sargv[i] == "--output") {
            config.output = sargv[++i];
        } else if (sargv[i] == "-m" || sargv[i] == "--model") {
            i++; // the model-type argument is accepted but its value is ignored
        } else {
            Usage();
            exit(-1);
        }
    }
}

int main(int argc, char **argv) {
    QuantConfig config;
    ParseArgs(argc, argv, config);
    auto model = fastllm::CreateLLMModelFromFile(config.path);
    model->SaveLowBitModel(config.output, config.bits);
    return 0;
}
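For reference, the quantizer above is driven entirely by the flags printed in Usage(); an invocation would look something like ./quant -p model-fp16.flm -b 8 -o model-int8.flm, where the binary name and both file names are placeholders and -b selects int4, int8 or fp16 output as listed above.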