Commit 56215723 authored by zhouxiang

1. Synced with the latest upstream version; 2. Added a batch inference interface; 3. Fixed a memory leak; 4. Fixed choppy streaming output for the llama family of models.

parent 44be91d3
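For context, the new batch inference interface can be driven from the Python bindings roughly as follows. This is a minimal sketch based on the `batch_response` usage shown in the examples further down; the `.flm` model path is a placeholder.

```python
import fastllm

# Placeholder model path; any converted .flm model should work here.
model = fastllm.create_llm("chatglm-6b-int8.flm")

config = fastllm.GenerationConfig()
config.max_length = 256

prompts = ["Hello", "你好"]
# batch_response processes the prompts together and yields one reply per prompt;
# passing None skips the streaming callback.
for i, reply in enumerate(model.batch_response(prompts, None, config)):
    print(f"({i + 1}/{len(prompts)}) {prompts[i]} -> {reply}")
```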
...@@ -2,11 +2,11 @@
<html lang="zh-CN">
<meta name="viewport" content="width=device-width,initial-scale=1" />
-<meta name="description" content="Fastllm Web Interface" />
+<meta name="description" content="Chat Web Interface" />
<head>
<meta charset="utf-8">
-<title>Fastllm Web Interface</title>
+<title>Chat Web Interface</title>
<style>
* {
box-sizing: border-box;
...@@ -693,7 +693,7 @@
<div class="button minimize"></div>
<div class="button maximize"></div>
</div>
-<div class="title">Fastllm</div>
+<div class="title">Chat</div>
</div>
<div class="messages">
<div id="chatlog"></div>
...
...@@ -18,6 +18,12 @@ struct WebConfig {
int threads = 4; // number of threads to use
bool lowMemMode = false; // whether to use low-memory mode
int port = 8081; // port number
bool history = true; // whether to keep multi-turn conversation history
int max_length = 4096; // maximum output length
float repeat_penalty = 1.0f; // repetition penalty; 1.0 means no penalty
int top_k = 1; // top-k sampling
float top_p = 1.0; // top-p sampling
float temperature = 1.0; // temperature, usually 0.1 ~ 1.0; larger values give more diverse results
};
void Usage() {
...@@ -28,6 +34,7 @@ void Usage() {
std::cout << "<-t|--threads> <args>: number of threads to use" << std::endl;
std::cout << "<-l|--low>: use low-memory mode" << std::endl;
std::cout << "<--port> <args>: web server port" << std::endl;
std::cout << "<--nohistory>: disable multi-turn conversation mode" << std::endl;
}
void ParseArgs(int argc, char **argv, WebConfig &config) {
...@@ -49,6 +56,18 @@ void ParseArgs(int argc, char **argv, WebConfig &config) {
config.webPath = sargv[++i];
} else if (sargv[i] == "--port") {
config.port = atoi(sargv[++i].c_str());
} else if (sargv[i] == "--max_length") {
config.max_length = atoi(sargv[++i].c_str());
} else if (sargv[i] == "--repeat_penalty") {
config.repeat_penalty = atof(sargv[++i].c_str());
} else if (sargv[i] == "--top_k") {
config.top_k = atoi(sargv[++i].c_str());
} else if (sargv[i] == "--top_p") {
config.top_p = atof(sargv[++i].c_str());
} else if (sargv[i] == "--temperature") {
config.temperature = atof(sargv[++i].c_str());
} else if (sargv[i] == "--nohistory") {
config.history = false;
} else {
Usage();
exit(-1);
...@@ -83,7 +102,12 @@ int main(int argc, char** argv) {
session->output = "<eop>\n";
session->status = 2;
} else {
-auto prompt = model->MakeInput(session->history, session->round, input);
+std::string prompt;
if(config.history)
prompt = model->MakeInput(session->history, session->round, input);
else
prompt = model->MakeInput("", 0, input);
auto inputs = model->weight.tokenizer.Encode(prompt);
std::vector<int> tokens;
...@@ -91,7 +115,13 @@ int main(int argc, char** argv) {
tokens.push_back(((float *) inputs.cpuData)[i]);
}
-int handleId = model->LaunchResponseTokens(tokens);
+fastllm::GenerationConfig gconfig;
gconfig.output_token_limit = config.max_length;
gconfig.temperature = config.temperature;
gconfig.repeat_penalty = config.repeat_penalty;
gconfig.top_p = config.top_p;
gconfig.top_k = config.top_k;
int handleId = model->LaunchResponseTokens(tokens, gconfig);
std::vector<float> results;
while (true) {
int result = model->FetchResponseTokens(handleId);
...
...@@ -43,6 +43,9 @@ namespace fastllm {
void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
// public:
// CudaLinearOp();
// void *streams_handle = NULL;
};
class CudaSplitOp : BaseOperator {
...
...@@ -15,6 +15,7 @@ void FastllmCudaDirectFree(void *ret);
void FastllmCudaCopyFromHostToDevice(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToHost(void *dst, void *src, size_t size);
void FastllmCudaCopyFromDeviceToDevice(void *dst, void *src, size_t size);
void FastllmCudaMemcpyBetweenDevices(int dstId, void *dst, int srcId, void *src, size_t size);
void FastllmCudaMemcpy2DDeviceToDevice(void * dst, size_t dpitch, const void * src,
size_t spitch, size_t width, size_t height);
...@@ -40,7 +41,9 @@ bool FastllmCudaMatMulFloatInt8(const fastllm::Data &input, fastllm::Data &weigh
bool FastllmCudaMatMulFloatInt4(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloatInt4NoZero(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat32(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
// bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k, void* streams_handle);
bool FastllmCudaBatchMatMul(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
int input0Spatial, int input1Spatial, int outputSpatial,
int input0Stride, int input1Stride,
...@@ -70,6 +73,7 @@ bool FastllmCudaBatchMatMulBatch(void **i0s, void **i1s, void **os,
int *ns, int *ms, int *ks,
int *i0Strides, int *i1Strides, float alpha, int batch);
void FastllmCudaSetDevice(int gpu_id);
void* FastllmCreateStreams(int numStreams);
#ifdef __cplusplus
}
#endif
...@@ -19,6 +19,10 @@
#include <memory>
#include "devices/cpu/cputhreadpool.h"
#ifdef USE_SENTENCEPIECE
#include <sentencepiece_processor.h>
#endif
namespace fastllm {
void SetDeviceMap(const std::map <std::string, int> &deviceMap);
std::map <std::string, int> GetDeviceMap();
...@@ -40,6 +44,7 @@ namespace fastllm {
float temperature = 1.0; // temperature, usually 0.1 ~ 1.0; larger values give more diverse results
bool output_logits = false; // whether to return logits
bool enable_hash_id = false; // attach a hash id to the session
std::multiset <int> stop_token_ids;
bool IsSimpleGreedy() const {
...@@ -149,7 +154,7 @@ namespace fastllm {
uint8_t quantization(const float &realNumber) const {
if (type == 0) {
return (uint8_t) (std::min((double) ((1 << bit) - 1),
-std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
+(double) std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
} else {
return (uint8_t) (std::max(0.f, std::min(15.f, (realNumber - min) / scale + 0.5f)));
}
...@@ -245,7 +250,7 @@ namespace fastllm {
std::string fileName;
long long filePos;
-std::shared_ptr<FileMmap> m_file;
+std::shared_ptr<FileMmap> mapFile;
bool directMemory = false; // allocate/free memory directly, bypassing the cache
...@@ -287,6 +292,8 @@ namespace fastllm {
void PrintShape() const; // print the shape
std::vector<int> Shape() const;
void Print() const; // print the contents
void CalcWeightSum(); // compute WeightSum
...@@ -297,8 +304,8 @@ namespace fastllm {
void ToDevice(void *device);
-void set_file(std::shared_ptr<FileMmap> file) {
-    m_file = file;
+void SetMapFile(std::shared_ptr<FileMmap> file) {
+    mapFile = file;
}
};
...@@ -306,7 +313,8 @@ namespace fastllm {
enum TokenizerType {
BPE = 0,
NORMAL = 1,
-QWEN = 2
+QWEN = 2,
GLM = 3
};
struct TrieNode {
...@@ -357,6 +365,9 @@ namespace fastllm {
std::unordered_map <int, std::string> tokenToStringDict;
std::unordered_map <int, float> tokenToScoreDict;
std::unordered_map <std::string, int> stringToTokenDict;
#ifdef USE_SENTENCEPIECE
std::unique_ptr<sentencepiece::SentencePieceProcessor> spProcessor;
#endif
Tokenizer ();
...@@ -405,6 +416,8 @@ namespace fastllm {
void AddWeight(const std::string &key, const std::vector <int> &dims,
DataType dataType, WeightType weightType, DataType oriDataType, uint8_t *oriData); // insert a weight
void ReleaseWeight(); // release the memory held by all weights
void AddQLinearWeight(const std::string &key, const std::vector <int> &dims,
int bit, float *scales, uint8_t *oriData); // insert a QLinear weight; quantization rule: float value = scales * oriData
...
...@@ -46,7 +46,9 @@ namespace fastllm {
public:
basellm() {};
-~basellm() {};
+~basellm() {
+    this->weight.ReleaseWeight();
+};
virtual void LoadFromFile(const std::string &fileName); // load from file
...@@ -102,6 +104,11 @@ namespace fastllm {
RuntimeResultBatch retCb = nullptr,
const GenerationConfig &generationConfig = GenerationConfig()); // batch reply for the given inputs
virtual void ResponseBatch(std::vector<std::vector<float>> &inputTokens,
std::vector<std::string> &outputs,
RuntimeResultBatch retCb = nullptr,
const GenerationConfig &generationConfig = GenerationConfig()); // batch reply for the given input tokens
virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task; returns the assigned handleId
...@@ -148,6 +155,7 @@ namespace fastllm {
std::thread *mainLoop = nullptr;
std::mutex mainLoopLocker, dictLocker;
std::mutex resultTokenQueueLocker;
std::map <std::string, int> deviceMap;
...
...@@ -15,6 +15,8 @@ namespace fastllm {
public:
ChatGLMModel (); // constructor
virtual void InitParams(); // initialize parameter info
// inference
virtual int Forward(
const Data &inputIds,
...@@ -68,6 +70,16 @@ namespace fastllm {
private:
virtual void CausalMask(Data &data, int start) {}; // causal mask?
int mask_token_id;
int gmask_token_id;
int smask_token_id;
// int sop_token_id; //=bos_token_id
int eop_token_id;
int system_token_id;
int user_token_id;
int assistant_token_id;
int observation_token_id;
float rope = 1.0f;
};
}
...
//
// Created by huangyuyang on 5/11/23.
//
#ifndef FASTLLM_GLM_H
#define FASTLLM_GLM_H
#include "basellm.h"
#include "cmath"
#include <iostream>
namespace fastllm {
class GLMModel: public basellm {
public:
GLMModel (); // constructor
// inference
virtual int Forward(
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <float> *logits = nullptr);
std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues,
const GenerationConfig &generationConfig = GenerationConfig(),
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *retLogits = nullptr);
// build the LLM inference inputs from the given tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
virtual void InitParams();
virtual void WarmUp(); // warm up
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply
private:
float scale_attn_1;
static constexpr int eot_token_id = 50000;//<|endoftext|>
static constexpr int cls_token_id = 50002;//[CLS]
static constexpr int mask_token_id = 50003;//[MASK]
static constexpr int smask_token_id = 50008;//[sMASK]
static constexpr int gmask_token_id = 50009;//[gMASK]
};
}
#endif //FASTLLM_GLM_H
...@@ -12,6 +12,22 @@ pyfastllm is the Python API for fastllm, which makes it easier to…
## Version History
### v0.2.0 2023-10-23
- Restructured and cleaned up the code
- Added model conversion and quantization interfaces
### v0.1.5 2023-10-13
- Fixed the wheel build and install step
- Merged files and fixed imports
### v0.1.4 2023-09-12
- Fixed bugs caused by backend interface changes
- Added new ops, with support for low-level op calls
### v0.1.3 2023-07-08
- Added usage and API documentation
...@@ -31,26 +47,21 @@ pyfastllm is the Python API for fastllm, which makes it easier to…
First, fetch the pybind11 C++ dependency:
-```sh
+```shell
git submodule init
git submodule update # fetch the pybind11 dependency
```
Manual C++ build:
-```sh
+```shell
mkdir build-py
cd build-py
cmake .. -DUSE_CUDA=ON -DPY_API=ON
-make -j4
-python cli.py -p chatglm-6b-int8.bin -t 8 # consistent with the output of the cpp build
-```
-
-Build via a Python script:
-```sh
-cd pyfastllm
-python build_libs --cuda
-python cli.py -p chatglm-6b-int8.bin -t 8
+make -j
+cp fastllm*.so pyfastllm/examples/ # or place it in a directory on $PYTHONPATH
+cd ../pyfastllm/examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm # consistent with the output of the cpp build
```
### Wheel package
...@@ -59,27 +70,39 @@ python cli.py -p chatglm-6b-int8.bin -t 8
First install pybind11:
-```bash
+```shell
pip install pybind11
```
-```sh
-cd pyfastllm
-python setup.py build
-python setup.py install
-python cli.py -p chatglm-6b-int8.bin -t 8
+- GPU
+```shell
+cd pyfastllm/
+python3 setup.py build
+python3 setup.py install
+cd examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm
+```
+- CPU
+```shell
+cd pyfastllm/
+export USE_CUDA=OFF
+python3 setup.py build
+python3 setup.py install
+cd examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm -t 8
```
## Usage
### Calling from Python
-The demo folder contains several common code examples:
-demo/cli.py: example that prints the answer via a callback
-demo/cli_thread.py: multi-threaded API call example (recommended)
-demo/cli_low_api.py: low-level API example
-demo/convert_model.py: model conversion example
-demo/web_api.py, demo/web_api_client.py: FastAPI web API example
+The examples folder contains several common code samples (a minimal API sketch follows this list):
+- `examples/cli_simple.py`: example of calling the API (recommended)
+- `examples/cli_low_api.py`: low-level API example
+- `examples/convert_model.py`: model conversion example
+- `examples/web_api.py`, `examples/web_api_client.py`: FastAPI web API example
+- `examples/test_ops.py`: usage samples and tests for some ops
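A minimal sketch of the streaming token API these examples build on, condensed from `cli_simple.py`; the `.flm` model path is a placeholder:

```python
import fastllm

fastllm.set_threads(8)
model = fastllm.create_llm("chatglm-6b-int8.flm")  # placeholder model path

prompt = model.make_input("", 0, "你好")
input_ids = [int(v) for v in model.weight.tokenizer.encode(prompt).to_list()]

handle = model.launch_response(input_ids, fastllm.GenerationConfig())
answer = b""
while True:
    token = model.fetch_response(handle)
    if token == -1:  # -1 marks the end of generation
        break
    answer += model.weight.tokenizer.decode_byte([token])
print(answer.decode(errors="ignore"))
```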
### Command-line tools
...@@ -94,12 +117,12 @@ $ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path
```sh
mkdir build-py
cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd -
-cd pyfastllm/demo
+cd pyfastllm/examples
python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
```
Load testing can be done with locust (a minimal locustfile sketch follows the table). Partial results for chatglm fp16 on an A100 40G:
| Concurrency | Mean latency (s) | TP95 (s) | TP99 (s) |
|----------:|-------|------|------|
| 1 | 3.07 | 4.2 | 4.8 |
| 10 | 6.11 | 11.0 | 12.0 |
| 16 | 6.82 | 15.0 | 16.0 |
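A minimal locustfile sketch for the load test above; it assumes the `web_api.py` service is listening on port 8000 and reuses the request payload from `web_api_client.py`:

```python
# locustfile.py -- run with: locust -f locustfile.py --host http://127.0.0.1:8000
from locust import HttpUser, task, between

class ChatUser(HttpUser):
    wait_time = between(0.5, 2.0)

    @task
    def chat_stream(self):
        payload = {
            "prompt": "你好",
            "history": "",
            "max_length": 256,
            "top_k": 1,
            "top_p": 0.8,
            "temperature": 0.95,
            "repeat_penalty": 1.0,
        }
        # The endpoint streams text/event-stream; waiting for the full body
        # is enough to collect end-to-end latency statistics.
        self.client.post("/api/chat_stream", json=payload)
```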
...@@ -175,20 +198,23 @@ python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
Supported models:
| Model | Class | Notes |
| ---- | ---- | ---- |
| ChatGLM-6B | fastllm.ChatGLMModel | |
| ChatGLM2-6B | fastllm.ChatGLMModel | version is tagged in the weights |
| Moss | fastllm.MossModel | |
| Alpaca | fastllm.LlamaModel | |
| QWen | fastllm.QWenModel | |
## Roadmap (TODO)
- [x] Change response_batch's output_str function to return the answer as a return value
- [x] Optimize encoding/decoding and merge the different return types
- [ ] Interoperate with numpy and other array libraries
- [ ] Deep and shallow copies of Tensor, plus basic operator overloading
- [ ] Fix the pastKV copy bug in the low-level API
- [x] A model runtime-parameter class wrapping model path, thread count, low-memory mode, penalty factor, temperature, etc.
- [ ] Add more ops
- [ ] Add module support
...@@ -3,6 +3,7 @@ import shutil
import platform
import sys
import argparse
import glob
parser = argparse.ArgumentParser(description='build fastllm libs')
parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
...@@ -24,19 +25,23 @@ def build_libs():
os.chdir(cmake_build_dir)
# build it
cpu_num = min(os.cpu_count(), 4)
args = parser.parse_args()
if IS_WINDOWS:
-os.system('cmake -G "Ninja" -DPY_API=ON .. && ninja pyfastllm')
+os.system('cmake -G Ninja -DPY_API=ON .. && ninja pyfastllm')
elif IS_LINUX:
extra_opts = ' -DPY_API=ON '
extra_opts += ' -DUSE_CUDA=ON ' if args.cuda else ' '
-build_cmd = 'cmake ' + extra_opts + ' .. && make pyfastllm -j4'
+build_cmd = f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}"
print(build_cmd)
-os.system('cmake ' + extra_opts + ' .. && make pyfastllm -j4')
+os.system(f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}")
else:
extra_opts = '-DPY_API=ON'
-os.system('cmake ' + extra_opts + '.. && make pyfastllm -j4')
+os.system(f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}")
so_files = glob.glob("*.so", root_dir=cmake_build_dir)
for file in so_files:
shutil.copy(os.path.join(cmake_build_dir, file), os.path.join(root_dir, "pyfastllm/fastllm"))
if __name__ == '__main__':
build_libs()
\ No newline at end of file
...@@ -3,7 +3,7 @@ import sys
import platform
import logging
import argparse
-sys.path.append('./build-py')
+sys.path.append('../../build-py')
import pyfastllm # or fastllm
logging.info(f"python gcc version:{platform.python_compiler()}")
...
import fastllm
import numpy as np
def np_rms_norm(inputs, weights, eps):
channel = inputs.shape[-1]
sqrt_mean = np.sqrt(np.sum(inputs**2)/channel + eps)
return inputs / sqrt_mean *weights
def np_layer_norm(inputs, gamma, beta, axis=-1):
assert axis < len(inputs.shape), "axis should be less than inputs dims"
channel = inputs.shape[axis]
mean = np.mean(inputs, axis=axis)
var = np.var(inputs, axis=axis)
output = (inputs - mean) / var * gamma + beta
return output
def np_linear(inputs, weights, bias):
output = np.matmul(inputs, weights.T) + bias
return output
def np_softmax(inputs, axis=None):
maxv = inputs.max(axis, keepdims=True)
exp_v = np.exp(inputs - maxv)
exp_sum = np.sum(exp_v, axis=axis)
return exp_v / exp_sum
def np_silu(inputs, ):
return inputs / (1 + np.exp(-inputs))
def np_attention(q, k, v, mask=None, group=None, scale=None):
qk = np_softmax(q @ k.T * scale, axis=-1)
attn = qk @ v
return attn
def test_linear():
inputs = np.array([[1, 2]])
weight = np.array([[3, 4, 5, 5, 6, 7]]).reshape([3, 2])
bias = np.array([0, 1, 1])
np_output = np_linear(inputs, weight, bias)
print(np_output)
input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 2])
weights = fastllm.Tensor(fastllm.float32, [3, 2], [3, 4, 5, 5, 6, 7])
bias = fastllm.Tensor(fastllm.float32, [3], [0, 1, 1])
out = fastllm.ops.linear(input, weights, bias)
print(out)
def test_rms_norm():
inputs = np.array([1, 5]).reshape([1, 2])
weights = np.array([1, 3]).reshape([1, 2])
eps = 1e-6
np_out = np_rms_norm(inputs, weights, eps)
print(np_out)
input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
weights = fastllm.Tensor(fastllm.float32, [1, 2], [1, 3])
out = fastllm.Tensor()
out = fastllm.ops.rms_norm(input, weights, eps=1e-6)
print(out)
def test_silu():
inputs = np.array([1, 5]).reshape([1, 2])
output = np_softmax(inputs)
# output = np_silu(inputs)
print(output)
inputs = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
out = fastllm.ops.activation(input=inputs, activate_type="softmax")
# out = fastllm.ops.activation(input=inputs, activate_type="silu")
print(out)
def test_attention():
q = np.array([1, 2, 3, 4, 5, 6]).reshape([2, 3])
k = np.array([5, 6, 7, 8, 9, 10]).reshape([2, 3])
v = np.array([1, 1, 1, 2, 1, 3]).reshape([2, 3])
scale = 1 / np.sqrt(q.shape[-1])
output = np_attention(q, k, v, scale=scale)
print(output)
q = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 2, 3, 4, 5, 6])
k = fastllm.Tensor(fastllm.float32, [1, 2, 3], [5, 6, 7, 8, 9, 10])
v = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 1, 1, 2, 1, 3])
mask = fastllm.Tensor()
output = fastllm.ops.attention(q, k, v, mask, group=1, scale=scale, attentionType=0)
print(output)
test_attention()
test_silu()
test_linear()
test_rms_norm()
...@@ -7,7 +7,8 @@ from copy import deepcopy
import traceback
from typing import List
sys.path.append('../../build-py')
-import pyfastllm # or fastllm
+import pyfastllm
import uuid
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import threading, queue, uvicorn, json, time
...@@ -106,8 +107,8 @@ def dynamic_batch_stream_func():
def chat_stream(prompt: str, config: pyfastllm.GenerationConfig, uid:int=0, time_out=200):
-global g_model, g_msg_dict
+global g_msg_dict
-time_stamp = round(time.time() * 1000)
+time_stamp = str(uuid.uuid1())
hash_id = str(pyfastllm.std_hash(f"{prompt}time_stamp:{time_stamp}"))
thread = threading.Thread(target = batch_response_stream, args = (f"{prompt}time_stamp:{time_stamp}", config))
thread.start()
...
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
import fastllm
logging.info(f"python gcc version:{platform.python_compiler()}")
def args_parser():
parser = argparse.ArgumentParser(description='fastllm')
parser.add_argument('-m', '--model', type=int, required=False, default=0, help='Model type, default 0; 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
parser.add_argument('-p', '--path', type=str, required=True, default='', help='Path to the model file')
parser.add_argument('-t', '--threads', type=int, default=4, help='Number of threads to use')
parser.add_argument('-l', '--low', action='store_true', help='Use low-memory mode')
args = parser.parse_args()
return args
# Use this function with caution: it still has bugs and only serves as a low-level API example; do not use it in production.
def response(model, prompt_input:str, stream_output:bool=False):
gmask_token_id = 130001
bos_token_id = 130004
eos_token_id = model.eos_token_id
input_ids = model.weight.tokenizer.encode(prompt_input)
if model.model_type == "chatglm":
gmask_token_id = model.gmask_token_id
bos_token_id = model.bos_token_id
gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
input_ids = fastllm.cat([gmask_bos, input_ids], 0)
seq_len = input_ids.count(0)
vmask = [0] * (seq_len * seq_len)
vpids = [0] * (seq_len * 2)
for i in range(seq_len-1):
vmask[i*seq_len + seq_len -1] = 1
vpids[i] = i
vpids[seq_len - 1] = seq_len - 2
vpids[seq_len * 2 - 1] = 1
attention_mask = fastllm.Tensor(fastllm.float32, [seq_len, seq_len], vmask)
position_ids = fastllm.Tensor(fastllm.float32, [2, seq_len], vpids)
pastKeyValues = []
for _ in range(model.block_cnt):
pastKeyValues.append([fastllm.Tensor(fastllm.float32), fastllm.Tensor(fastllm.float32)])
ret_str = ""
ret_len = 1
mask_ids = -1
output_tokens = []
penalty_factor = fastllm.Tensor()
while len(output_tokens) < 2048: # config.max_seq_len
ret, pastKeyValues = model.forward(input_ids, attention_mask, position_ids, penalty_factor, pastKeyValues)
if ret == eos_token_id:
break
output_tokens.append(ret)
cur_str = model.weight.tokenizer.decode(fastllm.Tensor(fastllm.float32, [len(output_tokens)], output_tokens))
ret_str += cur_str
print(cur_str, end="")
sys.stdout.flush()
if stream_output:
yield cur_str
ret_len += 1
output_tokens = []
if mask_ids == -1:
mask_ids = seq_len - 2
input_ids = fastllm.Tensor(fastllm.float32, [1, 1], [ret])
attention_mask = fastllm.Tensor()
position_ids = fastllm.Tensor(fastllm.float32, [2, 1], [mask_ids, ret_len])
print()
return ret_str
def run_with_low_level(args):
model_path = args.path
llm_type = fastllm.get_llm_type(model_path)
print(f"llm model: {llm_type}")
model = fastllm.create_llm(model_path)
prompt = ""
while prompt != "stop":
prompt = input("User: ")
outputs = response(model, prompt_input=model.make_input("", 0, prompt))
for output in outputs:
print(output)
sys.stdout.flush()
if __name__ == "__main__":
args = args_parser()
run_with_low_level(args)
# -*- coding: utf-8 -*-
import sys, os
import platform
import logging
import argparse
import fastllm
logging.info(f"python gcc version:{platform.python_compiler()}")
def args_parser():
parser = argparse.ArgumentParser(description='fastllm')
parser.add_argument('-m', '--model', type=int, required=False, default=0, help='Model type, default 0; 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
parser.add_argument('-p', '--path', type=str, required=True, default='', help='Path to the model file')
parser.add_argument('-t', '--threads', type=int, default=4, help='Number of threads to use')
parser.add_argument('-l', '--low', action='store_true', help='Use low-memory mode')
args = parser.parse_args()
return args
def response(model, prompt_input:str, stream_output:bool=False):
input_ids = model.weight.tokenizer.encode(prompt_input)
input_ids = input_ids.to_list()
input_ids = [int(v) for v in input_ids]
if model.model_type == "chatglm":
input_ids = [model.gmask_token_id, model.bos_token_id] + input_ids
# print(input_ids)
handle = model.launch_response(input_ids, fastllm.GenerationConfig())
continue_token = True
ret_byte = b""
ret_str = ""
while continue_token:
resp_token = model.fetch_response(handle)
continue_token = (resp_token != -1)
content = model.weight.tokenizer.decode_byte([resp_token])
ret_byte += content
ret_str = ret_byte.decode(errors='ignore')
if stream_output:
yield ret_str
return ret_str
def run_with_response(args):
model_path = args.path
OLD_API = False
if OLD_API:
model = fastllm.ChatGLMModel()
model.load_weights(model_path)
model.warmup()
else:
fastllm.set_threads(args.threads)
fastllm.set_low_memory(args.low)
if not os.path.exists(model_path):
print(f"Model file {args.path} does not exist!")
exit(-1)
model = fastllm.create_llm(model_path)
print(f"llm model: {model.model_type}")
print(f"Welcome to the {model.model_type} model. Type a message to chat, 'reset' to clear the history, 'stop' to quit.")
input_text = ""
history = ""
dialog_round = 0
while input_text != "stop":
input_text = input("User: ")
if 'stop' == input_text:
break
if 'reset' == input_text:
history = ''
continue
prompt = model.make_input(history, dialog_round, input_text)
outputs = response(model, prompt_input=prompt, stream_output=True)
print(f"{model.model_type}:", end=' ')
past_len = 0
for output in outputs:
print(output[past_len:], end='', flush=True)
past_len = len(output)
print()
history = model.make_history(history, dialog_round, input_text, output)
dialog_round += 1
def run_with_callback(args):
model_path = args.path
OLD_API = False
LLM_TYPE = ""
if OLD_API:
model = fastllm.ChatGLMModel()
model.load_weights(model_path)
model.warmup()
else:
fastllm.set_threads(args.threads)
fastllm.set_low_memory(args.low)
if not os.path.exists(model_path):
print(f"Model file {args.path} does not exist!")
exit(-1)
LLM_TYPE = fastllm.get_llm_type(model_path)
model = fastllm.create_llm(model_path)
def print_back(idx:int, content: bytearray):
content = content.decode(encoding="utf-8", errors="replace")
if idx >= 0:
print(f"\r{LLM_TYPE}:{content}", end='', flush=True)
elif idx == -1:
print()
sys.stdout.flush()
print(f"Welcome to the {LLM_TYPE} model. Type a message to chat, 'reset' to clear the history, 'stop' to quit.")
prompt = ""
while prompt != "stop":
prompt = input("User: ")
config = fastllm.GenerationConfig()
model.response(model.make_input("", 0, prompt), print_back, config)
print()
sys.stdout.flush()
if __name__ == "__main__":
args = args_parser()
# run_with_callback(args)
run_with_response(args)
import sys
from transformers import AutoTokenizer, AutoModel
import fastllm
def export():
model_path = '/public/Models/chatglm-6b' # only fp32 model loading is supported
export_path = "chatglm-6b-fp32.flm"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
fastllm.utils.convert(model=model, tokenizer=tokenizer, output_path=export_path, verbose=True)
def response(model, prompt_input:str, stream_output:bool=False):
gmask_token_id = 130001
bos_token_id = 130004
input_ids = model.weight.tokenizer.encode(prompt_input)
input_ids = input_ids.to_list()
input_ids.extend([gmask_token_id, bos_token_id])
input_ids = [int(v) for v in input_ids]
handle = model.launch_response(input_ids)
continue_token = True
ret_byte = b""
ret_str = ""
while continue_token:
resp_token = model.fetch_response(handle)
continue_token = (resp_token != -1)
content = model.weight.tokenizer.decode_byte([resp_token])
ret_byte += content
ret_str = ret_byte.decode(errors='ignore')
if stream_output:
yield ret_str
return ret_str
def infer():
model_path = "chatglm-6b-fp32.flm"
model = fastllm.create_llm(model_path)
prompt = "你好"
outputs = response(model, prompt_input=prompt, stream_output=True)
for output in outputs:
print('\r LLM:' + output, end='', flush=True)
print()
if __name__ == "__main__":
# export()
infer()
\ No newline at end of file
import pytest
import numpy as np
import fastllm
def np_rms_norm(inputs, weights, eps):
channel = inputs.shape[-1]
sqrt_mean = np.sqrt(np.sum(inputs**2)/channel + eps)
return inputs / sqrt_mean *weights
def np_layer_norm(inputs, gamma, beta, axis=-1):
assert axis < len(inputs.shape), "axis should be less than inputs dims"
channel = inputs.shape[axis]
mean = np.mean(inputs, axis=axis)
var = np.var(inputs, axis=axis)
output = (inputs - mean) / var * gamma + beta
return output
def np_linear(inputs, weights, bias):
output = np.matmul(inputs, weights.T) + bias
return output
def np_softmax(inputs, axis=None):
maxv = inputs.max(axis, keepdims=True)
exp_v = np.exp(inputs - maxv)
exp_sum = np.sum(exp_v, axis=axis)
return exp_v / exp_sum
def np_silu(inputs, ):
return inputs / (1 + np.exp(-inputs))
def np_attention(q, k, v, mask=None, group=None, scale=None):
qk = np_softmax(q @ k.T * scale, axis=-1)
attn = qk @ v
return attn
def test_linear():
inputs = np.array([[1, 2]])
weight = np.array([[3, 4, 5, 5, 6, 7]]).reshape([3, 2])
bias = np.array([0, 1, 1])
np_output = np_linear(inputs, weight, bias)
print(np_output)
input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 2])
weights = fastllm.Tensor(fastllm.float32, [3, 2], [3, 4, 5, 5, 6, 7])
bias = fastllm.Tensor(fastllm.float32, [3], [0, 1, 1])
out = fastllm.ops.linear(input, weights, bias)
print(out)
def test_rms_norm():
inputs = np.array([1, 5]).reshape([1, 2])
weights = np.array([1, 3]).reshape([1, 2])
eps = 1e-6
np_out = np_rms_norm(inputs, weights, eps)
print(np_out)
input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
weights = fastllm.Tensor(fastllm.float32, [1, 2], [1, 3])
out = fastllm.Tensor()
out = fastllm.ops.rms_norm(input, weights, eps=1e-6)
print(out)
def test_silu():
inputs = np.array([1, 5]).reshape([1, 2])
output = np_softmax(inputs)
# output = np_silu(inputs)
print(output)
inputs = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
out = fastllm.ops.activation(input=inputs, activate_type="softmax")
# out = fastllm.ops.activation(input=inputs, activate_type="silu")
print(out)
def test_attention():
q = np.array([1, 2, 3, 4, 5, 6]).reshape([2, 3])
k = np.array([5, 6, 7, 8, 9, 10]).reshape([2, 3])
v = np.array([1, 1, 1, 2, 1, 3]).reshape([2, 3])
scale = 1 / np.sqrt(q.shape[-1])
output = np_attention(q, k, v, scale=scale)
print(output)
q = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 2, 3, 4, 5, 6])
k = fastllm.Tensor(fastllm.float32, [1, 2, 3], [5, 6, 7, 8, 9, 10])
v = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 1, 1, 2, 1, 3])
mask = fastllm.Tensor()
output = fastllm.ops.attention(q, k, v, mask, group=1, scale=scale, attentionType=0)
print(output)
if __name__ == "__main__":
test_attention()
test_silu()
test_linear()
test_rms_norm()
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
from copy import deepcopy
import traceback
from typing import List
import fastllm
import uuid
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import threading, queue, uvicorn, json, time
logging.info(f"python gcc version:{platform.python_compiler()}")
def args_parser():
parser = argparse.ArgumentParser(description='fastllm')
parser.add_argument('-m', '--model', type=int, required=False, default=0, help='Model type, default 0; 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
parser.add_argument('-p', '--path', type=str, required=True, default='', help='Path to the model file')
parser.add_argument('-t', '--threads', type=int, default=4, help='Number of threads to use')
parser.add_argument('-l', '--low', action='store_true', help='Use low-memory mode')
parser.add_argument("--max_batch_size", type=int, default=32, help="Maximum batch size for dynamic batching")
args = parser.parse_args()
return args
g_model = None
g_msg_dict = dict()
g_prompt_queue = queue.Queue(maxsize=256)
g_max_batch_size = 32
def save_msg(idx: int, content: bytes):
global g_msg_dict
content = content.decode(encoding="utf-8", errors="ignore")
hash_id_idx = content.rindex("hash_id:")
hash_id = content[hash_id_idx+8:]
content = content[:hash_id_idx].replace("<n>", "\n")
if hash_id in g_msg_dict.keys():
g_msg_dict[hash_id].put((idx, content))
else:
msg_queue = queue.Queue()
msg_queue.put((idx, content))
g_msg_dict[hash_id] = msg_queue
def save_msgs(idx: int, content_list: List[bytes]):
global g_msg_dict
for content in content_list:
content = content.decode(encoding="utf-8", errors="ignore")
hash_id_idx = content.rindex("hash_id:")
hash_id = content[hash_id_idx+8:]
content = content[:hash_id_idx].replace("<n>", "\n")
if hash_id in g_msg_dict.keys():
g_msg_dict[hash_id].put((idx, content))
else:
msg_queue = queue.Queue()
msg_queue.put((idx, content))
g_msg_dict[hash_id] = msg_queue
def response_stream(prompt: str, config: fastllm.GenerationConfig):
global model
model.response(prompt, save_msgs, config)
def batch_response_stream(prompt:str, config: fastllm.GenerationConfig):
global g_config
g_config = config
g_prompt_queue.put(prompt)
g_running_lock = threading.Lock()
g_running = False
g_config: fastllm.GenerationConfig = None
def dynamic_batch_stream_func():
global g_model, g_running_lock, g_running, g_prompt_queue, g_config, g_msg_dict
print(f"call dynamic_batch_stream_func: running: {g_running}, prompt queue size: {g_prompt_queue.qsize()}")
print(f"msg_dict size: {len(g_msg_dict)}")
batch_size_this = min(g_max_batch_size, g_prompt_queue.qsize())
if not g_running and batch_size_this>0:
g_running_lock.acquire()
g_running = True
g_running_lock.release()
batch_this = []
for _ in range(batch_size_this):
batch_this.append(g_prompt_queue.get_nowait())
print(f"batch this: {batch_size_this}, queue len: {g_prompt_queue.qsize()}")
try:
if batch_size_this > 0:
g_model.batch_response(batch_this, save_msgs, g_config)
except Exception as e:
hash_id_list = [str(fastllm.std_hash(prompt)) for prompt in batch_this]
rtn_list = [bytes(f"hash_id:{hash_id}", 'utf8') for hash_id in hash_id_list]
save_msgs(-1, rtn_list)
traceback.print_exc()
print(e)
g_running_lock.acquire()
g_running = False
g_running_lock.release()
threading.Timer(0, dynamic_batch_stream_func).start()
else:
wait_time = float(g_max_batch_size-g_prompt_queue.qsize()-batch_size_this)/g_max_batch_size*1
threading.Timer(wait_time, dynamic_batch_stream_func).start()
def chat_stream(prompt: str, config: fastllm.GenerationConfig, uid:int=0, time_out=200):
global g_msg_dict
time_stamp = str(uuid.uuid1())
hash_id = str(fastllm.std_hash(f"{prompt}time_stamp:{time_stamp}"))
thread = threading.Thread(target = batch_response_stream, args = (f"{prompt}time_stamp:{time_stamp}", config))
thread.start()
idx = 0
start = time.time()
pre_msg = ""
while idx != -1:
if hash_id in g_msg_dict.keys():
msg_queue = g_msg_dict[hash_id]
if msg_queue.empty():
time.sleep(0.1)
continue
msg_obj = msg_queue.get(block=False)
idx = msg_obj[0]
if idx != -1:
yield msg_obj[1]
else: # end flag
del g_msg_dict[hash_id]
break
pre_msg = msg_obj[1]
else:
if time.time() - start > time_out:
yield pre_msg + f"\ntime_out: {time.time() - start} seconds"
break
time.sleep(0.1)
continue
app = FastAPI()
@app.post("/api/chat_stream")
def api_chat_stream(request: dict):
#print("request.json(): {}".format(json.loads(request.body(), errors='ignore')))
data = request
prompt = data.get("prompt")
history = data.get("history", [])
round_cnt = data.get("round_cnt")
config = fastllm.GenerationConfig()
if data.get("max_length") is not None:
config.max_length = data.get("max_length")
if data.get("top_k") is not None:
config.top_k = data.get("top_k")
if data.get("top_p") is not None:
config.top_p = data.get("top_p")
if data.get("temperature") is not None:
config.temperature = data.get("temperature")
if data.get("repeat_penalty") is not None:
config.repeat_penalty = data.get("repeat_penalty")
uid = None
if data.get("uid") is not None:
uid = data.get("uid")
config.enable_hash_id = True
print(f"prompt:{prompt}")
round_idx = 0
history_str = ""
for (q,a) in history:
history_str = g_model.make_history(history_str, round_idx, q, a)
round_idx += 1
prompt = g_model.make_input(history_str, round_idx, prompt)
return StreamingResponse(chat_stream(prompt, config), media_type='text/event-stream')
@app.post("/api/batch_chat")
async def api_batch_chat(request: Request):
data = await request.json()
prompts = data.get("prompts")
print(f"{prompts} type:{type(prompts)}")
if prompts is None:
return "prompts should be list[str]"
history = data.get("history")
if history is None:
history = ""
config = fastllm.GenerationConfig()
if data.get("max_length") is not None:
config.max_length = data.get("max_length")
if data.get("top_k") is not None:
config.top_k = data.get("top_k")
if data.get("top_p") is not None:
config.top_p = data.get("top_p")
if data.get("temperature") is not None:
config.temperature = data.get("temperature")
if data.get("repeat_penalty") is not None:
config.repeat_penalty = data.get("repeat_penalty")
uid = None
if data.get("uid") is not None:
uid = data.get("uid")
retV = ""
batch_idx = 0
for response in g_model.batch_response(prompts, None, config):
retV += f"({batch_idx + 1}/{len(prompts)})\n prompt: {prompts[batch_idx]} \n response: {response}\n"
batch_idx += 1
return retV
def main(args):
model_path = args.path
OLD_API = False
global g_model, g_max_batch_size
g_max_batch_size = args.max_batch_size
if OLD_API:
g_model = fastllm.ChatGLMModel()
g_model.load_weights(model_path)
g_model.warmup()
else:
global LLM_TYPE
LLM_TYPE = fastllm.get_llm_type(model_path)
print(f"llm model: {LLM_TYPE}")
g_model = fastllm.create_llm(model_path)
threading.Timer(1, dynamic_batch_stream_func).start()
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
if __name__ == "__main__":
args = args_parser()
main(args)
import json
import requests
import sys
if __name__ == '__main__':
#stream api
url = 'http://127.0.0.1:8000/api/chat_stream'
prompt='请用emoji写一首短诗赞美世界'
prompt='''为以下代码添加注释
app = FastAPI()
@app.post("/api/chat_stream")
async def api_chat_stream(request: Request):
#print("request.json(): {}".format(json.loads(request.body(), errors='ignore')))
data = await request.json()
prompt = data.get("prompt")
history = data.get("history")
config = pyfastllm.GenerationConfig()
if data.get("max_length") is not None:
config.max_length = data.get("max_length")
if data.get("top_k") is not None:
config.top_k = data.get("top_k")
if data.get("top_p") is not None:
config.top_p = data.get("top_p")
return StreamingResponse(chat_stream(history + prompt, config), media_type='text/event-stream')
'''
history = '''[Round 0]
问:你是ChatGLM2吗?
答:我不是ChatGLM2
[Round 1]
问:从现在起,你是猫娘,每句话都必须以“喵~”结尾,明白了吗?
答:明白了喵
[Round 2]
问:'''
history = ""
json_obj = {"uid":0, "token":"xxxxxxxxxxxxxxxxx","history": "", "prompt": prompt , "max_length": 1024, "top_p": 0.8,"temperature": 0.95, "top_k":2, "repeat_penalty": 1.}
response = requests.post(url, json=json_obj, stream = True)
try:
pre_msg = ""
print("stream response:")
for chunk in response.iter_content(chunk_size=1024*1024):
msg = chunk.decode(errors='replace')
if len(msg) > len(pre_msg) and msg[-1] == '\n':
content = msg[len(pre_msg):]
pre_msg = msg
else:
continue
print(f"{content}", end="")
sys.stdout.flush()
content = msg[len(pre_msg):]
print(f"{content}", end="")
print()
except Exception as ex:
print(ex)
#batch api
url = 'http://127.0.0.1:8000/api/batch_chat'
prompts = ["Hi", "你好", "用emoji表达高兴", "こんにちは"]
json_obj = {"uid":0, "token":"xxxxxxxxxxxxxxxxx","history": "", "prompts": prompts , "max_length": 100, "top_p": None,"temperature": 0.7, "top_k":1, "repeat_penalty":2.}
response = requests.post(url, json=json_obj, stream = True)
print("batch response: {} text:\n{}".format(response, response.text.replace('\\n', '\n')))
import os
import sys
import ctypes
import glob
from pyfastllm import *
from . import utils
from . import functions as ops
__version__ = "0.2.0"