OpenDAS / fastllm — Commits

Commit 56215723, authored Jan 31, 2024 by zhouxiang
Parent: 44be91d3

Commit message: 1. Sync with the latest upstream version; 2. add a batch inference interface; 3. fix a memory leak; 4. fix choppy streaming output for the llama-series models.
Changes: 83 · this page shows 20 changed files with 991 additions and 64 deletions (+991 / −64).
```text
example/webui/web/index.html           +3   −3
example/webui/webui.cpp                +32  −2
include/devices/cuda/cudadevice.h      +3   −0
include/devices/cuda/fastllm-cuda.cuh  +4   −0
include/fastllm.h                      +18  −5
include/models/basellm.h               +9   −1
include/models/chatglm.h               +15  −3
include/models/glm.h                   +62  −0
pyfastllm/README.md                    +64  −38
pyfastllm/build_libs.py                +12  −7
pyfastllm/demo/cli.py                  +1   −1
pyfastllm/demo/test_ops.py             +93  −0
pyfastllm/demo/web_api.py              +4   −3
pyfastllm/examples/cli_low_level.py    +97  −0
pyfastllm/examples/cli_simple.py       +127 −0
pyfastllm/examples/convert_model.py    +57  −0
pyfastllm/examples/test_ops.py         +95  −0
pyfastllm/examples/web_api.py          +223 −0
pyfastllm/examples/web_api_client.py   +62  −0
pyfastllm/fastllm/__init__.py          +10  −1
```
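The commit message highlights a new batch inference interface; the `ResponseBatch` overload in `include/models/basellm.h` and the `GenerationConfig`-aware `LaunchResponseTokens` below are the C++ side, while `pyfastllm/examples/web_api.py` in this commit drives the same path through the Python binding's `batch_response`. As a rough, hedged illustration of that Python entry point (the model path and prompts are placeholders, and the call names are taken from the example files in this diff, not from separate API documentation):

```python
# Hedged sketch: driving the batch inference interface added in this commit.
# Assumes a locally built `fastllm` Python module; "chatglm-6b-int8.flm" is a placeholder path.
import fastllm

model = fastllm.create_llm("chatglm-6b-int8.flm")   # model type is detected from the .flm file

config = fastllm.GenerationConfig()
config.max_length = 256          # cap on generated length, as set by examples/web_api.py
config.temperature = 0.7

prompts = ["你好", "Explain what a KV cache is in one sentence."]

# batch_response(prompts, callback, config): with a None callback the answers are
# returned one by one, mirroring the /api/batch_chat handler in examples/web_api.py.
for i, answer in enumerate(model.batch_response(prompts, None, config)):
    print(f"[{i}] {answer}")
```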
example/webui/web/index.html

```diff
@@ -2,11 +2,11 @@
 <html lang="zh-CN">
 <meta name="viewport" content="width=device-width,initial-scale=1" />
-<meta name="description" content="Fastllm Web Interface" />
+<meta name="description" content="Chat Web Interface" />
 <head>
 <meta charset="utf-8">
-<title>Fastllm Web Interface</title>
+<title>Chat Web Interface</title>
 <style>
 * {
   box-sizing: border-box;
@@ -693,7 +693,7 @@
 <div class="button minimize"></div>
 <div class="button maximize"></div>
 </div>
-<div class="title">Fastllm</div>
+<div class="title">Chat</div>
 </div>
 <div class="messages">
 <div id="chatlog"></div>
```
example/webui/webui.cpp

```diff
@@ -18,6 +18,12 @@ struct WebConfig {
     int threads = 4;                // number of threads to use
     bool lowMemMode = false;        // whether to use low-memory mode
     int port = 8081;                // web port
+    bool history = true;            // whether to run multi-round conversations
+    int max_length = 4096;          // maximum output length
+    float repeat_penalty = 1.0f;    // repetition penalty; 1.0 means no penalty
+    int top_k = 1;                  // top_k sampling
+    float top_p = 1.0;              // top_p sampling
+    float temperature = 1.0;        // temperature, usually 0.1 ~ 1.0; larger values give more diverse results
 };

 void Usage() {
@@ -28,6 +34,7 @@ void Usage() {
     std::cout << "<-t|--threads> <args>: 使用的线程数量" << std::endl;
     std::cout << "<-l|--low>: 使用低内存模式" << std::endl;
     std::cout << "<--port> <args>: 网页端口号" << std::endl;
+    std::cout << "<--nohistory> <args>: 不采用多轮对话模式" << std::endl;
 }

 void ParseArgs(int argc, char **argv, WebConfig &config) {
@@ -49,6 +56,18 @@ void ParseArgs(int argc, char **argv, WebConfig &config) {
             config.webPath = sargv[++i];
         } else if (sargv[i] == "--port") {
             config.port = atoi(sargv[++i].c_str());
+        } else if (sargv[i] == "--max_length") {
+            config.max_length = atoi(sargv[++i].c_str());
+        } else if (sargv[i] == "--repeat_penalty") {
+            config.repeat_penalty = atof(sargv[++i].c_str());
+        } else if (sargv[i] == "--top_k") {
+            config.top_k = atoi(sargv[++i].c_str());
+        } else if (sargv[i] == "--top_p") {
+            config.top_p = atof(sargv[++i].c_str());
+        } else if (sargv[i] == "--temperature") {
+            config.temperature = atof(sargv[++i].c_str());
+        } else if (sargv[i] == "--nohistory") {
+            config.history = false;
         } else {
             Usage();
             exit(-1);
@@ -83,7 +102,12 @@ int main(int argc, char** argv) {
             session->output = "<eop>\n";
             session->status = 2;
         } else {
-            auto prompt = model->MakeInput(session->history, session->round, input);
+            std::string prompt;
+            if (config.history)
+                prompt = model->MakeInput(session->history, session->round, input);
+            else
+                prompt = model->MakeInput("", 0, input);
             auto inputs = model->weight.tokenizer.Encode(prompt);
             std::vector<int> tokens;
@@ -91,7 +115,13 @@ int main(int argc, char** argv) {
                 tokens.push_back(((float *) inputs.cpuData)[i]);
             }
-            int handleId = model->LaunchResponseTokens(tokens);
+            fastllm::GenerationConfig gconfig;
+            gconfig.output_token_limit = config.max_length;
+            gconfig.temperature = config.temperature;
+            gconfig.repeat_penalty = config.repeat_penalty;
+            gconfig.top_p = config.top_p;
+            gconfig.top_k = config.top_k;
+            int handleId = model->LaunchResponseTokens(tokens, gconfig);
             std::vector<float> results;
             while (true) {
                 int result = model->FetchResponseTokens(handleId);
```
include/devices/cuda/cudadevice.h

```diff
@@ -43,6 +43,9 @@ namespace fastllm {
         void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
         bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
         void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
+    // public:
+    //     CudaLinearOp();
+    //     void *streams_handle = NULL;
     };

     class CudaSplitOp : BaseOperator {
```
include/devices/cuda/fastllm-cuda.cuh

```diff
@@ -15,6 +15,7 @@ void FastllmCudaDirectFree(void *ret);
 void FastllmCudaCopyFromHostToDevice(void *dst, void *src, size_t size);
 void FastllmCudaCopyFromDeviceToHost(void *dst, void *src, size_t size);
 void FastllmCudaCopyFromDeviceToDevice(void *dst, void *src, size_t size);
 void FastllmCudaMemcpyBetweenDevices(int dstId, void *dst, int srcId, void *src, size_t size);
 void FastllmCudaMemcpy2DDeviceToDevice(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height);
@@ -40,7 +41,9 @@ bool FastllmCudaMatMulFloatInt8(const fastllm::Data &input, fastllm::Data &weigh
 bool FastllmCudaMatMulFloatInt4(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
 bool FastllmCudaMatMulFloatInt4NoZero(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
 bool FastllmCudaMatMulFloat32(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
 bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
+// bool FastllmCudaMatMulFloat16(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k, void* streams_handle);
 bool FastllmCudaBatchMatMul(const fastllm::Data &input0, const fastllm::Data &input1, fastllm::Data &output,
                             int input0Spatial, int input1Spatial, int outputSpatial,
                             int input0Stride, int input1Stride,
@@ -70,6 +73,7 @@ bool FastllmCudaBatchMatMulBatch(void **i0s, void **i1s, void **os,
                                  int *ns, int *ms, int *ks,
                                  int *i0Strides, int *i1Strides, float alpha, int batch);
 void FastllmCudaSetDevice(int gpu_id);
+void *FastllmCreateStreams(int numStreams);

 #ifdef __cplusplus
 }
 #endif
```
include/fastllm.h

```diff
@@ -19,6 +19,10 @@
 #include <memory>
 #include "devices/cpu/cputhreadpool.h"

+#ifdef USE_SENTENCEPIECE
+#include <sentencepiece_processor.h>
+#endif

 namespace fastllm {
     void SetDeviceMap(const std::map<std::string, int> &deviceMap);
     std::map<std::string, int> GetDeviceMap();
@@ -40,6 +44,7 @@ namespace fastllm {
         float temperature = 1.0;         // temperature, usually 0.1 ~ 1.0; larger values give more diverse results
         bool output_logits = false;      // whether to return logits
+        bool enable_hash_id = false;     // attach a hash id to the session
         std::multiset<int> stop_token_ids;

         bool IsSimpleGreedy() const {
@@ -149,7 +154,7 @@ namespace fastllm {
         uint8_t quantization(const float &realNumber) const {
             if (type == 0) {
-                return (uint8_t) (std::min((double) ((1 << bit) - 1), std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
+                return (uint8_t) (std::min((double) ((1 << bit) - 1), (double) std::max(realNumber / scale + zeroPoint + 0.5, 0.0)));
             } else {
                 return (uint8_t) (std::max(0.f, std::min(15.f, (realNumber - min) / scale + 0.5f)));
             }
@@ -245,7 +250,7 @@ namespace fastllm {
         std::string fileName;
         long long filePos;
-        std::shared_ptr<FileMmap> m_file;
+        std::shared_ptr<FileMmap> mapFile;
         bool directMemory = false;       // allocate/free memory directly, bypassing the cache
@@ -287,6 +292,8 @@ namespace fastllm {
         void PrintShape() const;         // print the shape
+        std::vector<int> Shape() const;
         void Print() const;              // print the contents
         void CalcWeightSum();            // compute WeightSum
@@ -297,8 +304,8 @@ namespace fastllm {
         void ToDevice(void *device);

-        void set_file(std::shared_ptr<FileMmap> file) { m_file = file; }
+        void SetMapFile(std::shared_ptr<FileMmap> file) { mapFile = file; }
     };
@@ -306,7 +313,8 @@ namespace fastllm {
     enum TokenizerType {
         BPE = 0,
         NORMAL = 1,
-        QWEN = 2
+        QWEN = 2,
+        GLM = 3
     };

     struct TrieNode {
@@ -357,6 +365,9 @@ namespace fastllm {
         std::unordered_map<int, std::string> tokenToStringDict;
         std::unordered_map<int, float> tokenToScoreDict;
         std::unordered_map<std::string, int> stringToTokenDict;
+#ifdef USE_SENTENCEPIECE
+        std::unique_ptr<sentencepiece::SentencePieceProcessor> spProcessor;
+#endif

         Tokenizer();
@@ -405,6 +416,8 @@ namespace fastllm {
         void AddWeight(const std::string &key, const std::vector<int> &dims,
                        DataType dataType, WeightType weightType, DataType oriDataType, uint8_t *oriData); // insert a weight
+        void ReleaseWeight();            // release the memory held by all weights
         void AddQLinearWeight(const std::string &key, const std::vector<int> &dims,
                               int bit, float *scales, uint8_t *oriData); // insert a QLinear layer weight; quantization rule: float value = scales * oriData
```
include/models/basellm.h

```diff
@@ -46,7 +46,9 @@ namespace fastllm {
     public:
         basellm() {};
-        ~basellm() {};
+        ~basellm() {
+            this->weight.ReleaseWeight();
+        };

         virtual void LoadFromFile(const std::string &fileName); // load from file
@@ -102,6 +104,11 @@ namespace fastllm {
                                    RuntimeResultBatch retCb = nullptr,
                                    const GenerationConfig &generationConfig = GenerationConfig()); // batched reply for the given inputs
+        virtual void ResponseBatch(std::vector<std::vector<float>> &inputTokens,
+                                   std::vector<std::string> &outputs,
+                                   RuntimeResultBatch retCb = nullptr,
+                                   const GenerationConfig &generationConfig = GenerationConfig()); // batched reply for the given inputs
         virtual int LaunchResponseTokens(const std::vector<int> &inputTokens,
                                          const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the allocated handleId
@@ -148,6 +155,7 @@ namespace fastllm {
         std::thread *mainLoop = nullptr;
         std::mutex mainLoopLocker, dictLocker;
+        std::mutex resultTokenQueueLocker;
         std::map<std::string, int> deviceMap;
```
include/models/chatglm.h

```diff
@@ -15,8 +15,10 @@ namespace fastllm {
     public:
         ChatGLMModel(); // constructor

+        virtual void InitParams(); // initialize parameter info

         // inference
         virtual int Forward(
                 const Data &inputIds,
                 const Data &attentionMask,
                 const Data &positionIds,
@@ -56,7 +58,7 @@ namespace fastllm {
                                    const std::vector<std::map<std::string, int>> &params,
                                    Data &inputIds, Data &attentionMask, Data &positionIds);

         virtual void WarmUp(); // warm up

         virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input
@@ -66,7 +68,17 @@ namespace fastllm {
         void UpdateSinCos(float rope);
     private:
         virtual void CausalMask(Data &data, int start) {}; // causal mask?

+        int mask_token_id;
+        int gmask_token_id;
+        int smask_token_id;
+        // int sop_token_id; //=bos_token_id
+        int eop_token_id;
+        int system_token_id;
+        int user_token_id;
+        int assistant_token_id;
+        int observation_token_id;

         float rope = 1.0f;
     };
```
include/models/glm.h (new file)

```cpp
//
// Created by huangyuyang on 5/11/23.
//

#ifndef FASTLLM_GLM_H
#define FASTLLM_GLM_H

#include "basellm.h"
#include "cmath"

#include <iostream>

namespace fastllm {
    class GLMModel : public basellm {
    public:
        GLMModel(); // constructor

        // inference
        virtual int Forward(
                const Data &inputIds,
                const Data &attentionMask,
                const Data &positionIds,
                std::vector<std::pair<Data, Data>> &pastKeyValues,
                const GenerationConfig &generationConfig = GenerationConfig(),
                const LastTokensManager &lastTokens = LastTokensManager(),
                std::vector<float> *logits = nullptr);

        std::vector<int> ForwardBatch(
                int batch,
                const Data &inputIds,
                const Data &attentionMask,
                const Data &positionIds,
                std::vector<std::pair<Data, Data>> &pastKeyValues,
                const GenerationConfig &generationConfig = GenerationConfig(),
                const LastTokensManager &lastTokens = LastTokensManager(),
                std::vector<std::vector<float>*> *retLogits = nullptr);

        // build the LLM inference inputs from the given tokens
        virtual void FillLLMInputs(std::vector<std::vector<float>> &inputTokens,
                                   const std::map<std::string, int> &params,
                                   Data &inputIds, Data &attentionMask, Data &positionIds);

        virtual void InitParams();

        virtual void WarmUp(); // warm up

        virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input

        virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply

    private:
        float scale_attn_1;

        static constexpr int eot_token_id = 50000;   //<|endoftext|>
        static constexpr int cls_token_id = 50002;   //[CLS]
        static constexpr int mask_token_id = 50003;  //[MASK]
        static constexpr int smask_token_id = 50008; //[sMASK]
        static constexpr int gmask_token_id = 50009; //[gMASK]
    };
}

#endif //FASTLLM_GLM_H
```
pyfastllm/README.md

````diff
@@ -12,6 +12,22 @@ pyfastllm is the Python API layer built on top of fastllm; with pyfastllm you can more…
 ## Version history

+### v0.2.0 2023-10-23
+- Adjusted and cleaned up the code structure
+- Added model conversion and quantization interfaces
+
+### v0.1.5 2023-10-13
+- Fixed the wheel build and install step
+- Merged files and fixed imports
+
+### v0.1.4 2023-09-12
+- Fixed bugs caused by backend interface changes
+- Added new ops and support for low-level op operations
+
 ### v0.1.3 2023-07-08
 - Added usage and API documentation
````
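The v0.2.0 entry above mentions a model conversion interface; `pyfastllm/examples/convert_model.py`, added later in this commit, shows the full flow. A compressed, hedged sketch of that flow (the HuggingFace checkpoint path and output file name are placeholders; the `fastllm.utils.convert` call is used exactly as in that example):

```python
# Hedged sketch of the model-conversion interface, following examples/convert_model.py.
from transformers import AutoTokenizer, AutoModel
import fastllm

hf_path = "/path/to/chatglm-6b"   # placeholder: a HuggingFace checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True)
model = AutoModel.from_pretrained(hf_path, trust_remote_code=True).float().eval()  # fp32 loading only

# Pack the HF weights and tokenizer into a single .flm file that fastllm.create_llm can load.
fastllm.utils.convert(model=model, tokenizer=tokenizer,
                      output_path="chatglm-6b-fp32.flm", verbose=True)
```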
````diff
@@ -31,26 +47,21 @@ pyfastllm is the Python API layer built on top of fastllm; with pyfastllm you can more…
 First fetch the pybind11 C++ dependency:
-```sh
+```shell
 git submodule init
 git submodule update # fetch the pybind11 dependency
 ```

 Manual C++ build:
-```sh
+```shell
 mkdir build-py
 cd build-py
 cmake .. -DUSE_CUDA=ON -DPY_API=ON
-make -j4
-python cli.py -p chatglm-6b-int8.bin -t 8 # matches the output of the C++ build
-```
-
-Python script build:
-```sh
-cd pyfastllm
-python build_libs --cuda
-python cli.py -p chatglm-6b-int8.bin -t 8
+make -j
+cp fastllm*.so pyfastllm/examples/ # or place them in a directory on $PYTHONPATH
+cd ../pyfastllm/examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm # matches the output of the C++ build
 ```

 ### Wheel package
@@ -59,27 +70,39 @@ python cli.py -p chatglm-6b-int8.bin -t 8
 First install pybind11:
-```bash
+```shell
 pip install pybind11
 ```

-```sh
-cd pyfastllm
-python setup.py build
-python setup.py install
-python cli.py -p chatglm-6b-int8.bin -t 8
-```
+- GPU
+```shell
+cd pyfastllm/
+python3 setup.py build
+python3 setup.py install
+cd examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm
+```
+
+- CPU
+```shell
+cd pyfastllm/
+export USE_CUDA=OFF
+python3 setup.py build
+python3 setup.py install
+cd examples/
+python3 cli_simple.py -p chatglm-6b-int8.flm -t 8
+```

 ## Usage
 ### Calling from Python

-The demo folder contains several common code examples:
-demo/cli.py: example that prints the answer via a callback
-demo/cli_thread.py: multi-threaded API call example (recommended)
-demo/cli_low_api.py: low-level API call example
-demo/convert_model.py: model conversion example
-demo/web_api.py, demo/web_api_client.py: fastapi web API usage
+The examples folder contains several common code examples:
+- `examples/cli_simple.py`: API call example (recommended)
+- `examples/cli_low_api.py`: low-level API call example
+- `examples/convert_model.py`: model conversion example
+- `examples/web_api.py`, `examples/web_api_client.py`: fastapi web API usage
+- `examples/test_ops.py`: usage samples and tests for some of the ops
````
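For orientation, the recommended `examples/cli_simple.py` listed above boils down to a launch/fetch streaming loop; the full file appears later in this commit. A minimal, hedged sketch of the same call path (the model path is a placeholder; the method names are taken from `cli_simple.py` itself):

```python
# Hedged sketch of the streaming call path used by examples/cli_simple.py.
import fastllm

model = fastllm.create_llm("chatglm-6b-int8.flm")        # placeholder .flm path
prompt = model.make_input("", 0, "你好")                  # empty history, round 0

ids = [int(v) for v in model.weight.tokenizer.encode(prompt).to_list()]
handle = model.launch_response(ids, fastllm.GenerationConfig())

raw = b""
while True:
    token = model.fetch_response(handle)                 # -1 marks the end of the stream
    if token == -1:
        break
    raw += model.weight.tokenizer.decode_byte([token])   # bytes, decoded incrementally

print(raw.decode(errors="ignore"))
```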
 ### Command-line tool
 ...

````diff
@@ -94,22 +117,22 @@ $ fastllm-convert -m chatglm6B -p hf_model_path -o output_flm_path
 ```sh
 mkdir build-py
 cd build-py && cmake .. -DPY_API=ON -DUSE_CUDA=ON && make -j && cd -
-cd pyfastllm/demo
+cd pyfastllm/examples
 python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
 ```
 Load testing can be done with locust. Partial results for chatglm fp16 on an A100 40G:

 | Concurrency | Mean latency (s) | TP95 (s) | TP99 (s) |
-|----------:|------|------|------|
+|----------:|-------|------|------|
 | 1  | 3.07  | 4.2  | 4.8  |
 | 10 | 6.11  | 11.0 | 12.0 |
 | 16 | 6.82  | 15.0 | 16.0 |
+| 32 | 10.74 | 16.0 | 20.0 |
````
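To exercise the web API started above, a request like the one in `examples/web_api_client.py` (shown in full later in this commit) can be sent to the streaming endpoint. A trimmed, hedged sketch; host, port, and sampling values are illustrative:

```python
# Hedged sketch of a streaming request to /api/chat_stream, following examples/web_api_client.py.
import requests

url = "http://127.0.0.1:8000/api/chat_stream"
payload = {"prompt": "你好", "history": "",
           "max_length": 1024, "top_p": 0.8, "top_k": 2,
           "temperature": 0.95, "repeat_penalty": 1.0}

seen = ""
with requests.post(url, json=payload, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=1024 * 1024):
        msg = chunk.decode(errors="replace")
        print(msg[len(seen):], end="", flush=True)  # the server streams the growing answer,
        seen = msg                                  # so only the new suffix is printed
print()
```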
 ## API programming interface
 ### fastllm data structures
 > fastllm.Tensor data types
 - fastllm.float32
 - fastllm.bfloat16
 - fastllm.int16
 - fastllm.int8
 ...
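These dtypes are what the `fastllm.Tensor` constructor takes, as used throughout the new `test_ops.py` files in this commit. A small hedged sketch mirroring those calls (shapes and values are arbitrary):

```python
# Hedged sketch: constructing fastllm.Tensor objects with the dtypes listed above,
# mirroring pyfastllm/examples/test_ops.py. Shapes and values are arbitrary.
import fastllm

x = fastllm.Tensor(fastllm.float32, [1, 2], [1, 2])              # dtype, shape, flat data
w = fastllm.Tensor(fastllm.float32, [3, 2], [3, 4, 5, 5, 6, 7])
b = fastllm.Tensor(fastllm.float32, [3], [0, 1, 1])

# The ops bound under fastllm.ops operate directly on these tensors.
y = fastllm.ops.linear(x, w, b)
print(y)
```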
````diff
@@ -175,20 +198,23 @@ python web_api.py -m 0 -p path_for_chatglm --max_batch_size 32
 Supported models:

-| Model | Class | Notes
-| -- | -- | --
-| ChatGLM-6B | fastllm.ChatGLMModel |
-| ChatGLM2-6B | fastllm.ChatGLMModel | version is tagged in the weights
-| Moss | fastllm.MossModel |
-| Alpaca | fastllm.llamaModel |
+| Model | Class | Notes |
+| ---- | ---- | ---- |
+| ChatGLM-6B | fastllm.ChatGLMModel | |
+| ChatGLM2-6B | fastllm.ChatGLMModel | version is tagged in the weights |
+| Moss | fastllm.MossModel | |
+| Alpaca | fastllm.LlamaModel | |
+| QWen | fastllm.QWenModel | |

 ## Roadmap (TODO)

 - [x] Change response_batch's output_str handling to return the answer as a return value
 - [x] Optimize the encode/decode path and merge the different return types
 - [ ] Interoperate with numpy and other matrix libraries
 - [ ] Deep and shallow copies for Tensor, plus basic operator overloading
 - [ ] Fix the pastKV copy bug in the low-level API
-- [ ] A model runtime-parameter class wrapping run-time options (model path, thread count, low-memory mode, penalty factor, temperature, ...)
 - [ ] Expose more low-level APIs; define model building blocks as modules and compose them into custom models
+- [x] A model runtime-parameter class wrapping run-time options (model path, thread count, low-memory mode, penalty factor, temperature, ...)
+- [ ] Add more ops
+- [ ] Add modules
````
pyfastllm/build_libs.py

```diff
@@ -3,6 +3,7 @@ import shutil
 import platform
 import sys
 import argparse
+import glob

 parser = argparse.ArgumentParser(description='build fastllm libs')
 parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
@@ -23,20 +24,24 @@ def build_libs():
         os.makedirs(cmake_build_dir)
     os.chdir(cmake_build_dir)

     # build it
+    cpu_num = min(os.cpu_count(), 4)
     args = parser.parse_args()
     if IS_WINDOWS:
-        os.system('cmake -G "Ninja" -DPY_API=ON .. && ninja pyfastllm')
+        os.system('cmake -G Ninja -DPY_API=ON .. && ninja pyfastllm')
     elif IS_LINUX:
         extra_opts = ' -DPY_API=ON '
         extra_opts += ' -DUSE_CUDA=ON ' if args.cuda else ' '
-        build_cmd = 'cmake ' + extra_opts + ' .. && make pyfastllm -j4'
+        build_cmd = f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}"
         print(build_cmd)
-        os.system('cmake ' + extra_opts + ' .. && make pyfastllm -j4')
+        os.system(f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}")
     else:
         extra_opts = '-DPY_API=ON'
-        os.system('cmake ' + extra_opts + '.. && make pyfastllm -j4')
+        os.system(f"cmake {extra_opts} .. && make pyfastllm -j{cpu_num}")
+
+    so_files = glob.glob("*.so", root_dir=cmake_build_dir)
+    for file in so_files:
+        shutil.copy(os.path.join(cmake_build_dir, file), os.path.join(root_dir, "pyfastllm/fastllm"))

 if __name__ == '__main__':
-    build_libs()
+    build_libs()
\ No newline at end of file
```
pyfastllm/demo/cli.py

```diff
@@ -3,7 +3,7 @@ import sys
 import platform
 import logging
 import argparse
-sys.path.append('./build-py')
+sys.path.append('../../build-py')
 import pyfastllm # or fastllm

 logging.info(f"python gcc version:{platform.python_compiler()}")
```
pyfastllm/demo/test_ops.py (new file)

```python
import fastllm
import numpy as np

def np_rms_norm(inputs, weights, eps):
    channel = inputs.shape[-1]
    sqrt_mean = np.sqrt(np.sum(inputs**2)/channel + eps)
    return inputs / sqrt_mean * weights

def np_layer_norm(inputs, gamma, beta, axis=-1):
    assert axis < len(inputs.shapes), "axis should less than inputs dims"
    channel = inputs.shape[axis]
    mean = np.mean(inputs, axis=axis)
    var = np.var(inputs, axis=axis)
    output = (inputs - mean) / var * gamma + beta
    return output

def np_linear(inputs, weights, bias):
    output = np.matmul(inputs, weights.T) + bias
    return output

def np_softmax(inputs, axis=None):
    maxv = inputs.max(axis, keepdims=True)
    exp_v = np.exp(inputs - maxv)
    exp_sum = np.sum(exp_v, axis=axis)
    return exp_v / exp_sum

def np_silu(inputs, ):
    return inputs / (1 + np.exp(-inputs))

def np_attention(q, k, v, mask=None, group=None, scale=None):
    qk = np_softmax(q @ k.T * scale, axis=-1)
    attn = qk @ v
    return attn

def test_linear():
    inputs = np.array([[1, 2]])
    weight = np.array([[3, 4, 5, 5, 6, 7]]).reshape([3, 2])
    bias = np.array([0, 1, 1])
    np_output = np_linear(inputs, weight, bias)
    print(np_output)

    input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 2])
    weights = fastllm.Tensor(fastllm.float32, [3, 2], [3, 4, 5, 5, 6, 7])
    bias = fastllm.Tensor(fastllm.float32, [3], [0, 1, 1])
    out = fastllm.ops.linear(input, weights, bias)
    print(out)

def test_rms_norm():
    inputs = np.array([1, 5]).reshape([1, 2])
    weights = np.array([1, 3]).reshape([1, 2])
    eps = 1e-6
    np_out = np_rms_norm(inputs, weights, eps)
    print(np_out)

    input = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
    weights = fastllm.Tensor(fastllm.float32, [1, 2], [1, 3])
    out = fastllm.Tensor()
    out = fastllm.ops.rms_norm(input, weights, eps=1e-6)
    print(out)

def test_silu():
    inputs = np.array([1, 5]).reshape([1, 2])
    output = np_softmax(inputs)
    # output = np_silu(inputs)
    print(output)

    inputs = fastllm.Tensor(fastllm.float32, [1, 2], [1, 5])
    out = fastllm.ops.activation(input=inputs, activate_type="softmax")
    # out = fastllm.ops.activation(input=inputs, activate_type="silu")
    print(out)

def test_attention():
    q = np.array([1, 2, 3, 4, 5, 6]).reshape([2, 3])
    k = np.array([5, 6, 7, 8, 9, 10]).reshape([2, 3])
    v = np.array([1, 1, 1, 2, 1, 3]).reshape([2, 3])
    scale = 1 / np.sqrt(q.shape[-1])
    output = np_attention(q, k, v, scale=scale)
    print(output)

    q = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 2, 3, 4, 5, 6])
    k = fastllm.Tensor(fastllm.float32, [1, 2, 3], [5, 6, 7, 8, 9, 10])
    v = fastllm.Tensor(fastllm.float32, [1, 2, 3], [1, 1, 1, 2, 1, 3])
    mask = fastllm.Tensor()
    output = fastllm.ops.attention(q, k, v, mask, group=1, scale=scale, attentionType=0)
    print(output)

test_attention()
test_silu()
test_linear()
test_rms_norm()
```
pyfastllm/demo/web_api.py

```diff
@@ -7,7 +7,8 @@ from copy import deepcopy
 import traceback
 from typing import List
 sys.path.append('../../build-py')
-import pyfastllm # or fastllm
+import pyfastllm
+import uuid
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 import threading, queue, uvicorn, json, time
@@ -106,8 +107,8 @@ def dynamic_batch_stream_func():
 def chat_stream(prompt: str, config: pyfastllm.GenerationConfig, uid: int = 0, time_out=200):
-    global g_model, g_msg_dict
-    time_stamp = round(time.time()*1000)
+    global g_msg_dict
+    time_stamp = str(uuid.uuid1())
     hash_id = str(pyfastllm.std_hash(f"{prompt} time_stamp: {time_stamp}"))
     thread = threading.Thread(target=batch_response_stream, args=(f"{prompt} time_stamp: {time_stamp}", config))
     thread.start()
```
pyfastllm/examples/cli_low_level.py (new file)

```python
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
import fastllm

logging.info(f"python gcc version:{platform.python_compiler()}")

def args_parser():
    parser = argparse.ArgumentParser(description='fastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='模型类型,默认为0, 可以设置为0(chatglm),1(moss),2(vicuna),3(baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='模型文件的路径')
    parser.add_argument('-t', '--threads', type=int, default=4, help='使用的线程数量')
    parser.add_argument('-l', '--low', action='store_true', help='使用低内存模式')
    args = parser.parse_args()
    return args

# Use this function with caution: it still has bugs and is only meant as a low-level API
# usage example. Do not use it in production.
def response(model, prompt_input: str, stream_output: bool = False):
    gmask_token_id = 130001
    bos_token_id = 130004
    eos_token_id = model.eos_token_id

    input_ids = model.weight.tokenizer.encode(prompt_input)
    if model.model_type == "chatglm":
        gmask_token_id = model.gmask_token_id
        bos_token_id = model.bos_token_id
    gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
    input_ids = fastllm.cat([gmask_bos, input_ids], 0)

    seq_len = input_ids.count(0)
    vmask = [0] * (seq_len * seq_len)
    vpids = [0] * (seq_len * 2)
    for i in range(seq_len - 1):
        vmask[i * seq_len + seq_len - 1] = 1
        vpids[i] = i
    vpids[seq_len - 1] = seq_len - 2
    vpids[seq_len * 2 - 1] = 1
    attention_mask = fastllm.Tensor(fastllm.float32, [seq_len, seq_len], vmask)
    position_ids = fastllm.Tensor(fastllm.float32, [2, seq_len], vpids)

    pastKeyValues = []
    for _ in range(model.block_cnt):
        pastKeyValues.append([fastllm.Tensor(fastllm.float32), fastllm.Tensor(fastllm.float32)])

    ret_str = ""
    ret_len = 1
    mask_ids = -1
    output_tokens = []
    penalty_factor = fastllm.Tensor()

    while len(output_tokens) < 2048:  # config.max_seq_len
        ret, pastKeyValues = model.forward(input_ids, attention_mask, position_ids, penalty_factor, pastKeyValues)
        if ret == eos_token_id:
            break

        output_tokens.append(ret)
        cur_str = model.weight.tokenizer.decode(fastllm.Tensor(fastllm.float32, [len(output_tokens)], output_tokens))
        ret_str += cur_str
        print(cur_str, end="")
        sys.stdout.flush()
        if stream_output:
            yield cur_str

        ret_len += 1
        output_tokens = []
        if mask_ids == -1:
            mask_ids = seq_len - 2

        input_ids = fastllm.Tensor(fastllm.float32, [1, 1], [ret])
        attention_mask = fastllm.Tensor()
        position_ids = fastllm.Tensor(fastllm.float32, [2, 1], [mask_ids, ret_len])

    print()
    return ret_str

def run_with_low_level(args):
    model_path = args.path
    llm_type = fastllm.get_llm_type(model_path)
    print(f"llm model: {llm_type}")
    model = fastllm.create_llm(model_path)

    prompt = ""
    while prompt != "stop":
        prompt = input("User: ")
        outputs = response(model, prompt_input=model.make_input("", 0, prompt))
        for output in outputs:
            print(output)
            sys.stdout.flush()

if __name__ == "__main__":
    args = args_parser()
    run_with_low_level(args)
```
pyfastllm/examples/cli_simple.py (new file)

```python
# -*- coding: utf-8 -*-
import sys, os
import platform
import logging
import argparse
import fastllm

logging.info(f"python gcc version:{platform.python_compiler()}")

def args_parser():
    parser = argparse.ArgumentParser(description='fastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='模型类型,默认为0, 可以设置为0(chatglm),1(moss),2(vicuna),3(baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='模型文件的路径')
    parser.add_argument('-t', '--threads', type=int, default=4, help='使用的线程数量')
    parser.add_argument('-l', '--low', action='store_true', help='使用低内存模式')
    args = parser.parse_args()
    return args

def response(model, prompt_input: str, stream_output: bool = False):
    input_ids = model.weight.tokenizer.encode(prompt_input)
    input_ids = input_ids.to_list()
    input_ids = [int(v) for v in input_ids]
    if model.model_type == "chatglm":
        input_ids = [model.gmask_token_id, model.bos_token_id] + input_ids
    # print(input_ids)

    handle = model.launch_response(input_ids, fastllm.GenerationConfig())
    continue_token = True

    ret_byte = b""
    ret_str = ""
    while continue_token:
        resp_token = model.fetch_response(handle)
        continue_token = (resp_token != -1)
        content = model.weight.tokenizer.decode_byte([resp_token])
        ret_byte += content
        ret_str = ret_byte.decode(errors='ignore')
        if stream_output:
            yield ret_str
    return ret_str

def run_with_response(args):
    model_path = args.path
    OLD_API = False
    if OLD_API:
        model = fastllm.ChatGLMModel()
        model.load_weights(model_path)
        model.warmup()
    else:
        fastllm.set_threads(args.threads)
        fastllm.set_low_memory(args.low)
        if not os.path.exists(model_path):
            print(f"模型文件{args.path}不存在!")
            exit(-1)
        model = fastllm.create_llm(model_path)

    print(f"llm model: {model.model_type}")
    print(f"欢迎使用 {model.model_type} 模型. 输入内容对话,reset清空历史记录,stop退出程序");

    input_text = ""
    history = ""
    dialog_round = 0
    while input_text != "stop":
        input_text = input("User: ")
        if 'stop' == input_text:
            break
        if 'reset' == input_text:
            history = ''
            continue
        prompt = model.make_input(history, dialog_round, input_text)
        outputs = response(model, prompt_input=prompt, stream_output=True)
        print(f"{model.model_type}:", end=' ')
        past_len = 0
        for output in outputs:
            print(output[past_len:], end='', flush=True)
            past_len = len(output)
        print()
        model.make_history(history, dialog_round, input_text, output)
        dialog_round += 1

def run_with_callback(args):
    model_path = args.path
    OLD_API = False
    LLM_TYPE = ""
    if OLD_API:
        model = fastllm.ChatGLMModel()
        model.load_weights(model_path)
        model.warmup()
    else:
        fastllm.set_threads(args.threads)
        fastllm.set_low_memory(args.low)
        if not os.path.exists(model_path):
            print(f"模型文件{args.path}不存在!")
            exit(-1)
        LLM_TYPE = fastllm.get_llm_type(model_path)
        model = fastllm.create_llm(model_path)

    def print_back(idx: int, content: bytearray):
        content = content.decode(encoding="utf-8", errors="replace")
        if idx >= 0:
            print(f"\r{LLM_TYPE}:{content}", end='', flush=True)
        elif idx == -1:
            print()
        sys.stdout.flush()

    print(f"欢迎使用 {LLM_TYPE} 模型. 输入内容对话,reset清空历史记录,stop退出程序");
    prompt = ""
    while prompt != "stop":
        prompt = input("User: ")
        config = fastllm.GenerationConfig()
        model.response(model.make_input("", 0, prompt), print_back, config)
        print()
        sys.stdout.flush()

if __name__ == "__main__":
    args = args_parser()
    # run_with_callback(args)
    run_with_response(args)
```
pyfastllm/examples/convert_model.py (new file)

```python
import sys
from transformers import AutoTokenizer, AutoModel
import fastllm

def export():
    model_path = '/public/Models/chatglm-6b'  # only fp32 model loading is supported
    export_path = "chatglm-6b-fp32.flm"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
    model = model.eval()
    fastllm.utils.convert(model=model, tokenizer=tokenizer, output_path=export_path, verbose=True)

def response(model, prompt_input: str, stream_output: bool = False):
    gmask_token_id = 130001
    bos_token_id = 130004

    input_ids = model.weight.tokenizer.encode(prompt_input)
    input_ids = input_ids.to_list()
    input_ids.extend([gmask_token_id, bos_token_id])
    input_ids = [int(v) for v in input_ids]

    handle = model.launch_response(input_ids)
    continue_token = True

    ret_byte = b""
    ret_str = ""
    while continue_token:
        resp_token = model.fetch_response(handle)
        continue_token = (resp_token != -1)
        content = model.weight.tokenizer.decode_byte([resp_token])
        ret_byte += content
        ret_str = ret_byte.decode(errors='ignore')
        if stream_output:
            yield ret_str
    return ret_str

def infer():
    model_path = "chatglm-6b-fp32.flm"
    model = fastllm.create_llm(model_path)
    prompt = "你好"
    outputs = response(model, prompt_input=prompt, stream_output=True)
    for output in outputs:
        print('\rLLM:' + output, end='', flush=True)
    print()

if __name__ == "__main__":
    # export()
    infer()
```
pyfastllm/examples/test_ops.py (new file)

This file is identical to pyfastllm/demo/test_ops.py above, except that it additionally imports pytest and runs the four tests under a main guard instead of at module level:

```python
import pytest
import numpy as np
import fastllm

# np_rms_norm, np_layer_norm, np_linear, np_softmax, np_silu, np_attention and the
# test_linear / test_rms_norm / test_silu / test_attention functions are the same
# as in pyfastllm/demo/test_ops.py above.

if __name__ == "__main__":
    test_attention()
    test_silu()
    test_linear()
    test_rms_norm()
```
pyfastllm/examples/web_api.py (new file)

```python
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse
from copy import deepcopy
import traceback
from typing import List
import fastllm
import uuid
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import threading, queue, uvicorn, json, time

logging.info(f"python gcc version:{platform.python_compiler()}")

def args_parser():
    parser = argparse.ArgumentParser(description='fastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='模型类型,默认为0, 可以设置为0(chatglm),1(moss),2(vicuna),3(baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='模型文件的路径')
    parser.add_argument('-t', '--threads', type=int, default=4, help='使用的线程数量')
    parser.add_argument('-l', '--low', action='store_true', help='使用低内存模式')
    parser.add_argument("--max_batch_size", type=int, default=32, help="动态batch的最大batch size")
    args = parser.parse_args()
    return args

g_model = None
g_msg_dict = dict()
g_prompt_queue = queue.Queue(maxsize=256)
g_max_batch_size = 32

def save_msg(idx: int, content: bytes):
    global g_msg_dict
    content = content.decode(encoding="utf-8", errors="ignore")
    hash_id_idx = content.rindex("hash_id:")
    hash_id = content[hash_id_idx+8:]
    content = content[:hash_id_idx].replace("<n>", "\n")
    if hash_id in g_msg_dict.keys():
        g_msg_dict[hash_id].put((idx, content))
    else:
        msg_queue = queue.Queue()
        msg_queue.put((idx, content))
        g_msg_dict[hash_id] = msg_queue

def save_msgs(idx: int, content_list: List[bytes]):
    global g_msg_dict
    for content in content_list:
        content = content.decode(encoding="utf-8", errors="ignore")
        hash_id_idx = content.rindex("hash_id:")
        hash_id = content[hash_id_idx+8:]
        content = content[:hash_id_idx].replace("<n>", "\n")
        if hash_id in g_msg_dict.keys():
            g_msg_dict[hash_id].put((idx, content))
        else:
            msg_queue = queue.Queue()
            msg_queue.put((idx, content))
            g_msg_dict[hash_id] = msg_queue

def response_stream(prompt: str, config: fastllm.GenerationConfig):
    global model
    model.response(prompt, save_msgs, config)

def batch_response_stream(prompt: str, config: fastllm.GenerationConfig):
    global g_config
    g_config = config
    g_prompt_queue.put(prompt)

g_running_lock = threading.Lock()
g_running = False
g_config: fastllm.GenerationConfig = None

def dynamic_batch_stream_func():
    global g_model, g_running_lock, g_running, g_prompt_queue, g_config, g_msg_dict
    print(f"call dynamic_batch_stream_func: running: {g_running}, prompt queue size: {g_prompt_queue.qsize()}")
    print(f"msg_dict size: {len(g_msg_dict)}")
    batch_size_this = min(g_max_batch_size, g_prompt_queue.qsize())
    if not g_running and batch_size_this > 0:
        g_running_lock.acquire()
        g_running = True
        g_running_lock.release()
        batch_this = []
        for _ in range(batch_size_this):
            batch_this.append(g_prompt_queue.get_nowait())
        print(f"batch this: {batch_size_this}, queue len: {g_prompt_queue.qsize()}")
        try:
            if batch_size_this > 0:
                g_model.batch_response(batch_this, save_msgs, g_config)
        except Exception as e:
            hash_id_list = [str(fastllm.std_hash(prompt)) for prompt in batch_this]
            rtn_list = [bytes(f"hash_id:{hash_id}", 'utf8') for hash_id in hash_id_list]
            save_msgs(-1, rtn_list)
            traceback.print_exc()
            print(e)
        g_running_lock.acquire()
        g_running = False
        g_running_lock.release()
        threading.Timer(0, dynamic_batch_stream_func).start()
    else:
        wait_time = float(g_max_batch_size - g_prompt_queue.qsize() - batch_size_this) / g_max_batch_size * 1
        threading.Timer(wait_time, dynamic_batch_stream_func).start()

def chat_stream(prompt: str, config: fastllm.GenerationConfig, uid: int = 0, time_out=200):
    global g_msg_dict
    time_stamp = str(uuid.uuid1())
    hash_id = str(fastllm.std_hash(f"{prompt} time_stamp: {time_stamp}"))
    thread = threading.Thread(target=batch_response_stream, args=(f"{prompt} time_stamp: {time_stamp}", config))
    thread.start()
    idx = 0
    start = time.time()
    pre_msg = ""
    while idx != -1:
        if hash_id in g_msg_dict.keys():
            msg_queue = g_msg_dict[hash_id]
            if msg_queue.empty():
                time.sleep(0.1)
                continue
            msg_obj = msg_queue.get(block=False)
            idx = msg_obj[0]
            if idx != -1:
                yield msg_obj[1]
            else:
                # end flag
                del g_msg_dict[hash_id]
                break
            pre_msg = msg_obj[1]
        else:
            if time.time() - start > time_out:
                yield pre_msg + f"\ntime_out: {time.time() - start} senconds"
                break
            time.sleep(0.1)
            continue

app = FastAPI()

@app.post("/api/chat_stream")
def api_chat_stream(request: dict):
    #print("request.json(): {}".format(json.loads(request.body(), errors='ignore')))
    data = request
    prompt = data.get("prompt")
    history = data.get("history", [])
    round_cnt = data.get("round_cnt")
    config = fastllm.GenerationConfig()
    if data.get("max_length") is not None:
        config.max_length = data.get("max_length")
    if data.get("top_k") is not None:
        config.top_k = data.get("top_k")
    if data.get("top_p") is not None:
        config.top_p = data.get("top_p")
    if data.get("temperature") is not None:
        config.temperature = data.get("temperature")
    if data.get("repeat_penalty") is not None:
        config.repeat_penalty = data.get("repeat_penalty")
    uid = None
    if data.get("uid") is not None:
        uid = data.get("uid")
    config.enable_hash_id = True
    print(f"prompt: {prompt}")
    round_idx = 0
    history_str = ""
    for (q, a) in history:
        history_str = g_model.make_history(history_str, round_idx, q, a)
        round_idx += 1
    prompt = g_model.make_input(history_str, round_idx, prompt)
    return StreamingResponse(chat_stream(prompt, config), media_type='text/event-stream')

@app.post("/api/batch_chat")
async def api_batch_chat(request: Request):
    data = await request.json()
    prompts = data.get("prompts")
    print(f"{prompts} type: {type(prompts)}")
    if prompts is None:
        return "prompts should be list[str]"
    history = data.get("history")
    if history is None:
        history = ""
    config = fastllm.GenerationConfig()
    if data.get("max_length") is not None:
        config.max_length = data.get("max_length")
    if data.get("top_k") is not None:
        config.top_k = data.get("top_k")
    if data.get("top_p") is not None:
        config.top_p = data.get("top_p")
    if data.get("temperature") is not None:
        config.temperature = data.get("temperature")
    if data.get("repeat_penalty") is not None:
        config.repeat_penalty = data.get("repeat_penalty")
    uid = None
    if data.get("uid") is not None:
        uid = data.get("uid")
    retV = ""
    batch_idx = 0
    for response in g_model.batch_response(prompts, None, config):
        retV += f"({batch_idx+1}/{len(prompts)})\nprompt: {prompts[batch_idx]}\nresponse: {response}\n"
        batch_idx += 1
    return retV

def main(args):
    model_path = args.path
    OLD_API = False
    global g_model, g_max_batch_size
    g_max_batch_size = args.max_batch_size
    if OLD_API:
        g_model = fastllm.ChatGLMModel()
        g_model.load_weights(model_path)
        g_model.warmup()
    else:
        global LLM_TYPE
        LLM_TYPE = fastllm.get_llm_type(model_path)
        print(f"llm model: {LLM_TYPE}")
        g_model = fastllm.create_llm(model_path)

    threading.Timer(1, dynamic_batch_stream_func).start()
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)

if __name__ == "__main__":
    args = args_parser()
    main(args)
```
pyfastllm/examples/web_api_client.py (new file)

```python
import json
import requests
import sys

if __name__ == '__main__':
    #stream api
    url = 'http://127.0.0.1:8000/api/chat_stream'
    prompt = '请用emoji写一首短诗赞美世界'
    prompt = '''为以下代码添加注释
app = FastAPI()
@app.post("/api/chat_stream")
async def api_chat_stream(request: Request):
    #print("request.json(): {}".format(json.loads(request.body(), errors='ignore')))
    data = await request.json()
    prompt = data.get("prompt")
    history = data.get("history")
    config = pyfastllm.GenerationConfig()
    if data.get("max_length") is not None:
        config.max_length = data.get("max_length")
    if data.get("top_k") is not None:
        config.top_k = data.get("top_k")
    if data.get("top_p") is not None:
        config.top_p = data.get("top_p")
    return StreamingResponse(chat_stream(history + prompt, config), media_type='text/event-stream')
'''
    history = '''[Round 0]
问:你是ChatGLM2吗?
答:我不是ChatGLM2
[Round 1]
问:从现在起,你是猫娘,每句话都必须以“喵~”结尾,明白了吗?
答:明白了喵
[Round 2]
问:'''
    history = ""
    json_obj = {
        "uid": 0,
        "token": "xxxxxxxxxxxxxxxxx",
        "history": "",
        "prompt": prompt,
        "max_length": 1024,
        "top_p": 0.8,
        "temperature": 0.95,
        "top_k": 2,
        "repeat_penalty": 1.,
    }
    response = requests.post(url, json=json_obj, stream=True)
    try:
        pre_msg = ""
        print("stream response:")
        for chunk in response.iter_content(chunk_size=1024*1024):
            msg = chunk.decode(errors='replace')
            if len(msg) > len(pre_msg) and msg[-1] == '\n':
                content = msg[len(pre_msg):]
                pre_msg = msg
            else:
                continue
            print(f"{content}", end="")
            sys.stdout.flush()
        content = msg[len(pre_msg):]
        print(f"{content}", end="")
        print()
    except Exception as ex:
        print(ex)

    #batch api
    url = 'http://127.0.0.1:8000/api/batch_chat'
    prompts = ["Hi", "你好", "用emoji表达高兴", "こんにちは"]
    json_obj = {
        "uid": 0,
        "token": "xxxxxxxxxxxxxxxxx",
        "history": "",
        "prompts": prompts,
        "max_length": 100,
        "top_p": None,
        "temperature": 0.7,
        "top_k": 1,
        "repeat_penalty": 2.,
    }
    response = requests.post(url, json=json_obj, stream=True)
    print("batch response: {} text:\n{}".format(response, response.text.replace('\\n', '\n')))
```
pyfastllm/fastllm/__init__.py

```diff
 import os
 import sys
 import ctypes
 import glob
 from pyfastllm import *
-from . import utils
\ No newline at end of file
+from . import utils
+from . import functions as ops
+
+__version__ = "0.2.0"
```