ModelZoo / Baichuan-13B_fastllm

Commit 704137ae, authored Jul 11, 2024 by chenzk
Merge http://developer.hpccube.com/codes/chenzk/baichuan-13b_fastllm
Parents: ecaf86cf, 268abe98
Showing 9 changed files, with 673 additions and 67 deletions:
- README.md (+13 / -4)
- api_server_demo/fastllm-openai.py (+225 / -0)
- api_server_demo/openai-client.py (+76 / -0)
- api_server_demo/requirements.txt (+5 / -0)
- benchmark/benchmark (+0 / -0)
- package/fastllm_pytools/hf_model.py (+9 / -0)
- package/fastllm_pytools/libfastllm_tools.so (+0 / -0)
- package/fastllm_pytools/llm.py (+336 / -63)
- package/fastllm_pytools/torch2flm.py (+9 / -0)
README.md

````diff
-# Baichuan-13B_fastllm
+# Baichuan-13B
 ## Paper
````

````diff
@@ -33,7 +33,7 @@ The overall Baichuan model is based on the standard Transformer architecture and, like LLaMA, uses
 A ready-made inference docker image can be pulled from the SourceFind (光源) registry as follows:
 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
 ```
 ### Starting the container
````

````diff
@@ -43,7 +43,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
 ```
 # <container_name>  custom container name
 # <project_path>    path to the current project
-docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
+docker run -it --name=<container_name> -v <project_path>:/work -w /work --privileged -v /opt/hyhal:/opt/hyhal --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --ipc=host --network host --shm-size=16G --group-add video image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38 /bin/bash
 ```
 ### Loading the environment
````

````diff
@@ -51,7 +51,7 @@ docker run -it --name=<container_name> -v <project_path>:/work -w /work --device
 After entering the container, run the following command to load the runtime environment variables:
 ```
-source /opt/dtk-23.04/cuda/env.sh
+source /opt/dtk/cuda/env.sh
 ```
 ### Installation
````

````diff
@@ -99,6 +99,15 @@ python cli_demo.py -p baichuan-13b-fp16.bin
 # Simple web UI: install streamlit-chat first, and map streamlit's port to the external network when starting the container
 streamlit run web_demo.py baichuan-13b-fp16.bin
+# Example api_server implemented following the OpenAI interface:
+# First enter api_server_demo and install the required dependencies:
+cd api_server_demo
+pip install -r requirements.txt
+# Run the api_server service; use -p to specify the converted model file (see openai-client.py for example client code):
+python fastllm-openai.py -p ../baichuan-13b-fp16.bin
+# To test the concurrency performance of the service, use openai-client.py: modify its prompt and concurrencys variables, then run:
+python openai-client.py
 ```
 ### Inference performance test
````
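For a quick smoke test of the api_server described above, a minimal non-streaming client call could look like the sketch below. It assumes the defaults from fastllm-openai.py (server on 127.0.0.1:8100, model name "baichuan2-fastllm") and the openai==0.28 client pinned in api_server_demo/requirements.txt; openai-client.py in this commit is the full concurrent benchmark version.

```python
# Minimal client sketch (assumes openai==0.28 and the demo server defaults from fastllm-openai.py).
import openai

openai.api_base = "http://127.0.0.1:8100/v1"
openai.api_key = "none"  # the demo server does not validate the key

resp = openai.ChatCompletion.create(
    model="baichuan2-fastllm",                         # must match model_list in fastllm-openai.py
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.1,
    stream=False,                                      # single JSON response instead of SSE chunks
)
print(resp.choices[0].message.content)
```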
api_server_demo/fastllm-openai.py (new file)

```python
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8100/docs for documents.

import time
import json
import torch
import uvicorn
import argparse
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
# from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
from fastllm_pytools import llm


@asynccontextmanager
async def lifespan(app: FastAPI):
    # collects GPU memory
    yield
    global device_map
    if torch.cuda.is_available():
        for device in device_map:
            with torch.cuda.device(device):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class Usage(BaseModel):
    prompt_tokens: int = None
    total_tokens: int = None
    completion_tokens: int = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    id: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    usage: Usage = None


@app.get("/v1/models", response_model=ModelList)
def list_models():
    global model_list
    # Return one ModelCard per served model name.
    return ModelList(data=[ModelCard(id=model) for model in model_list])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
def create_chat_completion(request: ChatCompletionRequest):
    if request.model not in model_list:
        raise HTTPException(status_code=400, detail="Invalid Model Name")
    global model
    id = "chatcmpl-A"
    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    # Fall back to default sampling parameters when the request omits them.
    if request.max_length is not None:
        max_length = request.max_length
    else:
        max_length = 1024
    if request.temperature is not None:
        temperature = request.temperature
    else:
        temperature = 0.1
    if request.top_p is not None:
        top_p = request.top_p
    else:
        top_p = 0.8

    prev_messages = request.messages[:-1]
    # print(prev_messages)
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    # Pair the earlier user/assistant messages into [question, answer] turns.
    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(id=id, query=query, history=history, max_length=max_length,
                           top_p=top_p, temperature=temperature, model_id=request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response = model.response(query=query, history=history, max_length=max_length,
                              top_p=top_p, temperature=temperature)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )
    prompt_tokens = len(model.tokenizer_encode_string(query))
    completion_tokens = len(model.tokenizer_encode_string(response))
    usage = Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )
    return ChatCompletionResponse(id=id, model=request.model, choices=[choice_data],
                                  object="chat.completion", usage=usage)


def predict(id: str, query: str, history: List[List[str]], model_id: str,
            max_length: int, top_p: float, temperature: float):
    # Server-sent-events generator for streaming responses.
    global model
    creat_time = int(time.time())
    choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage(role="assistant"),
                                                     finish_reason=None)
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    # pydantic no longer accepts dumps_kwargs since 1.8.0; see https://github.com/THUDM/ChatGLM2-6B/issues/308
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    for new_response in model.stream_response(query=query, history=history, max_length=max_length,
                                              top_p=top_p, temperature=temperature):
        choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage(content=new_response),
                                                         finish_reason=None)
        chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                       choices=[choice_data], object="chat.completion.chunk")
        # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
        yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage(), finish_reason="stop")
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
    yield '[DONE]'


def args_parser():
    parser = argparse.ArgumentParser(description='baichuan2_chat_demo')
    parser.add_argument('-p', '--path', type=str, default="/model",
                        help='path to the model file')
    parser.add_argument('-g', '--gpus', type=str, default="0",
                        help='GPU card(s) to run on, e.g. "0,1"')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = args_parser()
    global model_list
    model_list = ["baichuan2-fastllm"]
    global device_map
    device_map = ["cuda:" + num for num in args.gpus.split(',')]
    llm.set_device_map(device_map)
    model = llm.model(args.path)
    uvicorn.run(app, host='127.0.0.1', port=8100)
```
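A note on the handler above: create_chat_completion folds a leading system message into the query and pairs the remaining earlier messages into [question, answer] turns, which is the history format passed to fastllm's response/stream_response. A standalone sketch of that pairing step, using plain dicts instead of the pydantic ChatMessage model for illustration:

```python
from typing import Dict, List


def build_history(prev_messages: List[Dict[str, str]]) -> List[List[str]]:
    # Mirrors the pairing loop in create_chat_completion: consecutive
    # user/assistant messages become one [question, answer] turn.
    history: List[List[str]] = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i]["role"] == "user" and prev_messages[i + 1]["role"] == "assistant":
                history.append([prev_messages[i]["content"], prev_messages[i + 1]["content"]])
    return history


# Two earlier turns become two [question, answer] pairs.
msgs = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello"},
    {"role": "user", "content": "how are you"},
    {"role": "assistant", "content": "fine, thanks"},
]
assert build_history(msgs) == [["hi", "hello"], ["how are you", "fine, thanks"]]
```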
api_server_demo/openai-client.py (new file)

```python
import openai
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed


def jls_extract_def(model, messages, temperature, max_length, stream, index):
    # Send one streaming chat request and count the returned chunks as output tokens.
    openai.api_base = "http://127.0.0.1:8100/v1"
    openai.api_key = "none"
    output_tokens = 0
    ret = ""
    t0 = time.time()
    result = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_length=max_length,
        stream=stream
    )
    for chunk in result:
        # print(chunk)
        output_tokens += 1
        if hasattr(chunk.choices[0].delta, "content"):
            if (index == 0):
                # Only the first request prints its stream, to keep the output readable.
                print(chunk.choices[0].delta.content, end="", flush=True)
            ret += chunk.choices[0].delta.content
    t1 = time.time()
    # print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens / (t1 - t0), output_tokens))
    result = output_tokens, ret, output_tokens / (t1 - t0)
    return result


if __name__ == "__main__":
    prompt = "满江红全文"
    concurrencys = [1]
    temperature = 0.1
    max_length = 4096
    stream = True
    prompts = [prompt]
    model = "baichuan2-fastllm"
    messages = [{"role": "user", "content": "你好"}]

    pool = ThreadPoolExecutor(max_workers=32)
    for i in range(len(concurrencys)):
        # Fire concurrencys[i] identical requests in parallel and aggregate the token counts.
        cur_prompts = prompts * concurrencys[i]
        token_count = 0
        threads = []
        t0 = time.time()
        for index, prompt in enumerate(cur_prompts):
            messages[0]["content"] = prompt
            t = pool.submit(jls_extract_def, model, messages, temperature, max_length, stream, index)
            t.index = index
            threads.append(t)
        for future in as_completed(threads):
            result = future.result()
            print(future.index)
            print(result)
            print("\n")
            token_count += result[0]
        t1 = time.time()
        print("\n---------------------------------------------\n")
        print("\nconcurrency: {}".format(concurrencys[i]))
        print("\ntotal use: {:.2f}".format(t1 - t0))
        print("\ntoken/s: {:.2f}, token_count: {}".format(token_count / (t1 - t0), token_count))
        print("\n---------------------------------------------\n")
```
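As the README notes, the benchmark is driven by the prompt and concurrencys variables at the top of the script; the outer loop runs one pass per concurrency level and reports aggregate token/s for each. A hypothetical sweep over several levels only changes those assignments, for example:

```python
# Hypothetical benchmark settings for openai-client.py: one outer-loop pass per level.
prompt = "满江红全文"
concurrencys = [1, 2, 4, 8]   # request counts to benchmark in turn
max_length = 4096
```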
api_server_demo/requirements.txt (new file)

```
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
```
benchmark/benchmark (binary file, no preview)
package/fastllm_pytools/hf_model.py

```diff
@@ -26,6 +26,9 @@ def create(model,
         exit(0);
 
     # 0.1 model info
+    # if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #     model.config.model_type = "chatglm3"
+    #     print("model.config.model_type: chatglm3!")
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
```

```diff
@@ -48,6 +51,12 @@ def create(model,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     weight_type_dict = {};
```
package/fastllm_pytools/libfastllm_tools.so (binary file, no preview)
package/fastllm_pytools/llm.py (+336 / -63): this diff is collapsed.
package/fastllm_pytools/torch2flm.py

```diff
@@ -80,6 +80,8 @@ def tofile(exportPath,
     fo.write(struct.pack('i', 2))
 
     # 0.1 model info
+    #if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #    model.config.model_type = "chatglm3"
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
```

```diff
@@ -112,6 +114,13 @@ def tofile(exportPath,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        print("chatglm3")
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     modelInfo["tokenizer_use_score"] = "1"  # tokenization with scores
```
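The blocks added to hf_model.py and torch2flm.py above register chatglm3's special tokens in the exported model info through <FLM_FIX_TOKEN_...> placeholders, which fastllm later resolves to the literal token ids when it builds prompts. Below is a purely illustrative sketch of what those fields end up containing; the token ids are made-up stand-ins for tokenizer.get_command("<|user|>") and tokenizer.get_command("<|assistant|>"), and the final line assumes fastllm concatenates pre_prompt, user_role, the query, and bot_role in that order.

```python
# Illustration only: hypothetical special-token ids standing in for the real
# values returned by tokenizer.get_command(...).
user_id, assistant_id = 64795, 64796

modelInfo = {
    "pre_prompt": "",
    "user_role": "<FLM_FIX_TOKEN_" + str(user_id) + ">\n",
    "bot_role": "<FLM_FIX_TOKEN_" + str(assistant_id) + ">",
    "history_sep": "",
}

# Assumed single-turn prompt assembly (pre_prompt + user_role + query + bot_role):
prompt = modelInfo["pre_prompt"] + modelInfo["user_role"] + "你好" + modelInfo["bot_role"]
print(repr(prompt))  # '<FLM_FIX_TOKEN_64795>\n你好<FLM_FIX_TOKEN_64796>'
```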