ModelZoo / Baichuan-13B_fastllm / Commits

Commit 52251123, authored Jan 16, 2024 by zhouxiang
Update DTK to the latest version, add support for the K100 card, and add an API server demo for reference.
Parent: 37f12b90
Showing 9 changed files with 384 additions and 34 deletions:

- README.md (+4, -4)
- api_server_demo/fastllm-openai.py (+225, -0)
- api_server_demo/openai-client.py (+76, -0)
- api_server_demo/requirements.txt (+5, -0)
- benchmark/benchmark (+0, -0)
- package/fastllm_pytools/hf_model.py (+9, -2)
- package/fastllm_pytools/libfastllm_tools.so (+0, -0)
- package/fastllm_pytools/llm.py (+56, -26)
- package/fastllm_pytools/torch2flm.py (+9, -2)
README.md

-# Baichuan-13B_fastllm
+# Baichuan-13B
 ## Paper
...
@@ -33,7 +33,7 @@ The overall Baichuan model is based on the standard Transformer architecture and, like LLaMA, uses
 The inference Docker image can be pulled from the SourceFind (光源) registry as follows:
 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
 ```
 ### Starting the container
...
@@ -43,7 +43,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
 ```
 # <container_name>: a container name of your choice
 # <project_path>: the path to this project
-docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
+docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38 /bin/bash
 ```
 ### Loading the environment
...
@@ -51,7 +51,7 @@ docker run -it --name=<container_name> -v <project_path>:/work -w /work --device
 After entering the container, run the following command to load the runtime environment variables:
 ```
-source /opt/dtk-23.04/cuda/env.sh
+source /opt/dtk-23.10/cuda/env.sh
 ```
 ### Installation
...
api_server_demo/fastllm-openai.py (new file, mode 100644)
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8100/docs for documents.

import time
import json
import torch
import uvicorn
import argparse
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
# from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
from fastllm_pytools import llm


@asynccontextmanager
async def lifespan(app: FastAPI):
    # collects GPU memory
    yield
    global device_map
    if torch.cuda.is_available():
        for device in device_map:
            with torch.cuda.device(device):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class Usage(BaseModel):
    prompt_tokens: int = None
    total_tokens: int = None
    completion_tokens: int = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    id: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    usage: Usage = None


@app.get("/v1/models", response_model=ModelList)
def list_models():
    global model_list
    for model in model_list:
        ModelCard(id=model)
        ModelList.data.append(ModelCard)
    return ModelList()


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
def create_chat_completion(request: ChatCompletionRequest):
    if request.model not in model_list:
        raise HTTPException(status_code=400, detail="Invalid Model Name")
    global model
    id = "chatcmpl-A"
    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    if request.max_length is not None:
        max_length = request.max_length
    else:
        max_length = 1024
    if request.temperature is not None:
        temperature = request.temperature
    else:
        temperature = 0.1
    if request.top_p is not None:
        top_p = request.top_p
    else:
        top_p = 0.8

    prev_messages = request.messages[:-1]
    # print(prev_messages)
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(id=id, query=query, history=history, max_length=max_length,
                           top_p=top_p, temperature=temperature, model_id=request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response = model.response(query=query, history=history, max_length=max_length,
                              top_p=top_p, temperature=temperature)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )
    prompt_tokens = len(model.tokenizer_encode_string(query))
    completion_tokens = len(model.tokenizer_encode_string(response))
    usage = Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )

    return ChatCompletionResponse(id=id, model=request.model, choices=[choice_data],
                                  object="chat.completion", usage=usage)


def predict(id: str, query: str, history: List[List[str]], model_id: str,
            max_length: int, top_p: float, temperature: float):
    global model
    creat_time = int(time.time())
    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))  # pydantic no longer supports dumps_kwargs since 1.8.0, see https://github.com/THUDM/ChatGLM2-6B/issues/308
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    for new_response in model.stream_response(query=query, history=history, max_length=max_length,
                                              top_p=top_p, temperature=temperature):
        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_response),
            finish_reason=None
        )
        chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                       choices=[choice_data], object="chat.completion.chunk")
        # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
        yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id,
                                   choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
    yield '[DONE]'


def args_parser():
    parser = argparse.ArgumentParser(description='baichuan2_chat_demo')
    parser.add_argument('-p', '--path', type=str, default="/model", help='path to the model file')
    parser.add_argument('-g', '--gpus', type=str, default="0", help='which GPU card(s) to run on, e.g. "0,1"')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = args_parser()
    global model_list
    model_list = ["baichuan2-fastllm"]
    global device_map
    device_map = ["cuda:" + num for num in args.gpus.split(',')]
    llm.set_device_map(device_map)
    model = llm.model(args.path)
    uvicorn.run(app, host='127.0.0.1', port=8100)
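With the server above running (by default uvicorn listens on 127.0.0.1:8100 and registers the single model name "baichuan2-fastllm"), the /v1/chat/completions endpoint can also be exercised without the openai package. The snippet below is a minimal, non-streaming sketch using requests; the prompt and the max_length value are arbitrary example inputs, not values taken from the repository:

```python
# Minimal sketch of a non-streaming call to the demo server defined above.
# Assumes fastllm-openai.py is already running with its defaults
# (host 127.0.0.1, port 8100, model name "baichuan2-fastllm").
import requests

payload = {
    "model": "baichuan2-fastllm",
    "messages": [{"role": "user", "content": "你好"}],
    "temperature": 0.1,
    "max_length": 512,   # forwarded to fastllm's max_length
    "stream": False,
}

resp = requests.post("http://127.0.0.1:8100/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # prompt_tokens / completion_tokens / total_tokens
```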
api_server_demo/openai-client.py (new file, mode 100644)
import openai
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed


def jls_extract_def(model, messages, temperature, max_length, stream, index):
    openai.api_base = "http://127.0.0.1:8100/v1"
    openai.api_key = "none"
    output_tokens = 0
    ret = ""
    t0 = time.time()
    result = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_length=max_length,
        stream=stream
    )
    for chunk in result:
        # print(chunk)
        output_tokens += 1
        if hasattr(chunk.choices[0].delta, "content"):
            if (index == 0):
                print(chunk.choices[0].delta.content, end="", flush=True)
            ret += chunk.choices[0].delta.content
    t1 = time.time()
    # print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens/(t1-t0), output_tokens))
    result = output_tokens, ret, output_tokens / (t1 - t0)
    return result


if __name__ == "__main__":
    prompt = "满江红全文"
    concurrencys = [1]
    temperature = 0.1
    max_length = 4096
    stream = True
    prompts = [prompt]
    model = "baichuan2-fastllm"
    messages = [{"role": "user", "content": "你好"}]

    pool = ThreadPoolExecutor(max_workers=32)
    for i in range(len(concurrencys)):
        cur_prompts = prompts * concurrencys[i]
        token_count = 0
        threads = []
        t0 = time.time()
        for index, prompt in enumerate(cur_prompts):
            messages[0]["content"] = prompt
            t = pool.submit(jls_extract_def, model, messages, temperature, max_length, stream, index)
            t.index = index
            threads.append(t)
        for future in as_completed(threads):
            result = future.result()
            print(future.index)
            print(result)
            print("\n")
            token_count += result[0]
        t1 = time.time()
        print("\n---------------------------------------------\n")
        print("\nconcurrency: {}".format(concurrencys[i]))
        print("\ntotal use: {:.2f}".format(t1 - t0))
        print("\ntoken/s: {:.2f}, token_count: {}".format(token_count / (t1 - t0), token_count))
        print("\n---------------------------------------------\n")
api_server_demo/requirements.txt (new file, mode 100644)
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
benchmark/benchmark (binary file; no preview)
package/fastllm_pytools/hf_model.py

...
@@ -26,8 +26,9 @@ def create(model,
         exit(0);
     # 0.1 model info
-    if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
-        model.config.model_type = "chatglm3"
+    # if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #     model.config.model_type = "chatglm3"
+    #     print("model.config.model_type: chatglm3!")
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
...
@@ -50,6 +51,12 @@ def create(model,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     weight_type_dict = {};
...
package/fastllm_pytools/libfastllm_tools.so (binary file; no preview)
package/fastllm_pytools/llm.py

...
@@ -4,10 +4,13 @@ import os;
 import threading
 from typing import Optional, Tuple, Union, List, Callable, Dict, Any;
 from copy import deepcopy
+import json
 import platform
 if platform.system() == 'Windows':
-    fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"))
+    fastllm_lib = ctypes.CDLL(os.path.join(os.path.split(os.path.realpath(__file__))[0], "fastllm_tools.dll"), winmode=0)
 elif platform.system() == 'Darwin':
     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.dylib"))
 else:
     fastllm_lib = ctypes.cdll.LoadLibrary(os.path.join(os.path.split(os.path.realpath(__file__))[0], "libfastllm_tools.so"))
...
@@ -22,7 +25,8 @@ fastllm_lib.token_encode_string.restype = ctypes.c_int
-fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+fastllm_lib.launch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                  ctypes.c_float, ctypes.c_float, ctypes.c_bool, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
 fastllm_lib.launch_response_llm_model.restype = ctypes.c_int
 fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
...
@@ -39,7 +43,8 @@ fastllm_lib.response_str_llm_model.restype = ctypes.POINTER(ctypes.c_char)
-fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_bool]
+fastllm_lib.launch_response_str_llm_model.argtype = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
+                                                     ctypes.c_float, ctypes.c_float, ctypes.c_bool, ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
 fastllm_lib.launch_response_str_llm_model.restype = ctypes.c_int
 fastllm_lib.fetch_response_str_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
...
@@ -59,7 +64,6 @@ fastllm_lib.add_tokenizer_word_llm_model.argtype = [ctypes.c_int, ctypes.c_char_
 fastllm_lib.set_device_map.argtype = [ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
 fastllm_lib.get_llm_model_type.argtype = [ctypes.c_int]
-# fastllm_lib.get_llm_model_type.restype = ctypes.c_char_p
 fastllm_lib.get_llm_model_type.restype = ctypes.POINTER(ctypes.c_char)
 fastllm_lib.response_batch_str_llm_model.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int,
...
@@ -233,19 +237,29 @@ class model:
                 break
         return buffer_bytes[:result_len]

+    def stop_token_ctypes(self, stop_token_ids):
+        if stop_token_ids is None:
+            return 0, None
+        else:
+            return ctypes.c_int(len(stop_token_ids)), (ctypes.c_int * len(stop_token_ids))(*stop_token_ids)
+
     def response_logits(self,
                         query: str,
                         history: List[Tuple[str, str]] = None,
-                        tokenizer = None) -> str:
+                        tokenizer = None,
+                        stop_token_ids: List[int] = None) -> str:
         prompt = query if self.direct_query else self.get_prompt(query, history);
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
         if (tokenizer == None):
-            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1), ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True));
+            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(1), ctypes.c_bool(False), ctypes.c_float(1), ctypes.c_int(1), ctypes.c_float(1), ctypes.c_float(1), ctypes.c_bool(True), stop_token_len, stop_token_list);
         else:
             input = tokenizer.encode(prompt);
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), 1, False, 1, 1, 1, 1, True);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), 1, False, 1, 1, 1, 1, True, stop_token_len, stop_token_list);
         vocab_size = fastllm_lib.get_tokenizer_vocab_size(self.model);
         logits = list(range(vocab_size))
         array = (ctypes.c_float * (vocab_size * 4))(*logits);
...
@@ -258,7 +272,8 @@ class model:
     def response(self,
                  query: str,
                  history: List[Tuple[str, str]] = None,
-                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01) -> str:
+                 max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                 stop_token_ids: List[int] = None) -> str:
         ret = "";
         for i in self.stream_response(query = query,
                                       history = history,
...
@@ -267,7 +282,8 @@ class model:
                                       top_p = top_p, top_k = top_k,
                                       temperature = temperature,
                                       repeat_penalty = repeat_penalty,
-                                      one_by_one = True):
+                                      one_by_one = True, stop_token_ids = stop_token_ids):
             ret += i;
         return ret;
...
@@ -275,11 +291,13 @@ class model:
                         query: str,
                         history: List[Tuple[str, str]] = None,
                         max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
-                        one_by_one = True):
+                        one_by_one = True, stop_token_ids: List[int] = None):
         prompt = query if self.direct_query else self.get_prompt(query, history);
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
-        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False));
+        handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list);
         res = "";
         ret = b'';
         fail_cnt = 0;
...
@@ -310,12 +328,15 @@ class model:
     def stream_response_raw(self,
                             input_tokens: List[int],
                             max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
-                            one_by_one = True):
+                            one_by_one = True, stop_token_ids: List[int] = None):
+        stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
-        handle = fastllm_lib.launch_response_llm_model(self.model, len(input_tokens), (ctypes.c_int * len(input_tokens))(*input_tokens), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
+        handle = fastllm_lib.launch_response_llm_model(self.model, len(input_tokens), (ctypes.c_int * len(input_tokens))(*input_tokens), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list)
         # A long-tail char may need several tokens before it can be generated, so only bytes are returned and the string.decode strategy is left to the caller,
         # which makes it easy to count output tokens and to control decoding when the UTF-8 bytes are incomplete.
...
@@ -335,15 +356,16 @@ class model:
             yield total_bytes

     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192,
-             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs):
+             do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, stop_token_ids: List[int] = None, **kwargs):
         if self.model_type != "chatglm3":
             if (not(history)): history = [];
             prompt = query if self.direct_query else self.get_prompt(query, history);
             input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
             result = [];
             while True:
...
@@ -359,11 +381,11 @@ class model:
                 history = []
             role = "user"
             input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
             history.append({"role": role, "content": query})
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
             tokens = [];
             while True:
                 cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
...
@@ -377,15 +399,16 @@ class model:
     def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values = None,
                     max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
-                    return_past_key_values = False, **kwargs) -> str:
+                    return_past_key_values = False, stop_token_ids: List[int] = None, **kwargs) -> str:
         if self.model_type != "chatglm3":
             if (not(history)): history = [];
             prompt = query if self.direct_query else self.get_prompt(query, history);
             input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
             tokens = [];
             while True:
                 cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
...
@@ -404,10 +427,11 @@ class model:
             role = "user"
             input = self.build_chatglm3_input(tokenizer, query, history=history, role=role)
             history.append({"role": role, "content": query})
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
             tokens = [];
             while True:
                 cur = fastllm_lib.fetch_response_llm_model(self.model, handle);
...
@@ -517,16 +541,18 @@ class model:
     def response_batch(self, querys: List[str], historys: List[List[Tuple[str, str]]] = None, max_length: int = 1024,
-                       do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs) -> List[str]:
+                       do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01,
+                       stop_token_ids: List[int] = None, **kwargs) -> List[str]:
         query_size = len(querys)
         if (not(historys)):
             historys = [[] for _ in range(query_size)]
         handles = []
         for i, query in enumerate(querys):
             prompt = query if self.direct_query else self.get_prompt(query, historys[i])
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
-            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False))
+            handle = fastllm_lib.launch_response_str_llm_model(self.model, prompt.encode(), ctypes.c_int(max_length), ctypes.c_bool(do_sample), ctypes.c_float(top_p), ctypes.c_int(top_k), ctypes.c_float(temperature), ctypes.c_float(repeat_penalty), ctypes.c_bool(False), stop_token_len, stop_token_list)
             handles.append(handle)
         responses = []
...
@@ -560,7 +586,7 @@ class model:
     def chat_batch(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]] = None, max_length: int = 1024,
-                   do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, **kwargs):
+                   do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01, stop_token_ids: List[int] = None, **kwargs):
         query_size = len(querys)
         if (not(historys)):
             historys = [[] for _ in range(query_size)]
...
@@ -569,9 +595,10 @@ class model:
         for i, query in enumerate(querys):
             prompt = query if self.direct_query else self.get_prompt(query, historys[i])
             input = tokenizer.encode(prompt);
+            stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids);
-            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False);
+            handle = fastllm_lib.launch_response_llm_model(self.model, len(input), (ctypes.c_int * len(input))(*input), max_length, do_sample, top_p, top_k, temperature, repeat_penalty, False, stop_token_len, stop_token_list);
             handles.append(handle)
         responses = []
...
@@ -588,3 +615,6 @@ class model:
         return responses, historys

+    def release_memory(self):
+        fastllm_lib.release_memory(self.model)
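The recurring change in llm.py is a new optional stop_token_ids parameter that is threaded from every generation entry point (response, stream_response, stream_response_raw, chat, stream_chat, response_batch, chat_batch) down to launch_response_llm_model / launch_response_str_llm_model through the new stop_token_ctypes helper, plus a new release_memory method. As a hedged sketch of how a caller might use it, the model path and the stop-token id below are placeholders, and the early-stop semantics are inferred from the parameter name:

```python
# Hypothetical caller-side usage of the stop_token_ids parameter added in this commit.
# "/model/baichuan-13b.flm" and the id list are illustrative placeholders only.
from fastllm_pytools import llm

model = llm.model("/model/baichuan-13b.flm")
stop_ids = [2]  # e.g. an end-of-sequence token id for the target model

# One-shot response with an explicit stop-token list.
print(model.response("你好", history=[], max_length=256, stop_token_ids=stop_ids))

# Streaming response with the same stop-token list.
for piece in model.stream_response("介绍一下自己", history=[], stop_token_ids=stop_ids):
    print(piece, end="", flush=True)

# New in this commit: explicitly release the model's memory when done.
model.release_memory()
```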
package/fastllm_pytools/torch2flm.py

...
@@ -80,8 +80,8 @@ def tofile(exportPath,
     fo.write(struct.pack('i', 2))
     # 0.1 model info
-    if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
-        model.config.model_type = "chatglm3"
+    # if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
+    #     model.config.model_type = "chatglm3"
     modelInfo = model.config.__dict__
     if model.generation_config is not None:
         modelInfo.update(model.generation_config.__dict__)
...
@@ -114,6 +114,13 @@ def tofile(exportPath,
     if modelInfo["chat_format"] == "chatml":
         modelInfo["im_end_id"] = tokenizer.im_end_id
         modelInfo["im_start_id"] = tokenizer.im_start_id
+    if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
+        print("chatglm3")
+        # chatglm3
+        modelInfo["pre_prompt"] = "";
+        modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
+        modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
+        modelInfo["history_sep"] = "";
     modelInfo["tokenizer_use_score"] = "1"  # tokenization with scores
...