chenych / chat_demo · Commits
Commit 3edf4e00, authored Aug 05, 2024 by chenych

    Add vllm stream chat code and update client.py

Parent: 020c2e2f
Showing 3 changed files, with 245 additions and 102 deletions:

    llm_service/client.py        +5    −2
    llm_service/inferencer.py    +152  −100
    llm_service/vllm_test.py     +88   −0
llm_service/client.py  (view file @ 3edf4e00)

@@ -4,6 +4,7 @@ import requests
 parse = argparse.ArgumentParser()
 parse.add_argument('--query', default='请写一首诗')
+parse.add_argument('--use_hf', action='store_true')
 args = parse.parse_args()
 print(args.query)

@@ -14,8 +15,10 @@ data = {
 }
 json_str = json.dumps(data)
-response = requests.post("http://localhost:8888/inference", headers=headers, data=json_str.encode("utf-8"), verify=False)
+if args.use_hf:
+    response = requests.post("http://localhost:8888/hf_inference", headers=headers, data=json_str.encode("utf-8"), verify=False)
+else:
+    response = requests.post("http://localhost:8888/vllm_inference", headers=headers, data=json_str.encode("utf-8"), verify=False)
 str_response = response.content.decode("utf-8")
 print(json.loads(str_response))
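For orientation, the sketch below mirrors what the updated client now does end to end. The exact headers and the shape of the request body are assumptions, since the lines defining headers and data are elided in this hunk; the 'query'/'history' field names and the {'text': ...} reply come from the handlers in llm_service/inferencer.py.

# Hypothetical end-to-end call against the service from inferencer.py,
# assumed to be listening on localhost:8888.
import json
import requests

payload = {"query": "请写一首诗", "history": []}   # field names taken from the inference handlers
headers = {"Content-Type": "application/json"}     # assumed; the real headers are not shown in this hunk

resp = requests.post(
    "http://localhost:8888/vllm_inference",        # or /hf_inference, which client.py selects via --use_hf
    headers=headers,
    data=json.dumps(payload).encode("utf-8"),
    verify=False,
)
print(json.loads(resp.content.decode("utf-8")))    # handlers reply with web.json_response({'text': ...})

On the command line this corresponds to `python client.py --query '...'` for the vLLM route and `python client.py --query '...' --use_hf` for the transformers route.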
llm_service/inferencer.py  (view file @ 3edf4e00)

@@ -8,8 +8,8 @@ import asyncio
 from loguru import logger
 from aiohttp import web
 # from multiprocessing import Value
-from transformers import AutoModelForCausalLM, Autotokenzier
+from transformers import AutoModelForCausalLM, AutoTokenizer
 COMMON = {
@@ -65,27 +65,39 @@ def build_history_messages(prompt, history, system: str = None):
     return history_messages

+def substitution(output_text):
+    # substitute special markers
+    import re
+    if isinstance(output_text, list):
+        output_text = output_text[0]
+    matchObj = re.split('.*(<.*>).*', output_text, re.M | re.I)
+    if len(matchObj) > 1:
+        obj = matchObj[1]
+        replace_str = COMMON.get(obj)
+        if replace_str:
+            output_text = output_text.replace(obj, replace_str)
+            logger.info(f"{obj} be replaced {replace_str}, after {output_text}")
+    return output_text

 class LLMInference:
     def __init__(
         self,
         model,
-        tokenizer,
         sampling_params,
+        tokenzier,
         device: str = 'cuda',
         use_vllm: bool = False,
     ) -> None:
         self.device = device
         self.model = model
-        self.tokenizer = tokenizer
         self.sampling_params = sampling_params
         self.use_vllm = use_vllm
+        self.tokenzier = tokenzier

     def generate_response(self, prompt, history=[]):
         print("generate")
         output_text = ''
         error = ''
-        time_tokenizer = time.time()
+        time_tokenzier = time.time()
         try:
             output_text = self.chat(prompt, history)
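The new module-level substitution() pulls the first <...> marker out of the generated text and maps it through the COMMON dict declared near the imports. COMMON's actual entries are not visible in this diff, so the mapping below is purely illustrative; note also that re.split's third positional parameter is maxsplit, so re.M | re.I is effectively passed as a maxsplit value rather than as flags.

# Illustrative only: COMMON's real contents are not shown in the diff.
import re

COMMON = {"<bot>": "assistant"}        # hypothetical marker-to-replacement mapping

def substitution(output_text):
    # same logic as the function above, logging omitted;
    # re.M | re.I lands in re.split's maxsplit slot, as in the original
    if isinstance(output_text, list):
        output_text = output_text[0]
    matchObj = re.split('.*(<.*>).*', output_text, re.M | re.I)
    if len(matchObj) > 1:
        obj = matchObj[1]
        replace_str = COMMON.get(obj)
        if replace_str:
            output_text = output_text.replace(obj, replace_str)
    return output_text

print(substitution("I am <bot>, nice to meet you"))   # -> "I am assistant, nice to meet you"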
@@ -96,47 +108,16 @@ class LLMInference:
         time_finish = time.time()
-        logger.debug('output_text:{} \n timecost {} '.format(output_text, time_finish - time_tokenizer))
+        logger.debug('output_text:{} \n timecost {} '.format(output_text, time_finish - time_tokenzier))
         return output_text, error

-    def substitution(self, output_text):
-        # substitute special markers
-        import re
-        matchObj = re.split('.*(<.*>).*', output_text, re.M | re.I)
-        if len(matchObj) > 1:
-            obj = matchObj[1]
-            replace_str = COMMON.get(obj)
-            if replace_str:
-                output_text = output_text.replace(obj, replace_str)
-                logger.info(f"{obj} be replaced {replace_str}, after {output_text}")
-        return output_text

-    def chat(self, prompt: str, history=[]):
+    def chat(self, messages, history=[]):
         '''single-turn Q&A'''
         logger.info("****************** in chat ******************")
-        messages = [{"role": "user", "content": prompt}]
         try:
             if self.use_vllm:
                 ## vllm
                 logger.info("****************** use vllm ******************")
                 prompt_token_ids = [self.tokenizer.apply_chat_template(messages, add_generation_prompt=True)]
                 logger.info(f"before generate {messages}")
                 outputs = self.model.generate(prompt_token_ids=prompt_token_ids, sampling_params=self.sampling_params)
                 output_text = []
                 for output in outputs:
                     prompt = output.prompt
                     generated_text = output.outputs[0].text
                     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
                     generated_text_ = self.substitution(generated_text)
                     output_text.append(generated_text_)
                 logger.info(f"using vllm, output_text {output_text}")
                 return ''.join(output_text)
             else:
                 # transformers
-                input_ids = self.tokenizer.apply_chat_template(
+                input_ids = self.tokenzier.apply_chat_template(
                     messages, add_generation_prompt=True, return_tensors="pt").to('cuda')
                 outputs = self.model.generate(input_ids,
@@ -144,9 +125,9 @@ class LLMInference:
                 )
                 response = outputs[0][input_ids.shape[-1]:]
-                generated_text = self.tokenizer.decode(response, skip_special_tokens=True)
+                generated_text = self.tokenzier.decode(response, skip_special_tokens=True)
-                output_text = self.substitution(generated_text)
+                output_text = substitution(generated_text)
                 logger.info(f"using transformers, output_text {output_text}")
                 return output_text
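The two branches above call apply_chat_template in different modes: the vLLM path asks for token ids to pass as prompt_token_ids, the transformers path asks for a PyTorch tensor to feed model.generate, and vllm_inference further below builds a plain prompt string with tokenize=False. A minimal sketch of the three call styles, assuming a chat-tuned model whose tokenizer ships a chat template; the checkpoint name is only an example, not the one used in this repo.

# Illustration of the apply_chat_template call styles used in this file;
# "Qwen/Qwen2-7B-Instruct" is just an example checkpoint.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct", trust_remote_code=True)
messages = [{"role": "user", "content": "请写一首诗"}]

token_ids = tok.apply_chat_template(messages, add_generation_prompt=True)           # list[int], vLLM prompt_token_ids style
tensor_ids = tok.apply_chat_template(messages, add_generation_prompt=True,
                                     return_tensors="pt")                           # tensor for transformers model.generate
prompt_str = tok.apply_chat_template(messages, tokenize=False,
                                     add_generation_prompt=True)                    # plain string, used by vllm_inference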
@@ -154,86 +135,143 @@ class LLMInference:
logger
.
error
(
f
"chat inference failed,
{
e
}
"
)
def
chat_stream
(
self
,
prompt
:
str
,
history
=
[]):
def
chat_stream
(
self
,
messages
,
history
=
[]):
'''流式服务'''
# HuggingFace
logger
.
info
(
"****************** in chat stream *****************"
)
current_length
=
0
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
}]
logger
.
info
(
f
"stream_chat messages
{
messages
}
"
)
for
response
,
_
,
_
in
self
.
model
.
stream_chat
(
self
.
token
i
zer
,
messages
,
history
=
history
,
for
response
,
_
,
_
in
self
.
model
.
stream_chat
(
self
.
tokenz
i
er
,
messages
,
history
=
history
,
max_length
=
1024
,
past_key_values
=
None
,
return_past_key_values
=
True
):
output_text
=
response
[
current_length
:]
output_text
=
self
.
substitution
(
output_text
)
logger
.
info
(
f
"using transformers chat_stream, Prompt:
{
prompt
!
r
}
, Generated text:
{
output_text
!
r
}
"
)
output_text
=
substitution
(
output_text
)
logger
.
info
(
f
"using transformers chat_stream, Prompt:
{
messages
!
r
}
, Generated text:
{
output_text
!
r
}
"
)
yield
output_text
current_length
=
len
(
response
)
-def init_model(model_path, use_vllm=False, tp_size=1):
+def init_model(model_path, use_vllm=False, tensor_parallel_size=1):
     ## init models
     # huggingface
     logger.info("Starting initial model of Llama")
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    if use_vllm:
-        # vllm
-        from vllm import LLM, SamplingParams
-        logger.info("Starting initial model of LLM")
+    tokenzier = Autotokenzier.from_pretrained(model_path, trust_remote_code=True)
+    if use_vllm:
+        from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
         sampling_params = SamplingParams(temperature=1, top_p=0.95, max_tokens=1024,
-                                         stop_token_ids=[tokenizer.eos_token_id])
+                                         early_stopping=False,
+                                         stop_token_ids=[tokenzier.eos_token_id])
+        # vLLM base configuration
+        args = AsyncEngineArgs(model_path)
+        args.worker_use_ray = False
+        args.engine_use_ray = False
+        args.tokenzier = model_path
+        args.tensor_parallel_size = tensor_parallel_size
+        args.trust_remote_code = True
+        args.enforce_eager = True
+        args.max_model_len = 1024
+        args.dtype = 'float16'
+        # load the model
+        engine = AsyncLLMEngine.from_engine_args(args)
+        return engine, tokenzier, sampling_params
+    else:
+        # huggingface
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda().eval()
+        return model, tokenzier, None
-        model = LLM(model=model_path, trust_remote_code=True, enforce_eager=True, dtype="float16", tensor_parallel_size=tp_size)
-    return model, tokenizer, sampling_params
 def hf_inference(bind_port, model, tokenzier, stream_chat):
     '''Start the HF web server, receive HTTP requests, and generate responses by calling the local LLM inference service.'''
     llm_infer = LLMInference(model, tokenzier)

     async def inference(request):
         start = time.time()
         input_json = await request.json()
         prompt = input_json['query']
         history = input_json['history']
         messages = [{"role": "user", "content": prompt}]
         logger.info("****************** use transformers ******************")
         if stream_chat:
             text = await asyncio.to_thread(llm_infer.chat_stream, messages=messages, history=history)
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda().eval()
-            return model, tokenizer, None
             text = await asyncio.to_thread(llm_infer.chat, messages=messages, history=history)
         logger.debug('问题:{} 回答:{} \n timecost {} '.format(prompt, text, time.time() - start))
         return web.json_response({'text': text})
 def llm_inference(args):
     '''Start the web server, receive HTTP requests, and generate responses by calling the local LLM inference service.'''
     config = configparser.ConfigParser()
     config.read(args.config_path)

     app = web.Application()
     app.add_routes([web.post('/hf_inference', inference)])
     web.run_app(app, host='0.0.0.0', port=bind_port)

     bind_port = int(config['default']['bind_port'])
     model_path = config['llm']['local_llm_path']
     tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
     use_vllm = config.getboolean('llm', 'use_vllm')
     stream_chat = config.getboolean('llm', 'stream_chat')
     logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
     model, tokenzier, sampling_params = init_model(model_path, use_vllm, tensor_parallel_size)
 def vllm_inference(bind_port, model, tokenzier, sampling_params, stream_chat):
     '''Start the web server, receive HTTP requests, and generate responses by calling the local LLM inference service.'''
     import uuid
     from typing import AsyncGenerator
     from fastapi.responses import StreamingResponse

     async def inference(request):
         start = time.time()
         input_json = await request.json()
         llm_infer = LLMInference(model, tokenzier, sampling_params, use_vllm=use_vllm)
         prompt = input_json['query']
         history = input_json['history']
         logger.info(f"prompt {prompt}")
         # history = input_json['history']
         messages = [{"role": "user", "content": prompt}]
         logger.info("****************** use vllm ******************")
         ## generate template
         input_text = tokenzier.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         logger.info(f"The input_text is {input_text}")
         assert model is not None
         request_id = str(uuid.uuid4().hex)
         results_generator = model.generate(input_text, sampling_params=sampling_params, request_id=request_id)

         # Streaming case
         async def stream_results() -> AsyncGenerator[bytes, None]:
             async for request_output in results_generator:
                 text_outputs = [output.text for output in request_output.outputs]
                 ret = {"text": text_outputs}
                 print(ret)
                 # yield (json.dumps(ret) + "\0").encode("utf-8")
                 yield web.json_response({'text': text})

         if stream_chat:
-            text = await asyncio.to_thread(llm_infer.chat_stream, prompt=prompt, history=history)
-        else:
-            text = await asyncio.to_thread(llm_infer.chat, prompt=prompt, history=history)
             logger.info("****************** in chat stream *****************")
             return StreamingResponse(stream_results())

         # Non-streaming case
         logger.info("****************** in chat ******************")
         final_output = None
         async for request_output in results_generator:
             # if await request.is_disconnected():
             #     # Abort the request if the client disconnects.
             #     await engine.abort(request_id)
             #     return Response(status_code=499)
             final_output = request_output
         assert final_output is not None
         text = [output.text for output in final_output.outputs]
         end = time.time()
         output_text = substitution(text)
         logger.debug('问题:{} 回答:{} \n timecost {} '.format(prompt, text, end - start))
-        return web.json_response({'text': text})
+        return web.json_response({'text': output_text})

     app = web.Application()
-    app.add_routes([web.post('/inference', inference)])
+    app.add_routes([web.post('/vllm_inference', inference)])
     web.run_app(app, host='0.0.0.0', port=bind_port)
@@ -292,7 +330,21 @@ def parse_args():
 def main():
     args = parse_args()
     set_envs(args.DCU_ID)
-    llm_inference(args)
+    # configs
+    config = configparser.ConfigParser()
+    config.read(args.config_path)
+    bind_port = int(config['default']['bind_port'])
+    model_path = config['llm']['local_llm_path']
+    use_vllm = config.getboolean('llm', 'use_vllm')
+    tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
+    stream_chat = config.getboolean('llm', 'stream_chat')
+    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
+    model, tokenzier, sampling_params = init_model(model_path, use_vllm, tensor_parallel_size)
+    if use_vllm:
+        vllm_inference(bind_port, model, tokenzier, sampling_params, stream_chat)
+    else:
+        hf_inference(bind_port, model, tokenzier, sampling_params, stream_chat)
     # infer_test(args)
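For reference, main() expects args.config_path to point at an INI file with a [default] section and an [llm] section; the section and key names below are taken from the config reads above, while the concrete values are placeholders (the port only echoes the localhost:8888 used by client.py). A minimal sketch of producing a compatible file with configparser:

# Minimal sketch of the config file read by main(); only the section and key
# names are grounded in the code above, the values are placeholders.
import configparser

config = configparser.ConfigParser()
config['default'] = {'bind_port': '8888'}              # port assumed from client.py
config['llm'] = {
    'local_llm_path': '/path/to/your/model',           # placeholder model directory
    'use_vllm': 'true',                                 # read with getboolean
    'tensor_parallel_size': '1',                        # read with getint
    'stream_chat': 'false',                             # read with getboolean
}
with open('config.ini', 'w') as f:
    config.write(f)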
llm_service/vllm_test.py  (new file, 0 → 100644, view file @ 3edf4e00)

def init_model(model_path, use_vllm=False, tensor_parallel_size=1):
    ## init models
    # huggingface
    logger.info("Starting initial model of Llama - vllm")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # vllm
    sampling_params = SamplingParams(temperature=1, top_p=0.95, max_tokens=1024,
                                     early_stopping=False,
                                     stop_token_ids=[tokenizer.eos_token_id])
    # vLLM base configuration
    args = AsyncEngineArgs(model_path)
    args.worker_use_ray = False
    args.engine_use_ray = False
    args.tokenizer = model_path
    args.tensor_parallel_size = tensor_parallel_size
    args.trust_remote_code = True
    args.enforce_eager = True
    args.max_model_len = 1024
    args.dtype = 'float16'
    # load the model
    engine = AsyncLLMEngine.from_engine_args(args)
    return engine, tokenizer, sampling_params


def llm_inference(args):
    '''Start the web server, receive HTTP requests, and generate responses by calling the local LLM inference service.'''
    config = configparser.ConfigParser()
    config.read(args.config_path)
    bind_port = int(config['default']['bind_port'])
    model_path = config['llm']['local_llm_path']
    tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
    use_vllm = config.getboolean('llm', 'use_vllm')
    stream_chat = config.getboolean('llm', 'stream_chat')
    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
    model, tokenizer, sampling_params = init_model(model_path, tensor_parallel_size)

    async def inference(request):
        start = time.time()
        input_json = await request.json()
        prompt = input_json['query']
        history = input_json['history']
        messages = [{"role": "user", "content": prompt}]
        logger.info("****************** use vllm ******************")
        logger.info(f"before generate {messages}")
        ## 1
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print(text)
        assert model is not None
        request_id = str(uuid.uuid4().hex)
        results_generator = model.generate(inputs=text, sampling_params=sampling_params, request_id=request_id)

        # Streaming case
        async def stream_results() -> AsyncGenerator[bytes, None]:
            async for request_output in results_generator:
                prompt = request_output.prompt
                text_outputs = [output.text for output in request_output.outputs]
                ret = {"text": text_outputs}
                yield (json.dumps(ret) + "\0").encode("utf-8")

        if stream_chat:
            return StreamingResponse(stream_results())

        # Non-streaming case
        final_output = None
        async for request_output in results_generator:
            # if await request.is_disconnected():
            #     # Abort the request if the client disconnects.
            #     await engine.abort(request_id)
            #     return Response(status_code=499)
            final_output = request_output
        assert final_output is not None
        text = [output.text for output in final_output.outputs]
        end = time.time()
        logger.debug('问题:{} 回答:{} \n timecost {} '.format(prompt, text, end - start))
        return web.json_response({'text': text})

    app = web.Application()
    app.add_routes([web.post('/inference', inference)])
    web.run_app(app, host='0.0.0.0', port=bind_port)
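The rendered diff shows only the function bodies; the file presumably also carries the usual imports (uuid, json, time, configparser, asyncio, loguru's logger, aiohttp's web, vllm's AsyncEngineArgs/AsyncLLMEngine/SamplingParams, transformers' AutoTokenizer, typing's AsyncGenerator, fastapi's StreamingResponse), which are not visible here. Since stream_results() yields null-byte-delimited JSON chunks, a client has to split the response body on "\0" rather than on newlines. A minimal consumption sketch, assuming the service above listens on localhost:8888, stream_chat is enabled in the config, and the streaming route behaves as intended (note that returning fastapi's StreamingResponse from an aiohttp handler may need adjustment in practice):

# Hypothetical streaming client for the /inference route above; it splits the
# body on the "\0" delimiter that stream_results() appends to each JSON chunk.
import json
import requests

payload = {"query": "请写一首诗", "history": []}
with requests.post("http://localhost:8888/inference", json=payload, stream=True) as resp:
    buffer = b""
    for chunk in resp.iter_content(chunk_size=None):
        buffer += chunk
        while b"\0" in buffer:
            piece, buffer = buffer.split(b"\0", 1)
            if piece:
                print(json.loads(piece.decode("utf-8"))["text"])   # list with the text generated so far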