Commit 08043847 (unverified)
Authored Sep 08, 2023 by Antoni Baum, committed by GitHub on Sep 08, 2023

Start background task in `AsyncLLMEngine.generate` (#988)

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>

Parent: 4b5bcf89
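The practical effect for callers: `start_engine_loop` now defaults to `True`, and `AsyncLLMEngine.generate` starts the engine's background task itself the first time it is called, so entrypoints no longer need an explicit `start_background_loop()` call before submitting requests. A minimal usage sketch under that assumption (the model name and request id below are illustrative, not part of the commit):

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    # start_engine_loop defaults to True after this commit, so no explicit
    # engine.start_background_loop() call is needed before generate().
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))  # illustrative model

    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
    final_output = None
    # generate() is an async generator; its first call lazily starts the
    # background engine loop inside the already-running event loop.
    async for output in engine.generate("Hello, my name is",
                                        sampling_params,
                                        request_id="example-request-0"):
        final_output = output
    print(final_output.outputs[0].text)


asyncio.run(main())
```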
Changes: showing 4 changed files with 16 additions and 24 deletions (+16 −24)
tests/async_engine/api_server_async_engine.py   +1  −2
vllm/engine/async_llm_engine.py                 +13 −9
vllm/entrypoints/api_server.py                  +1  −5
vllm/entrypoints/openai/api_server.py           +1  −8
tests/async_engine/api_server_async_engine.py

@@ -40,8 +40,7 @@ if __name__ == "__main__":
     args = parser.parse_args()

     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args,
-                                                      start_engine_loop=False)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
     uvicorn.run(app,
vllm/engine/async_llm_engine.py

@@ -230,6 +230,8 @@ class AsyncLLMEngine:
            async frontend will be executed in a separate process as the
            model workers.
        log_requests: Whether to log the requests.
+        start_engine_loop: If True, the background task to run the engine
+            will be automatically started in the generate call.
        *args, *kwargs: Arguments for LLMEngine.
    """

@@ -240,7 +242,7 @@ class AsyncLLMEngine:
                 engine_use_ray: bool,
                 *args,
                 log_requests: bool = True,
-                 start_engine_loop: bool = False,
+                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray

@@ -249,8 +251,7 @@ class AsyncLLMEngine:
        self.request_tracker: RequestTracker = RequestTracker()
        self.background_loop = None
-        if start_engine_loop:
-            self.start_background_loop()
+        self.start_engine_loop = start_engine_loop

    @property
    def is_running(self) -> bool:

@@ -330,11 +331,14 @@ class AsyncLLMEngine:
                        f"prompt token ids: {prompt_token_ids}.")

        if not self.is_running:
-            raise AsyncEngineDeadError(
-                "Background loop is not running. If it was running, "
-                "inspect the output to find the stacktrace of the "
-                "error that caused the background loop to stop "
-                "(AsyncEngineDeadError).")
+            if self.start_engine_loop:
+                self.start_background_loop()
+            else:
+                raise AsyncEngineDeadError(
+                    "Background loop is not running. If it was running, "
+                    "inspect the output to find the stacktrace of the "
+                    "error that caused the background loop to stop "
+                    "(AsyncEngineDeadError).")

        stream = self.request_tracker.add_request(
            request_id,

@@ -426,7 +430,7 @@ class AsyncLLMEngine:
    @classmethod
    def from_engine_args(cls,
                         engine_args: AsyncEngineArgs,
-                         start_engine_loop: bool = False) -> "AsyncLLMEngine":
+                         start_engine_loop: bool = True) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
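Why defer the start to `generate()`: the entrypoint scripts construct the engine at module scope, before uvicorn's event loop is running, so a background task created eagerly in `__init__` could bind to the wrong loop or fail outright. Starting it lazily from inside the first `generate()` call guarantees a running event loop. A minimal sketch of that deferred-start pattern in general terms (an illustration of the idea, not vllm's actual `start_background_loop` implementation):

```python
import asyncio
from typing import AsyncIterator, Optional


class LazyLoopEngine:
    """Toy engine that starts its background task on first use."""

    def __init__(self, start_engine_loop: bool = True) -> None:
        self.start_engine_loop = start_engine_loop
        self.background_loop: Optional[asyncio.Task] = None

    @property
    def is_running(self) -> bool:
        return (self.background_loop is not None
                and not self.background_loop.done())

    def start_background_loop(self) -> None:
        # create_task requires a running event loop, so this can only be
        # called from async code (e.g. inside generate()).
        self.background_loop = asyncio.get_event_loop().create_task(
            self._run_engine_loop())

    async def _run_engine_loop(self) -> None:
        while True:
            # Stand-in for stepping the engine and pushing results to streams.
            await asyncio.sleep(0.1)

    async def generate(self, prompt: str) -> AsyncIterator[str]:
        if not self.is_running:
            if self.start_engine_loop:
                # Lazy start: an event loop is guaranteed to be running here.
                self.start_background_loop()
            else:
                raise RuntimeError("Background loop is not running.")
        yield f"echo: {prompt}"
```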
vllm/entrypoints/api_server.py

@@ -32,9 +32,6 @@ async def generate(request: Request) -> Response:
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()
-    if not engine.is_running:
-        engine.start_background_loop()
    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case

@@ -80,8 +77,7 @@ if __name__ == "__main__":
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
    uvicorn.run(app,
                host=args.host,
vllm/entrypoints/openai/api_server.py

@@ -192,9 +192,6 @@ async def create_chat_completion(request: ChatCompletionRequest,
    """
    logger.info(f"Received chat completion request: {request}")
-    if not engine.is_running:
-        engine.start_background_loop()
    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

@@ -367,9 +364,6 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
    """
    logger.info(f"Received completion request: {request}")
-    if not engine.is_running:
-        engine.start_background_loop()
    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

@@ -627,8 +621,7 @@ if __name__ == "__main__":
    served_model = args.model
    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
    max_model_len = engine_model_config.get_max_model_len()
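Callers that want to keep the old behavior can still opt out: pass `start_engine_loop=False` and start the loop explicitly from async code before the first request; on that path `generate()` still raises `AsyncEngineDeadError` if the loop was never started. A rough sketch of the opt-out usage, with an illustrative model name and request id:

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"),  # illustrative model
        start_engine_loop=False)

    # With the automatic start disabled, the loop must be started manually
    # (from inside a running event loop) before generate() is called;
    # otherwise generate() raises AsyncEngineDeadError.
    engine.start_background_loop()

    async for output in engine.generate("Hello",
                                        SamplingParams(max_tokens=8),
                                        request_id="opt-out-0"):
        pass


asyncio.run(main())
```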