norm / vllm · Commit 08043847 (unverified)

Start background task in `AsyncLLMEngine.generate` (#988)

Authored Sep 08, 2023 by Antoni Baum; committed by GitHub on Sep 08, 2023
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Parent: 4b5bcf89
Showing 4 changed files with 16 additions and 24 deletions:

tests/async_engine/api_server_async_engine.py  (+1, -2)
vllm/engine/async_llm_engine.py                (+13, -9)
vllm/entrypoints/api_server.py                 (+1, -5)
vllm/entrypoints/openai/api_server.py          (+1, -8)
tests/async_engine/api_server_async_engine.py

@@ -40,8 +40,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args,
-                                                      start_engine_loop=False)
+    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
     uvicorn.run(app,
vllm/engine/async_llm_engine.py

@@ -230,6 +230,8 @@ class AsyncLLMEngine:
             async frontend will be executed in a separate process as the
             model workers.
         log_requests: Whether to log the requests.
+        start_engine_loop: If True, the background task to run the engine
+            will be automatically started in the generate call.
         *args, *kwargs: Arguments for LLMEngine.
     """
 
@@ -240,7 +242,7 @@ class AsyncLLMEngine:
                  engine_use_ray: bool,
                  *args,
                  log_requests: bool = True,
-                 start_engine_loop: bool = False,
+                 start_engine_loop: bool = True,
                  **kwargs) -> None:
         self.worker_use_ray = worker_use_ray
         self.engine_use_ray = engine_use_ray
@@ -249,8 +251,7 @@ class AsyncLLMEngine:
 
         self.request_tracker: RequestTracker = RequestTracker()
         self.background_loop = None
-        if start_engine_loop:
-            self.start_background_loop()
+        self.start_engine_loop = start_engine_loop
 
     @property
     def is_running(self) -> bool:
@@ -330,6 +331,9 @@ class AsyncLLMEngine:
                         f"prompt token ids: {prompt_token_ids}.")
 
         if not self.is_running:
+            if self.start_engine_loop:
+                self.start_background_loop()
+            else:
                 raise AsyncEngineDeadError(
                     "Background loop is not running. If it was running, "
                     "inspect the output to find the stacktrace of the "
@@ -426,7 +430,7 @@ class AsyncLLMEngine:
     @classmethod
     def from_engine_args(cls,
                          engine_args: AsyncEngineArgs,
-                         start_engine_loop: bool = False) -> "AsyncLLMEngine":
+                         start_engine_loop: bool = True) -> "AsyncLLMEngine":
         """Creates an async LLM engine from the engine arguments."""
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
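Read together, these hunks move loop startup from construction time to the first `generate` call: `__init__` now only records `start_engine_loop` (default `True`) and leaves `background_loop` unset, and `generate` starts the background task itself when the loop is not yet running, raising `AsyncEngineDeadError` only if lazy startup was disabled. Below is a minimal self-contained sketch of that control flow; only the attribute and method names mirror the diff, the class body itself is illustrative and not the real `AsyncLLMEngine`:

import asyncio


class AsyncEngineDeadError(RuntimeError):
    pass


class LazyLoopEngine:
    """Illustrative stand-in for AsyncLLMEngine's lazy loop startup."""

    def __init__(self, start_engine_loop: bool = True) -> None:
        # The constructor only records the flag; no task is created here.
        self.start_engine_loop = start_engine_loop
        self.background_loop = None

    @property
    def is_running(self) -> bool:
        return (self.background_loop is not None
                and not self.background_loop.done())

    def start_background_loop(self) -> None:
        # Called from inside generate(), so the task is created on the
        # event loop that is currently serving requests.
        self.background_loop = asyncio.get_event_loop().create_task(
            self._engine_loop())

    async def _engine_loop(self) -> None:
        while True:
            await asyncio.sleep(0.01)  # stand-in for one engine step

    async def generate(self, prompt: str) -> str:
        if not self.is_running:
            if self.start_engine_loop:
                self.start_background_loop()
            else:
                raise AsyncEngineDeadError(
                    "Background loop is not running.")
        return f"output for {prompt!r}"


async def main() -> None:
    engine = LazyLoopEngine()                # no background task yet
    result = await engine.generate("hello")  # first call starts the loop
    assert engine.is_running
    print(result)


asyncio.run(main())

Because the task is created inside an async call, it lands on the event loop that is actually serving requests, which is consistent with the entrypoint changes below: they drop their manual `is_running` checks and loop startup entirely.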
vllm/entrypoints/api_server.py

@@ -32,9 +32,6 @@ async def generate(request: Request) -> Response:
     sampling_params = SamplingParams(**request_dict)
     request_id = random_uuid()
 
-    if not engine.is_running:
-        engine.start_background_loop()
-
     results_generator = engine.generate(prompt, sampling_params, request_id)
 
     # Streaming case
@@ -80,8 +77,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
     uvicorn.run(app,
                 host=args.host,
vllm/entrypoints/openai/api_server.py

@@ -192,9 +192,6 @@ async def create_chat_completion(request: ChatCompletionRequest,
     """
     logger.info(f"Received chat completion request: {request}")
 
-    if not engine.is_running:
-        engine.start_background_loop()
-
     error_check_ret = await check_model(request)
     if error_check_ret is not None:
         return error_check_ret
@@ -367,9 +364,6 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     """
     logger.info(f"Received completion request: {request}")
 
-    if not engine.is_running:
-        engine.start_background_loop()
-
     error_check_ret = await check_model(request)
     if error_check_ret is not None:
         return error_check_ret
@@ -627,8 +621,7 @@ if __name__ == "__main__":
     served_model = args.model
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngine.from_engine_args(engine_args,
-                                             start_engine_loop=False)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
     max_model_len = engine_model_config.get_max_model_len()
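With the default flipped to `True`, all three entrypoints collapse to the same pattern. Here is a sketch of the post-commit server wiring, condensed from the files in this diff; the `--host`/`--port` flags and their defaults are illustrative, and `AsyncEngineArgs.add_cli_args` is assumed to register the engine's CLI options as it does in vLLM's own entrypoints:

import argparse

import uvicorn

import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    # No start_engine_loop=False and no manual is_running checks in the
    # route handlers: the background task now starts lazily on the first
    # generate() call.
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
    uvicorn.run(vllm.entrypoints.api_server.app,
                host=args.host,
                port=args.port)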