Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
23486039
Unverified
Commit
23486039
authored
Mar 10, 2026
by
Mark McLoughlin
Committed by
GitHub
Mar 10, 2026
Browse files
[Frontend][Core] Revert "Add shutdown timeout" (#34730 and #36270) (#36628)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
c8851008
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
95 additions
and
761 deletions
+95
-761
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+0
-459
tests/entrypoints/test_api_server_process_manager.py
tests/entrypoints/test_api_server_process_manager.py
+7
-15
vllm/config/vllm.py
vllm/config/vllm.py
+0
-6
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+0
-11
vllm/engine/protocol.py
vllm/engine/protocol.py
+0
-5
vllm/entrypoints/cli/serve.py
vllm/entrypoints/cli/serve.py
+6
-42
vllm/entrypoints/launcher.py
vllm/entrypoints/launcher.py
+5
-23
vllm/v1/engine/__init__.py
vllm/v1/engine/__init__.py
+0
-2
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+3
-2
vllm/v1/engine/coordinator.py
vllm/v1/engine/coordinator.py
+2
-4
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+43
-127
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+12
-12
vllm/v1/engine/utils.py
vllm/v1/engine/utils.py
+5
-34
vllm/v1/utils.py
vllm/v1/utils.py
+12
-19
No files found.
tests/entrypoints/openai/test_shutdown.py
View file @
23486039
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for shutdown behavior, timeout, and signal handling."""
import
asyncio
import
signal
import
signal
import
subprocess
import
subprocess
import
sys
import
sys
import
time
import
time
from
dataclasses
import
dataclass
,
field
import
httpx
import
openai
import
openai
import
psutil
import
pytest
import
pytest
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.network_utils
import
get_open_port
...
@@ -24,101 +18,6 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
...
@@ -24,101 +18,6 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
_IS_ROCM
=
current_platform
.
is_rocm
()
_IS_ROCM
=
current_platform
.
is_rocm
()
_SERVER_STARTUP_TIMEOUT
=
120
_SERVER_STARTUP_TIMEOUT
=
120
_PROCESS_EXIT_TIMEOUT
=
15
_PROCESS_EXIT_TIMEOUT
=
15
_SHUTDOWN_DETECTION_TIMEOUT
=
10
_CHILD_CLEANUP_TIMEOUT
=
10
def
_get_child_pids
(
parent_pid
:
int
)
->
list
[
int
]:
try
:
parent
=
psutil
.
Process
(
parent_pid
)
return
[
c
.
pid
for
c
in
parent
.
children
(
recursive
=
True
)]
except
psutil
.
NoSuchProcess
:
return
[]
async
def
_assert_children_cleaned_up
(
child_pids
:
list
[
int
],
timeout
:
float
=
_CHILD_CLEANUP_TIMEOUT
,
):
"""Wait for child processes to exit and fail if any remain."""
if
not
child_pids
:
return
deadline
=
time
.
time
()
+
timeout
while
time
.
time
()
<
deadline
:
still_alive
=
[]
for
pid
in
child_pids
:
try
:
p
=
psutil
.
Process
(
pid
)
if
p
.
is_running
()
and
p
.
status
()
!=
psutil
.
STATUS_ZOMBIE
:
still_alive
.
append
(
pid
)
except
psutil
.
NoSuchProcess
:
pass
if
not
still_alive
:
return
await
asyncio
.
sleep
(
0.5
)
pytest
.
fail
(
f
"Child processes
{
still_alive
}
still alive after
{
timeout
}
s. "
f
"Process cleanup may not be working correctly."
)
@
dataclass
class
ShutdownState
:
got_503
:
bool
=
False
got_500
:
bool
=
False
requests_after_sigterm
:
int
=
0
aborted_requests
:
int
=
0
connection_errors
:
int
=
0
stop_requesting
:
bool
=
False
errors
:
list
[
str
]
=
field
(
default_factory
=
list
)
async
def
_concurrent_request_loop
(
client
:
openai
.
AsyncOpenAI
,
state
:
ShutdownState
,
sigterm_sent
:
asyncio
.
Event
|
None
=
None
,
concurrency
:
int
=
10
,
):
"""Run multiple concurrent requests to keep the server busy."""
async
def
single_request
():
while
not
state
.
stop_requesting
:
try
:
response
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Write a story: "
,
max_tokens
=
200
,
)
if
sigterm_sent
is
not
None
and
sigterm_sent
.
is_set
():
state
.
requests_after_sigterm
+=
1
# Check if any choice has finish_reason='abort'
if
any
(
choice
.
finish_reason
==
"abort"
for
choice
in
response
.
choices
):
state
.
aborted_requests
+=
1
except
openai
.
APIStatusError
as
e
:
if
e
.
status_code
==
503
:
state
.
got_503
=
True
elif
e
.
status_code
==
500
:
state
.
got_500
=
True
else
:
state
.
errors
.
append
(
f
"API error:
{
e
}
"
)
except
(
openai
.
APIConnectionError
,
httpx
.
RemoteProtocolError
):
state
.
connection_errors
+=
1
if
sigterm_sent
is
not
None
and
sigterm_sent
.
is_set
():
break
except
Exception
as
e
:
state
.
errors
.
append
(
f
"Unexpected error:
{
e
}
"
)
break
await
asyncio
.
sleep
(
0.01
)
tasks
=
[
asyncio
.
create_task
(
single_request
())
for
_
in
range
(
concurrency
)]
try
:
await
asyncio
.
gather
(
*
tasks
,
return_exceptions
=
True
)
finally
:
for
t
in
tasks
:
if
not
t
.
done
():
t
.
cancel
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -204,361 +103,3 @@ async def test_shutdown_on_engine_failure():
...
@@ -204,361 +103,3 @@ async def test_shutdown_on_engine_failure():
return_code
=
proc
.
wait
(
timeout
=
_PROCESS_EXIT_TIMEOUT
)
return_code
=
proc
.
wait
(
timeout
=
_PROCESS_EXIT_TIMEOUT
)
assert
return_code
is
not
None
assert
return_code
is
not
None
@
pytest
.
mark
.
asyncio
async
def
test_wait_timeout_completes_requests
():
"""Verify wait timeout: new requests rejected, in-flight requests complete."""
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
"30"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
state
=
ShutdownState
()
sigterm_sent
=
asyncio
.
Event
()
request_task
=
asyncio
.
create_task
(
_concurrent_request_loop
(
client
,
state
,
sigterm_sent
,
concurrency
=
10
)
)
await
asyncio
.
sleep
(
0.5
)
proc
.
send_signal
(
signal
.
SIGTERM
)
sigterm_sent
.
set
()
try
:
await
asyncio
.
wait_for
(
request_task
,
timeout
=
_SHUTDOWN_DETECTION_TIMEOUT
)
except
asyncio
.
TimeoutError
:
pass
finally
:
state
.
stop_requesting
=
True
if
not
request_task
.
done
():
request_task
.
cancel
()
await
asyncio
.
gather
(
request_task
,
return_exceptions
=
True
)
# wait timeout should complete in-flight requests
assert
state
.
requests_after_sigterm
>
0
,
(
f
"Wait timeout should complete in-flight requests. "
f
"503:
{
state
.
got_503
}
, 500:
{
state
.
got_500
}
, "
f
"conn_errors:
{
state
.
connection_errors
}
, errors:
{
state
.
errors
}
"
)
# server must stop accepting new requests (503, 500, or connection close)
assert
state
.
got_503
or
state
.
got_500
or
state
.
connection_errors
>
0
,
(
f
"Server should stop accepting requests. "
f
"completed:
{
state
.
requests_after_sigterm
}
, errors:
{
state
.
errors
}
"
)
await
_assert_children_cleaned_up
(
child_pids
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"wait_for_engine_idle"
,
[
0.0
,
2.0
])
async
def
test_abort_timeout_exits_quickly
(
wait_for_engine_idle
:
float
):
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
"0"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
)
as
remote_server
:
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
if
wait_for_engine_idle
>
0
:
client
=
remote_server
.
get_async_client
()
# Send requests to ensure engine is fully initialized
for
_
in
range
(
2
):
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Test request: "
,
max_tokens
=
10
,
)
# Wait for engine to become idle
await
asyncio
.
sleep
(
wait_for_engine_idle
)
start_time
=
time
.
time
()
proc
.
send_signal
(
signal
.
SIGTERM
)
# abort timeout (0) should exit promptly
for
_
in
range
(
20
):
if
proc
.
poll
()
is
not
None
:
break
time
.
sleep
(
0.1
)
if
proc
.
poll
()
is
None
:
proc
.
kill
()
proc
.
wait
(
timeout
=
5
)
pytest
.
fail
(
"Process did not exit after SIGTERM with abort timeout"
)
exit_time
=
time
.
time
()
-
start_time
assert
exit_time
<
2
,
f
"Default shutdown took too long:
{
exit_time
:.
1
f
}
s"
assert
proc
.
returncode
in
(
0
,
-
15
,
None
),
f
"Unexpected:
{
proc
.
returncode
}
"
await
_assert_children_cleaned_up
(
child_pids
)
@
pytest
.
mark
.
asyncio
async
def
test_wait_timeout_with_short_duration
():
"""Verify server exits cleanly with a short wait timeout."""
wait_timeout
=
3
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
str
(
wait_timeout
),
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
state
=
ShutdownState
()
request_task
=
asyncio
.
create_task
(
_concurrent_request_loop
(
client
,
state
,
concurrency
=
3
)
)
await
asyncio
.
sleep
(
0.5
)
start_time
=
time
.
time
()
proc
.
send_signal
(
signal
.
SIGTERM
)
# server should exit within wait_timeout + buffer
max_wait
=
wait_timeout
+
15
for
_
in
range
(
int
(
max_wait
*
10
)):
if
proc
.
poll
()
is
not
None
:
break
time
.
sleep
(
0.1
)
exit_time
=
time
.
time
()
-
start_time
state
.
stop_requesting
=
True
if
not
request_task
.
done
():
request_task
.
cancel
()
await
asyncio
.
gather
(
request_task
,
return_exceptions
=
True
)
if
proc
.
poll
()
is
None
:
proc
.
kill
()
proc
.
wait
(
timeout
=
5
)
pytest
.
fail
(
f
"Process did not exit within
{
max_wait
}
s after SIGTERM"
)
assert
exit_time
<
wait_timeout
+
10
,
(
f
"Took too long to exit (
{
exit_time
:.
1
f
}
s), expected <
{
wait_timeout
+
10
}
s"
)
assert
proc
.
returncode
in
(
0
,
-
15
,
None
),
f
"Unexpected:
{
proc
.
returncode
}
"
await
_assert_children_cleaned_up
(
child_pids
)
@
pytest
.
mark
.
asyncio
async
def
test_abort_timeout_fails_inflight_requests
():
"""Verify abort timeout (0) immediately aborts in-flight requests."""
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
"0"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
state
=
ShutdownState
()
sigterm_sent
=
asyncio
.
Event
()
request_task
=
asyncio
.
create_task
(
_concurrent_request_loop
(
client
,
state
,
sigterm_sent
,
concurrency
=
10
)
)
await
asyncio
.
sleep
(
0.5
)
proc
.
send_signal
(
signal
.
SIGTERM
)
sigterm_sent
.
set
()
try
:
await
asyncio
.
wait_for
(
request_task
,
timeout
=
5
)
except
asyncio
.
TimeoutError
:
pass
finally
:
state
.
stop_requesting
=
True
if
not
request_task
.
done
():
request_task
.
cancel
()
await
asyncio
.
gather
(
request_task
,
return_exceptions
=
True
)
# With abort timeout (0), requests should be aborted (finish_reason='abort')
# or rejected (connection errors or API errors)
assert
(
state
.
aborted_requests
>
0
or
state
.
connection_errors
>
0
or
state
.
got_500
or
state
.
got_503
),
(
f
"Abort timeout should cause request aborts or failures. "
f
"aborted:
{
state
.
aborted_requests
}
, "
f
"503:
{
state
.
got_503
}
, 500:
{
state
.
got_500
}
, "
f
"conn_errors:
{
state
.
connection_errors
}
, "
f
"completed:
{
state
.
requests_after_sigterm
}
"
)
# Verify fast shutdown
start_time
=
time
.
time
()
for
_
in
range
(
100
):
if
proc
.
poll
()
is
not
None
:
break
time
.
sleep
(
0.1
)
exit_time
=
time
.
time
()
-
start_time
assert
exit_time
<
10
,
f
"Abort timeout shutdown took too long:
{
exit_time
:.
1
f
}
s"
await
_assert_children_cleaned_up
(
child_pids
)
@
pytest
.
mark
.
asyncio
async
def
test_request_rejection_during_shutdown
():
"""Verify new requests are rejected with error during shutdown."""
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
"30"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
proc
.
send_signal
(
signal
.
SIGTERM
)
await
asyncio
.
sleep
(
1.0
)
# Try to send new requests - they should be rejected
rejected_count
=
0
for
_
in
range
(
10
):
try
:
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Hello"
,
max_tokens
=
10
)
except
(
openai
.
APIStatusError
,
openai
.
APIConnectionError
,
httpx
.
RemoteProtocolError
,
):
rejected_count
+=
1
await
asyncio
.
sleep
(
0.1
)
assert
rejected_count
>
0
,
(
f
"Expected requests to be rejected during shutdown, "
f
"but
{
rejected_count
}
were rejected out of 10"
)
await
_assert_children_cleaned_up
(
child_pids
)
@
pytest
.
mark
.
asyncio
async
def
test_multi_api_server_shutdown
():
"""Verify shutdown works with multiple API servers."""
server_args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"256"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.05"
,
"--max-num-seqs"
,
"4"
,
"--shutdown-timeout"
,
"30"
,
"--api-server-count"
,
"2"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
server_args
,
auto_port
=
True
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
proc
=
remote_server
.
proc
child_pids
=
_get_child_pids
(
proc
.
pid
)
assert
len
(
child_pids
)
>=
2
,
(
f
"Expected at least 2 child processes, got
{
len
(
child_pids
)
}
"
)
state
=
ShutdownState
()
sigterm_sent
=
asyncio
.
Event
()
# Start concurrent requests across both API servers
request_task
=
asyncio
.
create_task
(
_concurrent_request_loop
(
client
,
state
,
sigterm_sent
,
concurrency
=
8
)
)
await
asyncio
.
sleep
(
0.5
)
# Send SIGTERM to parent - should propagate to all children
proc
.
send_signal
(
signal
.
SIGTERM
)
sigterm_sent
.
set
()
try
:
await
asyncio
.
wait_for
(
request_task
,
timeout
=
_SHUTDOWN_DETECTION_TIMEOUT
)
except
asyncio
.
TimeoutError
:
pass
finally
:
state
.
stop_requesting
=
True
if
not
request_task
.
done
():
request_task
.
cancel
()
await
asyncio
.
gather
(
request_task
,
return_exceptions
=
True
)
for
_
in
range
(
300
):
# up to 30 seconds
if
proc
.
poll
()
is
not
None
:
break
time
.
sleep
(
0.1
)
if
proc
.
poll
()
is
None
:
proc
.
kill
()
proc
.
wait
(
timeout
=
5
)
pytest
.
fail
(
"Process did not exit after SIGTERM"
)
await
_assert_children_cleaned_up
(
child_pids
)
tests/entrypoints/test_api_server_process_manager.py
View file @
23486039
...
@@ -79,7 +79,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
...
@@ -79,7 +79,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
finally
:
finally
:
# Always clean up the processes
# Always clean up the processes
print
(
"Cleaning up processes..."
)
print
(
"Cleaning up processes..."
)
manager
.
shutdown
()
manager
.
close
()
# Give processes time to terminate
# Give processes time to terminate
time
.
sleep
(
0.2
)
time
.
sleep
(
0.2
)
...
@@ -111,8 +111,6 @@ def test_wait_for_completion_or_failure(api_server_args):
...
@@ -111,8 +111,6 @@ def test_wait_for_completion_or_failure(api_server_args):
wait_for_completion_or_failure
(
api_server_manager
=
manager
)
wait_for_completion_or_failure
(
api_server_manager
=
manager
)
except
Exception
as
e
:
except
Exception
as
e
:
result
[
"exception"
]
=
e
result
[
"exception"
]
=
e
finally
:
manager
.
shutdown
()
# Start a thread to run wait_for_completion_or_failure
# Start a thread to run wait_for_completion_or_failure
wait_thread
=
threading
.
Thread
(
target
=
run_with_exception_capture
,
daemon
=
True
)
wait_thread
=
threading
.
Thread
(
target
=
run_with_exception_capture
,
daemon
=
True
)
...
@@ -145,7 +143,7 @@ def test_wait_for_completion_or_failure(api_server_args):
...
@@ -145,7 +143,7 @@ def test_wait_for_completion_or_failure(api_server_args):
assert
not
proc
.
is_alive
(),
f
"Process
{
i
}
should not be alive"
assert
not
proc
.
is_alive
(),
f
"Process
{
i
}
should not be alive"
finally
:
finally
:
manager
.
shutdown
()
manager
.
close
()
time
.
sleep
(
0.2
)
time
.
sleep
(
0.2
)
...
@@ -176,14 +174,11 @@ def test_normal_completion(api_server_args):
...
@@ -176,14 +174,11 @@ def test_normal_completion(api_server_args):
# since all processes have already
# since all processes have already
# terminated, it should return immediately
# terminated, it should return immediately
# with no error
# with no error
try
:
wait_for_completion_or_failure
(
api_server_manager
=
manager
)
wait_for_completion_or_failure
(
api_server_manager
=
manager
)
finally
:
manager
.
shutdown
()
finally
:
finally
:
# Clean up just in case
# Clean up just in case
manager
.
shutdown
()
manager
.
close
()
time
.
sleep
(
0.2
)
time
.
sleep
(
0.2
)
...
@@ -206,7 +201,7 @@ def test_external_process_monitoring(api_server_args):
...
@@ -206,7 +201,7 @@ def test_external_process_monitoring(api_server_args):
def
__init__
(
self
,
proc
):
def
__init__
(
self
,
proc
):
self
.
proc
=
proc
self
.
proc
=
proc
def
shutdown
(
self
):
def
close
(
self
):
if
self
.
proc
.
is_alive
():
if
self
.
proc
.
is_alive
():
self
.
proc
.
terminate
()
self
.
proc
.
terminate
()
self
.
proc
.
join
(
timeout
=
0.5
)
self
.
proc
.
join
(
timeout
=
0.5
)
...
@@ -231,9 +226,6 @@ def test_external_process_monitoring(api_server_args):
...
@@ -231,9 +226,6 @@ def test_external_process_monitoring(api_server_args):
)
)
except
Exception
as
e
:
except
Exception
as
e
:
result
[
"exception"
]
=
e
result
[
"exception"
]
=
e
finally
:
manager
.
shutdown
()
mock_coordinator
.
shutdown
()
# Start a thread to run wait_for_completion_or_failure
# Start a thread to run wait_for_completion_or_failure
wait_thread
=
threading
.
Thread
(
target
=
run_with_exception_capture
,
daemon
=
True
)
wait_thread
=
threading
.
Thread
(
target
=
run_with_exception_capture
,
daemon
=
True
)
...
@@ -267,6 +259,6 @@ def test_external_process_monitoring(api_server_args):
...
@@ -267,6 +259,6 @@ def test_external_process_monitoring(api_server_args):
finally
:
finally
:
# Clean up
# Clean up
manager
.
shutdown
()
manager
.
close
()
mock_coordinator
.
shutdown
()
mock_coordinator
.
close
()
time
.
sleep
(
0.2
)
time
.
sleep
(
0.2
)
vllm/config/vllm.py
View file @
23486039
...
@@ -327,12 +327,6 @@ class VllmConfig:
...
@@ -327,12 +327,6 @@ class VllmConfig:
weight_transfer_config
:
WeightTransferConfig
|
None
=
None
weight_transfer_config
:
WeightTransferConfig
|
None
=
None
"""The configurations for weight transfer during RL training."""
"""The configurations for weight transfer during RL training."""
shutdown_timeout
:
int
=
Field
(
default
=
0
,
ge
=
0
)
"""Shutdown grace period for in-flight requests. Shutdown will be delayed for
up to this amount of time to allow already-running requests to complete. Any
remaining requests are aborted once the timeout is reached.
"""
def
compute_hash
(
self
)
->
str
:
def
compute_hash
(
self
)
->
str
:
"""
"""
WARNING: Whenever a new field is added to this config,
WARNING: Whenever a new field is added to this config,
...
...
vllm/engine/arg_utils.py
View file @
23486039
...
@@ -606,8 +606,6 @@ class EngineArgs:
...
@@ -606,8 +606,6 @@ class EngineArgs:
kv_offloading_backend
:
KVOffloadingBackend
=
CacheConfig
.
kv_offloading_backend
kv_offloading_backend
:
KVOffloadingBackend
=
CacheConfig
.
kv_offloading_backend
tokens_only
:
bool
=
False
tokens_only
:
bool
=
False
shutdown_timeout
:
int
=
0
weight_transfer_config
:
WeightTransferConfig
|
None
=
get_field
(
weight_transfer_config
:
WeightTransferConfig
|
None
=
get_field
(
VllmConfig
,
VllmConfig
,
"weight_transfer_config"
,
"weight_transfer_config"
,
...
@@ -1310,14 +1308,6 @@ class EngineArgs:
...
@@ -1310,14 +1308,6 @@ class EngineArgs:
default
=
False
,
default
=
False
,
action
=
argparse
.
BooleanOptionalAction
,
action
=
argparse
.
BooleanOptionalAction
,
)
)
parser
.
add_argument
(
"--shutdown-timeout"
,
type
=
int
,
default
=
0
,
help
=
"Shutdown timeout in seconds. 0 = abort, >0 = wait."
,
)
return
parser
return
parser
@
classmethod
@
classmethod
...
@@ -1926,7 +1916,6 @@ class EngineArgs:
...
@@ -1926,7 +1916,6 @@ class EngineArgs:
optimization_level
=
self
.
optimization_level
,
optimization_level
=
self
.
optimization_level
,
performance_mode
=
self
.
performance_mode
,
performance_mode
=
self
.
performance_mode
,
weight_transfer_config
=
self
.
weight_transfer_config
,
weight_transfer_config
=
self
.
weight_transfer_config
,
shutdown_timeout
=
self
.
shutdown_timeout
,
)
)
return
config
return
config
...
...
vllm/engine/protocol.py
View file @
23486039
...
@@ -200,11 +200,6 @@ class EngineClient(ABC):
...
@@ -200,11 +200,6 @@ class EngineClient(ABC):
"""Return whether the engine is currently paused."""
"""Return whether the engine is currently paused."""
...
...
@
abstractmethod
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
"""Shutdown the engine with optional timeout."""
...
async
def
scale_elastic_ep
(
async
def
scale_elastic_ep
(
self
,
new_data_parallel_size
:
int
,
drain_timeout
:
int
=
300
self
,
new_data_parallel_size
:
int
,
drain_timeout
:
int
=
300
)
->
None
:
)
->
None
:
...
...
vllm/entrypoints/cli/serve.py
View file @
23486039
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
argparse
import
argparse
import
signal
import
signal
import
time
import
uvloop
import
uvloop
...
@@ -225,12 +224,8 @@ def run_headless(args: argparse.Namespace):
...
@@ -225,12 +224,8 @@ def run_headless(args: argparse.Namespace):
try
:
try
:
engine_manager
.
join_first
()
engine_manager
.
join_first
()
finally
:
finally
:
timeout
=
None
if
shutdown_requested
:
timeout
=
vllm_config
.
shutdown_timeout
logger
.
info
(
"Waiting up to %d seconds for processes to exit"
,
timeout
)
engine_manager
.
shutdown
(
timeout
=
timeout
)
logger
.
info
(
"Shutting down."
)
logger
.
info
(
"Shutting down."
)
engine_manager
.
close
()
def
run_multi_api_server
(
args
:
argparse
.
Namespace
):
def
run_multi_api_server
(
args
:
argparse
.
Namespace
):
...
@@ -241,19 +236,6 @@ def run_multi_api_server(args: argparse.Namespace):
...
@@ -241,19 +236,6 @@ def run_multi_api_server(args: argparse.Namespace):
if
num_api_servers
>
1
:
if
num_api_servers
>
1
:
setup_multiprocess_prometheus
()
setup_multiprocess_prometheus
()
shutdown_requested
=
False
# Catch SIGTERM and SIGINT to allow graceful shutdown.
def
signal_handler
(
signum
,
frame
):
nonlocal
shutdown_requested
logger
.
debug
(
"Received %d signal."
,
signum
)
if
not
shutdown_requested
:
shutdown_requested
=
True
raise
SystemExit
signal
.
signal
(
signal
.
SIGTERM
,
signal_handler
)
signal
.
signal
(
signal
.
SIGINT
,
signal_handler
)
listen_address
,
sock
=
setup_server
(
args
)
listen_address
,
sock
=
setup_server
(
args
)
engine_args
=
vllm
.
AsyncEngineArgs
.
from_cli_args
(
args
)
engine_args
=
vllm
.
AsyncEngineArgs
.
from_cli_args
(
args
)
...
@@ -315,29 +297,11 @@ def run_multi_api_server(args: argparse.Namespace):
...
@@ -315,29 +297,11 @@ def run_multi_api_server(args: argparse.Namespace):
api_server_manager
=
APIServerProcessManager
(
**
api_server_manager_kwargs
)
api_server_manager
=
APIServerProcessManager
(
**
api_server_manager_kwargs
)
# Wait for API servers
# Wait for API servers
try
:
wait_for_completion_or_failure
(
wait_for_completion_or_failure
(
api_server_manager
=
api_server_manager
,
api_server_manager
=
api_server_manager
,
engine_manager
=
local_engine_manager
,
engine_manager
=
local_engine_manager
,
coordinator
=
coordinator
,
coordinator
=
coordinator
,
)
)
finally
:
timeout
=
shutdown_by
=
None
if
shutdown_requested
:
timeout
=
vllm_config
.
shutdown_timeout
shutdown_by
=
time
.
monotonic
()
+
timeout
logger
.
info
(
"Waiting up to %d seconds for processes to exit"
,
timeout
)
def
to_timeout
(
deadline
:
float
|
None
)
->
float
|
None
:
return
(
deadline
if
deadline
is
None
else
max
(
deadline
-
time
.
monotonic
(),
0.0
)
)
api_server_manager
.
shutdown
(
timeout
=
timeout
)
if
local_engine_manager
:
local_engine_manager
.
shutdown
(
timeout
=
to_timeout
(
shutdown_by
))
if
coordinator
:
coordinator
.
shutdown
(
timeout
=
to_timeout
(
shutdown_by
))
def
run_api_server_worker_proc
(
def
run_api_server_worker_proc
(
...
...
vllm/entrypoints/launcher.py
View file @
23486039
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
import
asyncio
import
asyncio
import
signal
import
signal
import
socket
import
socket
from
functools
import
partial
from
typing
import
Any
from
typing
import
Any
import
uvicorn
import
uvicorn
...
@@ -92,10 +91,12 @@ async def serve_http(
...
@@ -92,10 +91,12 @@ async def serve_http(
)
)
)
)
shutdown_event
=
asyncio
.
Event
()
def
signal_handler
()
->
None
:
def
signal_handler
()
->
None
:
shutdown_event
.
set
()
# prevents the uvicorn signal handler to exit early
server_task
.
cancel
()
watchdog_task
.
cancel
()
if
ssl_cert_refresher
:
ssl_cert_refresher
.
stop
()
async
def
dummy_shutdown
()
->
None
:
async
def
dummy_shutdown
()
->
None
:
pass
pass
...
@@ -103,24 +104,6 @@ async def serve_http(
...
@@ -103,24 +104,6 @@ async def serve_http(
loop
.
add_signal_handler
(
signal
.
SIGINT
,
signal_handler
)
loop
.
add_signal_handler
(
signal
.
SIGINT
,
signal_handler
)
loop
.
add_signal_handler
(
signal
.
SIGTERM
,
signal_handler
)
loop
.
add_signal_handler
(
signal
.
SIGTERM
,
signal_handler
)
async
def
handle_shutdown
()
->
None
:
await
shutdown_event
.
wait
()
engine_client
=
app
.
state
.
engine_client
timeout
=
engine_client
.
vllm_config
.
shutdown_timeout
await
loop
.
run_in_executor
(
None
,
partial
(
engine_client
.
shutdown
,
timeout
=
timeout
)
)
server
.
should_exit
=
True
server_task
.
cancel
()
watchdog_task
.
cancel
()
if
ssl_cert_refresher
:
ssl_cert_refresher
.
stop
()
shutdown_task
=
loop
.
create_task
(
handle_shutdown
())
try
:
try
:
await
server_task
await
server_task
return
dummy_shutdown
()
return
dummy_shutdown
()
...
@@ -137,7 +120,6 @@ async def serve_http(
...
@@ -137,7 +120,6 @@ async def serve_http(
logger
.
info
(
"Shutting down FastAPI HTTP server."
)
logger
.
info
(
"Shutting down FastAPI HTTP server."
)
return
server
.
shutdown
()
return
server
.
shutdown
()
finally
:
finally
:
shutdown_task
.
cancel
()
watchdog_task
.
cancel
()
watchdog_task
.
cancel
()
...
...
vllm/v1/engine/__init__.py
View file @
23486039
...
@@ -226,8 +226,6 @@ class EngineCoreRequestType(enum.Enum):
...
@@ -226,8 +226,6 @@ class EngineCoreRequestType(enum.Enum):
UTILITY
=
b
"
\x03
"
UTILITY
=
b
"
\x03
"
# Sentinel used within EngineCoreProc.
# Sentinel used within EngineCoreProc.
EXECUTOR_FAILED
=
b
"
\x04
"
EXECUTOR_FAILED
=
b
"
\x04
"
# Sentinel to wake up input_queue.get() during shutdown.
WAKEUP
=
b
"
\x05
"
class
ReconfigureDistributedRequest
(
msgspec
.
Struct
):
class
ReconfigureDistributedRequest
(
msgspec
.
Struct
):
...
...
vllm/v1/engine/async_llm.py
View file @
23486039
...
@@ -264,15 +264,16 @@ class AsyncLLM(EngineClient):
...
@@ -264,15 +264,16 @@ class AsyncLLM(EngineClient):
def
__del__
(
self
):
def
__del__
(
self
):
self
.
shutdown
()
self
.
shutdown
()
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
shutdown
(
self
)
:
"""Shutdown, cleaning up the background proc and IPC."""
"""Shutdown, cleaning up the background proc and IPC."""
shutdown_prometheus
()
shutdown_prometheus
()
if
renderer
:
=
getattr
(
self
,
"renderer"
,
None
):
if
renderer
:
=
getattr
(
self
,
"renderer"
,
None
):
renderer
.
shutdown
()
renderer
.
shutdown
()
if
engine_core
:
=
getattr
(
self
,
"engine_core"
,
None
):
if
engine_core
:
=
getattr
(
self
,
"engine_core"
,
None
):
engine_core
.
shutdown
(
timeout
=
timeout
)
engine_core
.
shutdown
()
handler
=
getattr
(
self
,
"output_handler"
,
None
)
handler
=
getattr
(
self
,
"output_handler"
,
None
)
if
handler
is
not
None
:
if
handler
is
not
None
:
...
...
vllm/v1/engine/coordinator.py
View file @
23486039
...
@@ -104,10 +104,8 @@ class DPCoordinator:
...
@@ -104,10 +104,8 @@ class DPCoordinator:
"""Returns tuple of ZMQ input address, output address."""
"""Returns tuple of ZMQ input address, output address."""
return
self
.
coord_in_address
,
self
.
coord_out_address
return
self
.
coord_in_address
,
self
.
coord_out_address
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
close
(
self
):
"""Shutdown coordinator process with configurable timeout."""
self
.
_finalizer
()
if
self
.
_finalizer
.
detach
()
is
not
None
:
shutdown
([
self
.
proc
],
timeout
=
timeout
)
class
EngineState
:
class
EngineState
:
...
...
vllm/v1/engine/core.py
View file @
23486039
...
@@ -9,7 +9,6 @@ from collections import defaultdict, deque
...
@@ -9,7 +9,6 @@ from collections import defaultdict, deque
from
collections.abc
import
Callable
,
Generator
from
collections.abc
import
Callable
,
Generator
from
concurrent.futures
import
Future
from
concurrent.futures
import
Future
from
contextlib
import
ExitStack
,
contextmanager
from
contextlib
import
ExitStack
,
contextmanager
from
enum
import
IntEnum
from
functools
import
partial
from
functools
import
partial
from
inspect
import
isclass
,
signature
from
inspect
import
isclass
,
signature
from
logging
import
DEBUG
from
logging
import
DEBUG
...
@@ -62,7 +61,6 @@ from vllm.v1.engine import (
...
@@ -62,7 +61,6 @@ from vllm.v1.engine import (
from
vllm.v1.engine.utils
import
(
from
vllm.v1.engine.utils
import
(
EngineHandshakeMetadata
,
EngineHandshakeMetadata
,
EngineZmqAddresses
,
EngineZmqAddresses
,
SignalCallback
,
get_device_indices
,
get_device_indices
,
)
)
from
vllm.v1.executor
import
Executor
from
vllm.v1.executor
import
Executor
...
@@ -773,12 +771,6 @@ class EngineCore:
...
@@ -773,12 +771,6 @@ class EngineCore:
raise
NotImplementedError
raise
NotImplementedError
class
EngineShutdownState
(
IntEnum
):
RUNNING
=
0
REQUESTED
=
1
SHUTTING_DOWN
=
2
class
EngineCoreProc
(
EngineCore
):
class
EngineCoreProc
(
EngineCore
):
"""ZMQ-wrapper for running EngineCore in background process."""
"""ZMQ-wrapper for running EngineCore in background process."""
...
@@ -806,7 +798,6 @@ class EngineCoreProc(EngineCore):
...
@@ -806,7 +798,6 @@ class EngineCoreProc(EngineCore):
self
.
engine_index
=
engine_index
self
.
engine_index
=
engine_index
identity
=
self
.
engine_index
.
to_bytes
(
length
=
2
,
byteorder
=
"little"
)
identity
=
self
.
engine_index
.
to_bytes
(
length
=
2
,
byteorder
=
"little"
)
self
.
engines_running
=
False
self
.
engines_running
=
False
self
.
shutdown_state
=
EngineShutdownState
.
RUNNING
with
self
.
_perform_handshakes
(
with
self
.
_perform_handshakes
(
handshake_address
,
handshake_address
,
...
@@ -1037,11 +1028,25 @@ class EngineCoreProc(EngineCore):
...
@@ -1037,11 +1028,25 @@ class EngineCoreProc(EngineCore):
def
run_engine_core
(
*
args
,
dp_rank
:
int
=
0
,
local_dp_rank
:
int
=
0
,
**
kwargs
):
def
run_engine_core
(
*
args
,
dp_rank
:
int
=
0
,
local_dp_rank
:
int
=
0
,
**
kwargs
):
"""Launch EngineCore busy loop in background process."""
"""Launch EngineCore busy loop in background process."""
# Signal handler used for graceful termination.
# SystemExit exception is only raised once to allow this and worker
# processes to terminate without error
shutdown_requested
=
False
# Ensure we can serialize transformer config after spawning
# Ensure we can serialize transformer config after spawning
maybe_register_config_serialize_by_value
()
maybe_register_config_serialize_by_value
()
def
signal_handler
(
signum
,
frame
):
nonlocal
shutdown_requested
if
not
shutdown_requested
:
shutdown_requested
=
True
raise
SystemExit
()
# Either SIGTERM or SIGINT will terminate the engine_core
signal
.
signal
(
signal
.
SIGTERM
,
signal_handler
)
signal
.
signal
(
signal
.
SIGINT
,
signal_handler
)
engine_core
:
EngineCoreProc
|
None
=
None
engine_core
:
EngineCoreProc
|
None
=
None
signal_callback
:
SignalCallback
|
None
=
None
try
:
try
:
vllm_config
:
VllmConfig
=
kwargs
[
"vllm_config"
]
vllm_config
:
VllmConfig
=
kwargs
[
"vllm_config"
]
parallel_config
:
ParallelConfig
=
vllm_config
.
parallel_config
parallel_config
:
ParallelConfig
=
vllm_config
.
parallel_config
...
@@ -1089,22 +1094,6 @@ class EngineCoreProc(EngineCore):
...
@@ -1089,22 +1094,6 @@ class EngineCoreProc(EngineCore):
engine_core
=
EngineCoreProc
(
*
args
,
engine_index
=
dp_rank
,
**
kwargs
)
engine_core
=
EngineCoreProc
(
*
args
,
engine_index
=
dp_rank
,
**
kwargs
)
assert
engine_core
is
not
None
assert
engine_core
is
not
None
def
wakeup_engine
():
# Wakes up idle engine via input_queue when shutdown is requested
# Not safe in a signal handler - we may interrupt the main thread
# while it is holding the non-reentrant input_queue.mutex
engine_core
.
input_queue
.
put_nowait
((
EngineCoreRequestType
.
WAKEUP
,
None
))
signal_callback
=
SignalCallback
(
wakeup_engine
)
def
signal_handler
(
signum
,
frame
):
engine_core
.
shutdown_state
=
EngineShutdownState
.
REQUESTED
signal_callback
.
trigger
()
signal
.
signal
(
signal
.
SIGTERM
,
signal_handler
)
signal
.
signal
(
signal
.
SIGINT
,
signal_handler
)
engine_core
.
run_busy_loop
()
engine_core
.
run_busy_loop
()
except
SystemExit
:
except
SystemExit
:
...
@@ -1118,10 +1107,6 @@ class EngineCoreProc(EngineCore):
...
@@ -1118,10 +1107,6 @@ class EngineCoreProc(EngineCore):
engine_core
.
_send_engine_dead
()
engine_core
.
_send_engine_dead
()
raise
e
raise
e
finally
:
finally
:
signal
.
signal
(
signal
.
SIGTERM
,
signal
.
SIG_DFL
)
signal
.
signal
(
signal
.
SIGINT
,
signal
.
SIG_DFL
)
if
signal_callback
is
not
None
:
signal_callback
.
stop
()
if
engine_core
is
not
None
:
if
engine_core
is
not
None
:
engine_core
.
shutdown
()
engine_core
.
shutdown
()
...
@@ -1136,25 +1121,21 @@ class EngineCoreProc(EngineCore):
...
@@ -1136,25 +1121,21 @@ class EngineCoreProc(EngineCore):
or
bool
(
self
.
batch_queue
)
or
bool
(
self
.
batch_queue
)
)
)
def
is_running
(
self
)
->
bool
:
"""Returns true if shutdown has not been requested."""
return
self
.
shutdown_state
==
EngineShutdownState
.
RUNNING
def
run_busy_loop
(
self
):
def
run_busy_loop
(
self
):
"""Core busy loop of the EngineCore."""
"""Core busy loop of the EngineCore."""
while
self
.
_handle_shutdown
():
# Loop until process is sent a SIGINT or SIGTERM
while
True
:
# 1) Poll the input queue until there is work to do.
# 1) Poll the input queue until there is work to do.
self
.
_process_input_queue
()
self
.
_process_input_queue
()
# 2) Step the engine core and return the outputs.
# 2) Step the engine core and return the outputs.
self
.
_process_engine_step
()
self
.
_process_engine_step
()
raise
SystemExit
def
_process_input_queue
(
self
):
def
_process_input_queue
(
self
):
"""Exits when an engine step needs to be performed."""
"""Exits when an engine step needs to be performed."""
waited
=
False
waited
=
False
while
not
self
.
has_work
()
and
self
.
is_running
()
:
while
not
self
.
has_work
():
# Notify callbacks waiting for engine to become idle.
# Notify callbacks waiting for engine to become idle.
self
.
_notify_idle_state_callbacks
()
self
.
_notify_idle_state_callbacks
()
if
self
.
input_queue
.
empty
():
if
self
.
input_queue
.
empty
():
...
@@ -1206,60 +1187,18 @@ class EngineCoreProc(EngineCore):
...
@@ -1206,60 +1187,18 @@ class EngineCoreProc(EngineCore):
callback
=
self
.
_idle_state_callbacks
.
pop
()
callback
=
self
.
_idle_state_callbacks
.
pop
()
callback
(
self
)
callback
(
self
)
def
_handle_shutdown
(
self
)
->
bool
:
# Check if shutdown was requested and handle it
if
self
.
shutdown_state
==
EngineShutdownState
.
RUNNING
:
return
True
if
self
.
shutdown_state
==
EngineShutdownState
.
REQUESTED
:
shutdown_timeout
=
self
.
vllm_config
.
shutdown_timeout
logger
.
info
(
"Shutdown initiated (timeout=%d)"
,
shutdown_timeout
)
if
shutdown_timeout
==
0
:
num_requests
=
self
.
scheduler
.
get_num_unfinished_requests
()
if
num_requests
>
0
:
logger
.
info
(
"Aborting %d requests"
,
num_requests
)
aborted_reqs
=
self
.
scheduler
.
finish_requests
(
None
,
RequestStatus
.
FINISHED_ABORTED
)
self
.
_send_abort_outputs
(
aborted_reqs
)
else
:
num_requests
=
self
.
scheduler
.
get_num_unfinished_requests
()
if
num_requests
>
0
:
logger
.
info
(
"Draining %d in-flight requests (timeout=%ds)"
,
num_requests
,
shutdown_timeout
,
)
self
.
shutdown_state
=
EngineShutdownState
.
SHUTTING_DOWN
# Exit when no work remaining
if
not
self
.
has_work
():
logger
.
info
(
"Shutdown complete"
)
return
False
return
True
def
_handle_client_request
(
def
_handle_client_request
(
self
,
request_type
:
EngineCoreRequestType
,
request
:
Any
self
,
request_type
:
EngineCoreRequestType
,
request
:
Any
)
->
None
:
)
->
None
:
"""Dispatch request from client."""
"""Dispatch request from client."""
if
request_type
==
EngineCoreRequestType
.
WAKEUP
:
if
request_type
==
EngineCoreRequestType
.
ADD
:
return
elif
request_type
==
EngineCoreRequestType
.
ADD
:
req
,
request_wave
=
request
req
,
request_wave
=
request
if
self
.
_reject_add_in_shutdown
(
req
):
return
self
.
add_request
(
req
,
request_wave
)
self
.
add_request
(
req
,
request_wave
)
elif
request_type
==
EngineCoreRequestType
.
ABORT
:
elif
request_type
==
EngineCoreRequestType
.
ABORT
:
self
.
abort_requests
(
request
)
self
.
abort_requests
(
request
)
elif
request_type
==
EngineCoreRequestType
.
UTILITY
:
elif
request_type
==
EngineCoreRequestType
.
UTILITY
:
client_idx
,
call_id
,
method_name
,
args
=
request
client_idx
,
call_id
,
method_name
,
args
=
request
if
self
.
_reject_utility_in_shutdown
(
client_idx
,
call_id
,
method_name
):
return
output
=
UtilityOutput
(
call_id
)
output
=
UtilityOutput
(
call_id
)
# Lazily look-up utility method so that failure will be handled/returned.
# Lazily look-up utility method so that failure will be handled/returned.
get_result
=
lambda
:
(
method
:
=
getattr
(
self
,
method_name
))
and
method
(
get_result
=
lambda
:
(
method
:
=
getattr
(
self
,
method_name
))
and
method
(
...
@@ -1276,27 +1215,6 @@ class EngineCoreProc(EngineCore):
...
@@ -1276,27 +1215,6 @@ class EngineCoreProc(EngineCore):
"Unrecognized input request type encountered: %s"
,
request_type
"Unrecognized input request type encountered: %s"
,
request_type
)
)
def
_reject_add_in_shutdown
(
self
,
request
:
Request
)
->
bool
:
if
self
.
shutdown_state
==
EngineShutdownState
.
RUNNING
:
return
False
logger
.
info
(
"Rejecting request %s (server shutting down)"
,
request
.
request_id
)
self
.
_send_abort_outputs_to_client
([
request
.
request_id
],
request
.
client_index
)
return
True
def
_reject_utility_in_shutdown
(
self
,
client_idx
:
int
,
call_id
:
int
,
method_name
:
str
)
->
bool
:
if
self
.
shutdown_state
==
EngineShutdownState
.
RUNNING
:
return
False
logger
.
warning
(
"Rejecting utility call %s (server shutting down)"
,
method_name
)
output
=
UtilityOutput
(
call_id
,
failure_message
=
"Server shutting down"
)
self
.
output_queue
.
put_nowait
(
(
client_idx
,
EngineCoreOutputs
(
utility_output
=
output
))
)
return
True
@
staticmethod
@
staticmethod
def
_invoke_utility_method
(
def
_invoke_utility_method
(
name
:
str
,
get_result
:
Callable
,
output
:
UtilityOutput
,
enqueue_output
:
Callable
name
:
str
,
get_result
:
Callable
,
output
:
UtilityOutput
,
enqueue_output
:
Callable
...
@@ -1510,7 +1428,22 @@ class EngineCoreProc(EngineCore):
...
@@ -1510,7 +1428,22 @@ class EngineCoreProc(EngineCore):
logger
.
exception
(
logger
.
exception
(
"Unexpected error pre-processing request %s"
,
request
.
request_id
"Unexpected error pre-processing request %s"
,
request
.
request_id
)
)
self
.
_send_error_outputs_to_client
([
request
.
request_id
],
request
.
client_index
)
self
.
output_queue
.
put_nowait
(
(
request
.
client_index
,
EngineCoreOutputs
(
engine_index
=
self
.
engine_index
,
finished_requests
=
{
request
.
request_id
},
outputs
=
[
EngineCoreOutput
(
request_id
=
request
.
request_id
,
new_token_ids
=
[],
finish_reason
=
FinishReason
.
ERROR
,
)
],
),
)
)
def
pause_scheduler
(
def
pause_scheduler
(
self
,
mode
:
PauseMode
=
"abort"
,
clear_cache
:
bool
=
True
self
,
mode
:
PauseMode
=
"abort"
,
clear_cache
:
bool
=
True
...
@@ -1553,26 +1486,6 @@ class EngineCoreProc(EngineCore):
...
@@ -1553,26 +1486,6 @@ class EngineCoreProc(EngineCore):
self
.
_idle_state_callbacks
.
append
(
partial
(
engine_idle_callback
,
future
=
future
))
self
.
_idle_state_callbacks
.
append
(
partial
(
engine_idle_callback
,
future
=
future
))
return
future
return
future
def
_send_finish_outputs_to_client
(
self
,
req_ids
:
list
[
str
],
client_index
:
int
,
finish_reason
:
FinishReason
)
->
None
:
outputs
=
[
EngineCoreOutput
(
req_id
,
[],
finish_reason
=
finish_reason
)
for
req_id
in
req_ids
]
eco
=
EngineCoreOutputs
(
finished_requests
=
req_ids
,
outputs
=
outputs
)
self
.
output_queue
.
put_nowait
((
client_index
,
eco
))
def
_send_abort_outputs_to_client
(
self
,
req_ids
:
list
[
str
],
client_index
:
int
)
->
None
:
self
.
_send_finish_outputs_to_client
(
req_ids
,
client_index
,
FinishReason
.
ABORT
)
def
_send_error_outputs_to_client
(
self
,
req_ids
:
list
[
str
],
client_index
:
int
)
->
None
:
self
.
_send_finish_outputs_to_client
(
req_ids
,
client_index
,
FinishReason
.
ERROR
)
def
_send_abort_outputs
(
self
,
aborted_reqs
:
list
[
tuple
[
str
,
int
]])
->
None
:
def
_send_abort_outputs
(
self
,
aborted_reqs
:
list
[
tuple
[
str
,
int
]])
->
None
:
# TODO(nick) this will be moved inside the scheduler
# TODO(nick) this will be moved inside the scheduler
if
aborted_reqs
:
if
aborted_reqs
:
...
@@ -1581,7 +1494,12 @@ class EngineCoreProc(EngineCore):
...
@@ -1581,7 +1494,12 @@ class EngineCoreProc(EngineCore):
for
req_id
,
client_index
in
aborted_reqs
:
for
req_id
,
client_index
in
aborted_reqs
:
by_client
[
client_index
].
add
(
req_id
)
by_client
[
client_index
].
add
(
req_id
)
for
client_index
,
req_ids
in
by_client
.
items
():
for
client_index
,
req_ids
in
by_client
.
items
():
self
.
_send_abort_outputs_to_client
(
list
(
req_ids
),
client_index
)
outputs
=
[
EngineCoreOutput
(
req_id
,
[],
finish_reason
=
FinishReason
.
ABORT
)
for
req_id
in
req_ids
]
eco
=
EngineCoreOutputs
(
finished_requests
=
req_ids
,
outputs
=
outputs
)
self
.
output_queue
.
put_nowait
((
client_index
,
eco
))
class
DPEngineCoreProc
(
EngineCoreProc
):
class
DPEngineCoreProc
(
EngineCoreProc
):
...
@@ -1699,7 +1617,7 @@ class DPEngineCoreProc(EngineCoreProc):
...
@@ -1699,7 +1617,7 @@ class DPEngineCoreProc(EngineCoreProc):
"""Core busy loop of the EngineCore for data parallel case."""
"""Core busy loop of the EngineCore for data parallel case."""
# Loop until process is sent a SIGINT or SIGTERM
# Loop until process is sent a SIGINT or SIGTERM
while
self
.
_handle_shutdown
()
:
while
True
:
# 1) Poll the input queue until there is work to do.
# 1) Poll the input queue until there is work to do.
self
.
_process_input_queue
()
self
.
_process_input_queue
()
...
@@ -1747,8 +1665,6 @@ class DPEngineCoreProc(EngineCoreProc):
...
@@ -1747,8 +1665,6 @@ class DPEngineCoreProc(EngineCoreProc):
self
.
current_wave
+=
1
self
.
current_wave
+=
1
self
.
step_counter
=
0
self
.
step_counter
=
0
raise
SystemExit
def
_has_global_unfinished_reqs
(
self
,
local_unfinished
:
bool
)
->
bool
:
def
_has_global_unfinished_reqs
(
self
,
local_unfinished
:
bool
)
->
bool
:
# Optimization - only perform finish-sync all-reduce every 32 steps.
# Optimization - only perform finish-sync all-reduce every 32 steps.
self
.
step_counter
+=
1
self
.
step_counter
+=
1
...
...
vllm/v1/engine/core_client.py
View file @
23486039
...
@@ -128,7 +128,7 @@ class EngineCoreClient(ABC):
...
@@ -128,7 +128,7 @@ class EngineCoreClient(ABC):
return
AsyncMPClient
(
*
client_args
)
return
AsyncMPClient
(
*
client_args
)
@
abstractmethod
@
abstractmethod
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
...
def
shutdown
(
self
)
:
...
def
get_output
(
self
)
->
EngineCoreOutputs
:
def
get_output
(
self
)
->
EngineCoreOutputs
:
raise
NotImplementedError
raise
NotImplementedError
...
@@ -298,7 +298,7 @@ class InprocClient(EngineCoreClient):
...
@@ -298,7 +298,7 @@ class InprocClient(EngineCoreClient):
if
len
(
request_ids
)
>
0
:
if
len
(
request_ids
)
>
0
:
self
.
engine_core
.
abort_requests
(
request_ids
)
self
.
engine_core
.
abort_requests
(
request_ids
)
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
shutdown
(
self
)
->
None
:
self
.
engine_core
.
shutdown
()
self
.
engine_core
.
shutdown
()
def
profile
(
self
,
is_start
:
bool
=
True
,
profile_prefix
:
str
|
None
=
None
)
->
None
:
def
profile
(
self
,
is_start
:
bool
=
True
,
profile_prefix
:
str
|
None
=
None
)
->
None
:
...
@@ -390,9 +390,9 @@ class BackgroundResources:
...
@@ -390,9 +390,9 @@ class BackgroundResources:
self
.
engine_dead
=
True
self
.
engine_dead
=
True
if
self
.
engine_manager
is
not
None
:
if
self
.
engine_manager
is
not
None
:
self
.
engine_manager
.
shutdown
()
self
.
engine_manager
.
close
()
if
self
.
coordinator
is
not
None
:
if
self
.
coordinator
is
not
None
:
self
.
coordinator
.
shutdown
()
self
.
coordinator
.
close
()
if
isinstance
(
self
.
output_socket
,
zmq
.
asyncio
.
Socket
):
if
isinstance
(
self
.
output_socket
,
zmq
.
asyncio
.
Socket
):
# Async case.
# Async case.
...
@@ -568,7 +568,10 @@ class MPClient(EngineCoreClient):
...
@@ -568,7 +568,10 @@ class MPClient(EngineCoreClient):
)
)
with
launch_core_engines
(
with
launch_core_engines
(
vllm_config
,
executor_class
,
log_stats
,
addresses
vllm_config
,
executor_class
,
log_stats
,
addresses
,
)
as
(
engine_manager
,
coordinator
,
addresses
):
)
as
(
engine_manager
,
coordinator
,
addresses
):
self
.
resources
.
coordinator
=
coordinator
self
.
resources
.
coordinator
=
coordinator
self
.
resources
.
engine_manager
=
engine_manager
self
.
resources
.
engine_manager
=
engine_manager
...
@@ -634,12 +637,9 @@ class MPClient(EngineCoreClient):
...
@@ -634,12 +637,9 @@ class MPClient(EngineCoreClient):
if
not
success
:
if
not
success
:
self
.
_finalizer
()
self
.
_finalizer
()
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
shutdown
(
self
):
"""Shutdown engine manager under timeout and clean up resources."""
# Terminate background resources.
if
self
.
_finalizer
.
detach
()
is
not
None
:
self
.
_finalizer
()
if
self
.
resources
.
engine_manager
is
not
None
:
self
.
resources
.
engine_manager
.
shutdown
(
timeout
=
timeout
)
self
.
resources
()
def
_format_exception
(
self
,
e
:
Exception
)
->
Exception
:
def
_format_exception
(
self
,
e
:
Exception
)
->
Exception
:
"""If errored, use EngineDeadError so root cause is clear."""
"""If errored, use EngineDeadError so root cause is clear."""
...
@@ -683,7 +683,7 @@ class MPClient(EngineCoreClient):
...
@@ -683,7 +683,7 @@ class MPClient(EngineCoreClient):
sentinels
=
[
proc
.
sentinel
for
proc
in
engine_processes
]
sentinels
=
[
proc
.
sentinel
for
proc
in
engine_processes
]
died
=
multiprocessing
.
connection
.
wait
(
sentinels
)
died
=
multiprocessing
.
connection
.
wait
(
sentinels
)
_self
=
self_ref
()
_self
=
self_ref
()
if
not
_self
or
not
_self
.
_finalizer
.
alive
or
_self
.
resources
.
engine_dead
:
if
not
_self
or
_self
.
resources
.
engine_dead
:
return
return
_self
.
resources
.
engine_dead
=
True
_self
.
resources
.
engine_dead
=
True
proc_name
=
next
(
proc_name
=
next
(
...
...
vllm/v1/engine/utils.py
View file @
23486039
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
contextlib
import
contextlib
import
os
import
os
import
threading
import
weakref
import
weakref
from
collections.abc
import
Callable
,
Iterator
from
collections.abc
import
Callable
,
Iterator
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
...
@@ -152,12 +151,11 @@ class CoreEngineProcManager:
...
@@ -152,12 +151,11 @@ class CoreEngineProcManager:
finally
:
finally
:
# Kill other procs if not all are running.
# Kill other procs if not all are running.
if
self
.
finished_procs
():
if
self
.
finished_procs
():
self
.
shutdown
()
self
.
close
()
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
close
(
self
):
"""Shutdown engine core processes with configurable timeout."""
"""Shutdown all procs."""
if
self
.
_finalizer
.
detach
()
is
not
None
:
self
.
_finalizer
()
shutdown
(
self
.
processes
,
timeout
=
timeout
)
def
join_first
(
self
):
def
join_first
(
self
):
"""Wait for any process to exit."""
"""Wait for any process to exit."""
...
@@ -175,33 +173,6 @@ class CoreEngineProcManager:
...
@@ -175,33 +173,6 @@ class CoreEngineProcManager:
}
}
class
SignalCallback
:
"""Safely trigger a callback from signal handler context via a dedicated thread."""
def
__init__
(
self
,
callback
:
Callable
[[],
None
]):
self
.
_callback
=
callback
self
.
_event
=
threading
.
Event
()
self
.
_stopped
=
False
self
.
_thread
=
threading
.
Thread
(
target
=
self
.
_run
,
daemon
=
True
,
name
=
"signal-callback"
,
)
self
.
_thread
.
start
()
def
_run
(
self
):
self
.
_event
.
wait
()
if
not
self
.
_stopped
:
self
.
_callback
()
def
trigger
(
self
):
self
.
_event
.
set
()
def
stop
(
self
):
self
.
_stopped
=
True
self
.
_event
.
set
()
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
set_device_control_env_var
(
def
set_device_control_env_var
(
vllm_config
:
VllmConfig
,
local_dp_rank
:
int
vllm_config
:
VllmConfig
,
local_dp_rank
:
int
...
@@ -797,7 +768,7 @@ class CoreEngineActorManager:
...
@@ -797,7 +768,7 @@ class CoreEngineActorManager:
def
get_run_refs
(
self
):
def
get_run_refs
(
self
):
return
self
.
run_refs
return
self
.
run_refs
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
close
(
self
)
:
import
ray
import
ray
for
actor
in
self
.
local_engine_actors
+
self
.
remote_engine_actors
:
for
actor
in
self
.
local_engine_actors
+
self
.
remote_engine_actors
:
...
...
vllm/v1/utils.py
View file @
23486039
...
@@ -220,10 +220,8 @@ class APIServerProcessManager:
...
@@ -220,10 +220,8 @@ class APIServerProcessManager:
# The extra processes are managed by their owners
# The extra processes are managed by their owners
self
.
_finalizer
=
weakref
.
finalize
(
self
,
shutdown
,
self
.
processes
)
self
.
_finalizer
=
weakref
.
finalize
(
self
,
shutdown
,
self
.
processes
)
def
shutdown
(
self
,
timeout
:
float
|
None
=
None
)
->
None
:
def
close
(
self
)
->
None
:
"""Shutdown API server processes with configurable timeout"""
self
.
_finalizer
()
if
self
.
_finalizer
.
detach
()
is
not
None
:
shutdown
(
self
.
processes
,
timeout
=
timeout
)
def
wait_for_completion_or_failure
(
def
wait_for_completion_or_failure
(
...
@@ -290,30 +288,25 @@ def wait_for_completion_or_failure(
...
@@ -290,30 +288,25 @@ def wait_for_completion_or_failure(
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
exception
(
"Exception occurred while running API servers: %s"
,
str
(
e
))
logger
.
exception
(
"Exception occurred while running API servers: %s"
,
str
(
e
))
raise
raise
finally
:
logger
.
info
(
"Terminating remaining processes ..."
)
api_server_manager
.
close
()
if
coordinator
:
coordinator
.
close
()
if
engine_manager
:
engine_manager
.
close
()
# Note(rob): shutdown function cannot be a bound method,
# Note(rob): shutdown function cannot be a bound method,
# else the gc cannot collect the object.
# else the gc cannot collect the object.
def
shutdown
(
procs
:
list
[
BaseProcess
],
timeout
:
float
|
None
=
None
)
->
None
:
def
shutdown
(
procs
:
list
[
BaseProcess
]):
"""Shutdown processes with timeout.
Args:
procs: List of processes to shutdown
timeout: Maximum time in seconds to wait for graceful shutdown
"""
if
timeout
is
None
:
timeout
=
0.0
# Allow at least 5 seconds for remaining procs to terminate.
timeout
=
max
(
timeout
,
5.0
)
# Shutdown the process.
# Shutdown the process.
for
proc
in
procs
:
for
proc
in
procs
:
if
proc
.
is_alive
():
if
proc
.
is_alive
():
proc
.
terminate
()
proc
.
terminate
()
# Allow
time
for remaining procs to terminate.
# Allow
5 seconds
for remaining procs to terminate.
deadline
=
time
.
monotonic
()
+
timeout
deadline
=
time
.
monotonic
()
+
5
for
proc
in
procs
:
for
proc
in
procs
:
remaining
=
deadline
-
time
.
monotonic
()
remaining
=
deadline
-
time
.
monotonic
()
if
remaining
<=
0
:
if
remaining
<=
0
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment