Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7f6d47c1
Unverified
Commit
7f6d47c1
authored
Apr 07, 2025
by
Nick Hill
Committed by
GitHub
Apr 07, 2025
Browse files
[V1][BugFix] Exit properly if engine core fails during startup (#16137)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
3147586e
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
67 additions
and
14 deletions
+67
-14
requirements/test.in
requirements/test.in
+1
-0
requirements/test.txt
requirements/test.txt
+3
-0
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+41
-0
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+14
-9
vllm/v1/utils.py
vllm/v1/utils.py
+8
-5
No files found.
requirements/test.in
View file @
7f6d47c1
...
@@ -5,6 +5,7 @@ pytest-forked
...
@@ -5,6 +5,7 @@ pytest-forked
pytest-asyncio
pytest-asyncio
pytest-rerunfailures
pytest-rerunfailures
pytest-shard
pytest-shard
pytest-timeout
# testing utils
# testing utils
awscli
awscli
...
...
requirements/test.txt
View file @
7f6d47c1
...
@@ -444,6 +444,7 @@ pytest==8.3.3
...
@@ -444,6 +444,7 @@ pytest==8.3.3
# pytest-mock
# pytest-mock
# pytest-rerunfailures
# pytest-rerunfailures
# pytest-shard
# pytest-shard
# pytest-timeout
pytest-asyncio==0.24.0
pytest-asyncio==0.24.0
# via -r requirements/test.in
# via -r requirements/test.in
pytest-forked==1.6.0
pytest-forked==1.6.0
...
@@ -454,6 +455,8 @@ pytest-rerunfailures==14.0
...
@@ -454,6 +455,8 @@ pytest-rerunfailures==14.0
# via -r requirements/test.in
# via -r requirements/test.in
pytest-shard==0.1.2
pytest-shard==0.1.2
# via -r requirements/test.in
# via -r requirements/test.in
pytest-timeout==2.3.1
# via -r requirements/test.in
python-dateutil==2.9.0.post0
python-dateutil==2.9.0.post0
# via
# via
# botocore
# botocore
...
...
tests/v1/engine/test_engine_core_client.py
View file @
7f6d47c1
...
@@ -3,8 +3,10 @@
...
@@ -3,8 +3,10 @@
import
asyncio
import
asyncio
import
time
import
time
import
uuid
import
uuid
from
threading
import
Thread
from
typing
import
Optional
from
typing
import
Optional
import
psutil
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
...
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
@
pytest
.
mark
.
timeout
(
10
)
def
test_startup_failure
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
Exception
)
as
e_info
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
(
usage_context
=
UsageContext
.
UNKNOWN_CONTEXT
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
# Start another thread to wait for engine core process to start
# and kill it - simulate fatal uncaught process exit.
this_proc
=
psutil
.
Process
()
children_before
=
set
(
this_proc
.
children
())
def
kill_first_child
():
while
True
:
time
.
sleep
(
0.5
)
children
=
set
(
this_proc
.
children
())
-
children_before
if
children
:
child
=
children
.
pop
()
print
(
"Killing child core process"
,
child
.
pid
)
child
.
kill
()
break
Thread
(
target
=
kill_first_child
,
daemon
=
True
).
start
()
_core_client
=
EngineCoreClient
.
make_client
(
multiprocess_mode
=
True
,
asyncio_mode
=
True
,
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
,
)
assert
"Engine core initialization failed"
in
str
(
e_info
.
value
)
vllm/v1/engine/core_client.py
View file @
7f6d47c1
...
@@ -411,10 +411,21 @@ class MPClient(EngineCoreClient):
...
@@ -411,10 +411,21 @@ class MPClient(EngineCoreClient):
# Wait for engine core process(es) to send ready messages.
# Wait for engine core process(es) to send ready messages.
identities
=
set
(
eng
.
index
for
eng
in
self
.
resources
.
core_engines
)
identities
=
set
(
eng
.
index
for
eng
in
self
.
resources
.
core_engines
)
poller
=
zmq
.
Poller
()
poller
.
register
(
sync_input_socket
,
zmq
.
POLLIN
)
for
eng
in
self
.
resources
.
core_engines
:
poller
.
register
(
eng
.
proc_handle
,
zmq
.
POLLIN
)
while
identities
:
while
identities
:
while
not
sync_input_socket
.
poll
(
timeout
=
STARTUP_POLL_PERIOD_MS
):
events
=
poller
.
poll
(
STARTUP_POLL_PERIOD_MS
)
logger
.
info
(
"Waiting for %d core engine proc(s) to start: %s"
,
if
not
events
:
len
(
identities
),
identities
)
logger
.
debug
(
"Waiting for %d core engine proc(s) to start: %s"
,
len
(
identities
),
identities
)
continue
if
len
(
events
)
>
1
or
events
[
0
][
0
]
!=
sync_input_socket
:
# One of the core processes exited.
raise
RuntimeError
(
"Engine core initialization failed. "
"See root cause above."
)
eng_id_bytes
,
msg
=
sync_input_socket
.
recv_multipart
()
eng_id_bytes
,
msg
=
sync_input_socket
.
recv_multipart
()
eng_id
=
int
.
from_bytes
(
eng_id_bytes
,
byteorder
=
"little"
)
eng_id
=
int
.
from_bytes
(
eng_id_bytes
,
byteorder
=
"little"
)
if
eng_id
not
in
identities
:
if
eng_id
not
in
identities
:
...
@@ -424,12 +435,6 @@ class MPClient(EngineCoreClient):
...
@@ -424,12 +435,6 @@ class MPClient(EngineCoreClient):
logger
.
info
(
"Core engine process %d ready."
,
eng_id
)
logger
.
info
(
"Core engine process %d ready."
,
eng_id
)
identities
.
discard
(
eng_id
)
identities
.
discard
(
eng_id
)
# Double check that the process are running.
for
engine
in
self
.
resources
.
core_engines
:
proc
=
engine
.
proc_handle
.
proc
if
proc
.
exitcode
is
not
None
:
raise
RuntimeError
(
f
"Engine proc
{
proc
.
name
}
not running"
)
def
_init_core_engines
(
def
_init_core_engines
(
self
,
self
,
vllm_config
:
VllmConfig
,
vllm_config
:
VllmConfig
,
...
...
vllm/v1/utils.py
View file @
7f6d47c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
multiprocessing
import
os
import
os
import
weakref
import
weakref
from
collections
import
defaultdict
from
collections
import
defaultdict
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
multiprocessing
import
Process
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
Generic
,
Optional
,
TypeVar
,
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
Generic
,
Optional
,
TypeVar
,
Union
,
overload
)
Union
,
overload
)
...
@@ -112,20 +112,23 @@ class BackgroundProcHandle:
...
@@ -112,20 +112,23 @@ class BackgroundProcHandle:
process_kwargs
[
"output_path"
]
=
output_path
process_kwargs
[
"output_path"
]
=
output_path
# Run busy loop in background process.
# Run busy loop in background process.
self
.
proc
=
context
.
Process
(
target
=
target_fn
,
self
.
proc
:
Process
=
context
.
Process
(
target
=
target_fn
,
kwargs
=
process_kwargs
,
kwargs
=
process_kwargs
,
name
=
process_name
)
name
=
process_name
)
self
.
_finalizer
=
weakref
.
finalize
(
self
,
shutdown
,
self
.
proc
,
self
.
_finalizer
=
weakref
.
finalize
(
self
,
shutdown
,
self
.
proc
,
input_path
,
output_path
)
input_path
,
output_path
)
self
.
proc
.
start
()
self
.
proc
.
start
()
def
fileno
(
self
):
return
self
.
proc
.
sentinel
def
shutdown
(
self
):
def
shutdown
(
self
):
self
.
_finalizer
()
self
.
_finalizer
()
# Note(rob): shutdown function cannot be a bound method,
# Note(rob): shutdown function cannot be a bound method,
# else the gc cannot collect the object.
# else the gc cannot collect the object.
def
shutdown
(
proc
:
multiprocessing
.
Process
,
input_path
:
str
,
output_path
:
str
):
def
shutdown
(
proc
:
Process
,
input_path
:
str
,
output_path
:
str
):
# Shutdown the process.
# Shutdown the process.
if
proc
.
is_alive
():
if
proc
.
is_alive
():
proc
.
terminate
()
proc
.
terminate
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment