Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4db44264
Unverified
Commit
4db44264
authored
Sep 10, 2025
by
Nick Hill
Committed by
GitHub
Sep 10, 2025
Browse files
[CI] Fail subprocess tests with root-cause error (#23795)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
a0933c3b
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
138 additions
and
33 deletions
+138
-33
requirements/test.in
requirements/test.in
+1
-0
requirements/test.txt
requirements/test.txt
+3
-1
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+26
-0
tests/conftest.py
tests/conftest.py
+10
-0
tests/utils.py
tests/utils.py
+92
-28
vllm/executor/ray_distributed_executor.py
vllm/executor/ray_distributed_executor.py
+6
-4
No files found.
requirements/test.in
View file @
4db44264
...
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
...
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
sentence-transformers # required for embedding tests
sentence-transformers # required for embedding tests
soundfile # required for audio tests
soundfile # required for audio tests
jiwer # required for audio tests
jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.8.0
torch==2.8.0
torchaudio==2.8.0
torchaudio==2.8.0
...
...
requirements/test.txt
View file @
4db44264
...
@@ -137,7 +137,7 @@ contourpy==1.3.0
...
@@ -137,7 +137,7 @@ contourpy==1.3.0
# via matplotlib
# via matplotlib
cramjam==2.9.0
cramjam==2.9.0
# via fastparquet
# via fastparquet
cupy-cuda12x==13.
3
.0
cupy-cuda12x==13.
6
.0
# via ray
# via ray
cycler==0.12.1
cycler==0.12.1
# via matplotlib
# via matplotlib
...
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
...
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
# via pytablewriter
# via pytablewriter
tabulate==0.9.0
tabulate==0.9.0
# via sacrebleu
# via sacrebleu
tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6
tcolorpy==0.1.6
# via pytablewriter
# via pytablewriter
tenacity==9.0.0
tenacity==9.0.0
...
...
tests/async_engine/test_api_server.py
View file @
4db44264
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
copyreg
import
os
import
os
import
subprocess
import
subprocess
import
sys
import
sys
...
@@ -10,6 +11,30 @@ from pathlib import Path
...
@@ -10,6 +11,30 @@ from pathlib import Path
import
pytest
import
pytest
import
requests
import
requests
import
urllib3.exceptions
def _pickle_new_connection_error(obj):
    """Reduce a ``NewConnectionError`` for pickling (tblib compatibility).

    Only the trailing, human-readable part of the exception's first argument
    is kept: everything up to and including the first ``': '`` (the
    "conn: " prefix) is dropped. If there are no args, an empty message is
    used; if no ``': '`` separator is present, the text is kept unchanged.

    Returns:
        A ``(callable, args)`` pair in the form ``copyreg`` expects, where the
        callable is ``_unpickle_new_connection_error`` and ``args`` is a
        one-element tuple holding the extracted message.
    """
    message = ""
    if obj.args:
        message = obj.args[0]

    # Strip the connection part; split only once so that any further ': '
    # inside the actual message is preserved.
    if ': ' in message:
        message = message.split(': ', 1)[1]

    return _unpickle_new_connection_error, (message, )
def _unpickle_new_connection_error(message):
    """Rebuild a ``NewConnectionError`` from its pickled message.

    The exception is constructed with ``None`` in the connection position and
    ``message`` as the error text.
    """
    error_cls = urllib3.exceptions.NewConnectionError
    return error_cls(None, message)
# Make pickle use the custom reducer for NewConnectionError (needed for
# tblib compatibility).
copyreg.pickle(
    urllib3.exceptions.NewConnectionError,
    _pickle_new_connection_error,
)
def
_query_server
(
prompt
:
str
,
max_tokens
:
int
=
5
)
->
dict
:
def
_query_server
(
prompt
:
str
,
max_tokens
:
int
=
5
)
->
dict
:
...
@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
...
@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
uvicorn_process
.
terminate
()
uvicorn_process
.
terminate
()
@
pytest
.
mark
.
timeout
(
300
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"mp"
,
"ray"
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"mp"
,
"ray"
])
def
test_api_server
(
api_server
,
distributed_executor_backend
:
str
):
def
test_api_server
(
api_server
,
distributed_executor_backend
:
str
):
"""
"""
...
...
tests/conftest.py
View file @
4db44264
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
from
tblib
import
pickling_support
# Install support for pickling exceptions so that we can nicely propagate
# failures from tests running in a subprocess.
# This should be run before any custom exception subclasses are defined.
pickling_support
.
install
()
import
http.server
import
http.server
import
json
import
json
import
math
import
math
...
...
tests/utils.py
View file @
4db44264
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
asyncio
import
contextlib
import
copy
import
copy
import
functools
import
functools
import
importlib
import
importlib
...
@@ -13,7 +14,7 @@ import sys
...
@@ -13,7 +14,7 @@ import sys
import
tempfile
import
tempfile
import
time
import
time
import
warnings
import
warnings
from
contextlib
import
contextmanager
,
suppress
from
contextlib
import
ExitStack
,
contextmanager
,
suppress
from
multiprocessing
import
Process
from
multiprocessing
import
Process
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
,
Callable
,
Literal
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Literal
,
Optional
,
Union
...
@@ -800,43 +801,106 @@ _P = ParamSpec("_P")
...
@@ -800,43 +801,106 @@ _P = ParamSpec("_P")
def
fork_new_process_for_each_test
(
def
fork_new_process_for_each_test
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
f
unc
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
"""Decorator to fork a new process for each test function.
"""Decorator to fork a new process for each test function.
See https://github.com/vllm-project/vllm/issues/7053 for more details.
See https://github.com/vllm-project/vllm/issues/7053 for more details.
"""
"""
@
functools
.
wraps
(
f
)
@
functools
.
wraps
(
f
unc
)
def
wrapper
(
*
args
:
_P
.
args
,
**
kwargs
:
_P
.
kwargs
)
->
None
:
def
wrapper
(
*
args
:
_P
.
args
,
**
kwargs
:
_P
.
kwargs
)
->
None
:
# Make the process the leader of its own process group
# Make the process the leader of its own process group
# to avoid sending SIGTERM to the parent process
# to avoid sending SIGTERM to the parent process
os
.
setpgrp
()
os
.
setpgrp
()
from
_pytest.outcomes
import
Skipped
from
_pytest.outcomes
import
Skipped
pid
=
os
.
fork
()
print
(
f
"Fork a new process to run a test
{
pid
}
"
)
# Create a unique temporary file to store exception info from child
if
pid
==
0
:
# process. Use test function name and process ID to avoid collisions.
try
:
with
tempfile
.
NamedTemporaryFile
(
f
(
*
args
,
**
kwargs
)
delete
=
False
,
except
Skipped
as
e
:
mode
=
'w+b'
,
# convert Skipped to exit code 0
prefix
=
f
"vllm_test_
{
func
.
__name__
}
_
{
os
.
getpid
()
}
_"
,
print
(
str
(
e
))
suffix
=
".exc"
)
as
exc_file
,
ExitStack
()
as
delete_after
:
os
.
_exit
(
0
)
exc_file_path
=
exc_file
.
name
except
Exception
:
delete_after
.
callback
(
os
.
remove
,
exc_file_path
)
import
traceback
traceback
.
print_exc
()
pid
=
os
.
fork
()
os
.
_exit
(
1
)
print
(
f
"Fork a new process to run a test
{
pid
}
"
)
if
pid
==
0
:
# Parent process responsible for deleting, don't delete
# in child.
delete_after
.
pop_all
()
try
:
func
(
*
args
,
**
kwargs
)
except
Skipped
as
e
:
# convert Skipped to exit code 0
print
(
str
(
e
))
os
.
_exit
(
0
)
except
Exception
as
e
:
import
traceback
tb_string
=
traceback
.
format_exc
()
# Try to serialize the exception object first
exc_to_serialize
:
dict
[
str
,
Any
]
try
:
# First, try to pickle the actual exception with
# its traceback.
exc_to_serialize
=
{
'pickled_exception'
:
e
}
# Test if it can be pickled
cloudpickle
.
dumps
(
exc_to_serialize
)
except
(
Exception
,
KeyboardInterrupt
):
# Fall back to string-based approach.
exc_to_serialize
=
{
'exception_type'
:
type
(
e
).
__name__
,
'exception_msg'
:
str
(
e
),
'traceback'
:
tb_string
,
}
try
:
with
open
(
exc_file_path
,
'wb'
)
as
f
:
cloudpickle
.
dump
(
exc_to_serialize
,
f
)
except
Exception
:
# Fallback: just print the traceback.
print
(
tb_string
)
os
.
_exit
(
1
)
else
:
os
.
_exit
(
0
)
else
:
else
:
os
.
_exit
(
0
)
pgid
=
os
.
getpgid
(
pid
)
else
:
_pid
,
_exitcode
=
os
.
waitpid
(
pid
,
0
)
pgid
=
os
.
getpgid
(
pid
)
# ignore SIGTERM signal itself
_pid
,
_exitcode
=
os
.
waitpid
(
pid
,
0
)
old_signal_handler
=
signal
.
signal
(
signal
.
SIGTERM
,
# ignore SIGTERM signal itself
signal
.
SIG_IGN
)
old_signal_handler
=
signal
.
signal
(
signal
.
SIGTERM
,
signal
.
SIG_IGN
)
# kill all child processes
# kill all child processes
os
.
killpg
(
pgid
,
signal
.
SIGTERM
)
os
.
killpg
(
pgid
,
signal
.
SIGTERM
)
# restore the signal handler
# restore the signal handler
signal
.
signal
(
signal
.
SIGTERM
,
old_signal_handler
)
signal
.
signal
(
signal
.
SIGTERM
,
old_signal_handler
)
if
_exitcode
!=
0
:
assert
_exitcode
==
0
,
(
f
"function
{
f
}
failed when called with"
# Try to read the exception from the child process
f
" args
{
args
}
and kwargs
{
kwargs
}
"
)
exc_info
=
{}
if
os
.
path
.
exists
(
exc_file_path
):
with
contextlib
.
suppress
(
Exception
),
\
open
(
exc_file_path
,
'rb'
)
as
f
:
exc_info
=
cloudpickle
.
load
(
f
)
if
(
original_exception
:
=
exc_info
.
get
(
'pickled_exception'
))
is
not
None
:
# Re-raise the actual exception object if it was
# successfully pickled.
assert
isinstance
(
original_exception
,
Exception
)
raise
original_exception
if
(
original_tb
:
=
exc_info
.
get
(
"traceback"
))
is
not
None
:
# Use string-based traceback for fallback case
raise
AssertionError
(
f
"Test
{
func
.
__name__
}
failed when called with"
f
" args
{
args
}
and kwargs
{
kwargs
}
"
f
" (exit code:
{
_exitcode
}
):
\n
{
original_tb
}
"
)
from
None
# Fallback to the original generic error
raise
AssertionError
(
f
"function
{
func
.
__name__
}
failed when called with"
f
" args
{
args
}
and kwargs
{
kwargs
}
"
f
" (exit code:
{
_exitcode
}
)"
)
from
None
return
wrapper
return
wrapper
...
...
vllm/executor/ray_distributed_executor.py
View file @
4db44264
...
@@ -117,10 +117,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
...
@@ -117,10 +117,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
self
.
driver_worker
.
execute_method
)
self
.
driver_worker
.
execute_method
)
def
shutdown
(
self
)
->
None
:
def
shutdown
(
self
)
->
None
:
logger
.
info
(
if
logger
:
"Shutting down Ray distributed executor. If you see error log "
# Somehow logger can be None here.
"from logging.cc regarding SIGTERM received, please ignore because "
logger
.
info
(
"this is the expected termination process in Ray."
)
"Shutting down Ray distributed executor. If you see error log "
"from logging.cc regarding SIGTERM received, please ignore "
"because this is the expected termination process in Ray."
)
if
hasattr
(
self
,
"forward_dag"
)
and
self
.
forward_dag
is
not
None
:
if
hasattr
(
self
,
"forward_dag"
)
and
self
.
forward_dag
is
not
None
:
self
.
forward_dag
.
teardown
()
self
.
forward_dag
.
teardown
()
import
ray
import
ray
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment