Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
15422ed3
Unverified
Commit
15422ed3
authored
Jan 14, 2026
by
Ryan Rock
Committed by
GitHub
Jan 15, 2026
Browse files
[CI/Build][Hardware][AMD] Fix v1/shutdown (#31997)
Signed-off-by:
Ryan Rock
<
ryan.rock@amd.com
>
parent
8471b27d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
64 additions
and
3 deletions
+64
-3
tests/v1/shutdown/conftest.py
tests/v1/shutdown/conftest.py
+26
-0
tests/v1/shutdown/test_forward_error.py
tests/v1/shutdown/test_forward_error.py
+18
-2
tests/v1/shutdown/test_startup_error.py
tests/v1/shutdown/test_startup_error.py
+20
-1
No files found.
tests/v1/shutdown/conftest.py
0 → 100644
View file @
15422ed3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
collections.abc
import
Iterable
from
pathlib
import
Path
import
pytest
from
vllm.platforms
import
current_platform
@pytest.fixture
def rocm_sitecustomize_factory(monkeypatch, tmp_path: Path):
    """Return a function that installs a given sitecustomize payload.

    On platforms where ``current_platform.is_rocm()`` is false, the returned
    callable accepts a single argument and does nothing.

    On ROCm, the returned ``install(lines)`` callable:

    * writes ``lines`` (joined with newlines, plus a trailing newline) to
      ``tmp_path / "sitecustomize.py"``;
    * sets ``PYTHONPATH`` through ``monkeypatch`` so that ``tmp_path`` comes
      first, followed by any pre-existing non-empty ``PYTHONPATH`` value.

    NOTE(review): presumably the point is that Python interpreters started
    afterwards import the payload at startup via the ``site`` module --
    confirm against the tests that use this fixture.
    """
    if not current_platform.is_rocm():
        return lambda _: None

    def install(lines: Iterable[str]) -> None:
        """Write ``lines`` as sitecustomize.py and expose it on PYTHONPATH."""
        sc = tmp_path / "sitecustomize.py"
        # Python decodes source files as UTF-8 unless told otherwise, so pin
        # the encoding rather than relying on the locale default.
        sc.write_text("\n".join(lines) + "\n", encoding="utf-8")

        # Prepend tmp_path; keep an existing PYTHONPATH only if it is set
        # and non-empty, so no empty path entry is produced.
        search_path = [str(tmp_path)]
        inherited = os.getenv("PYTHONPATH")
        if inherited:
            search_path.append(inherited)
        monkeypatch.setenv("PYTHONPATH", os.pathsep.join(search_path))

    return install
tests/v1/shutdown/test_forward_error.py
View file @
15422ed3
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
"""Test that we handle an Error in model forward and shutdown."""
"""Test that we handle an Error in model forward and shutdown."""
import
asyncio
import
asyncio
import
inspect
import
pytest
import
pytest
...
@@ -38,11 +39,22 @@ def evil_forward(self, *args, **kwargs):
...
@@ -38,11 +39,22 @@ def evil_forward(self, *args, **kwargs):
return
self
.
model
(
*
args
,
**
kwargs
)
return
self
.
model
(
*
args
,
**
kwargs
)
@pytest.fixture
def rocm_evil_forward(rocm_sitecustomize_factory):
    """Install a sitecustomize payload that rebinds ``LlamaForCausalLM.forward``.

    The payload consists of the two vLLM imports, the source text of
    ``evil_forward`` (obtained with ``inspect.getsource``), and an assignment
    of that function to ``LlamaForCausalLM.forward``. It is handed to the
    callable provided by ``rocm_sitecustomize_factory``.
    """
    patch_source = inspect.getsource(evil_forward)
    rebind_stmt = f"LlamaForCausalLM.forward = {evil_forward.__name__}"
    rocm_sitecustomize_factory(
        [
            "from vllm.distributed import get_tensor_model_parallel_rank",
            "from vllm.model_executor.models.llama import LlamaForCausalLM",
            patch_source,
            rebind_stmt,
        ]
    )
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
async
def
test_async_llm_model_error
(
async
def
test_async_llm_model_error
(
monkeypatch
,
tensor_parallel_size
:
int
,
model
:
str
monkeypatch
,
rocm_evil_forward
,
tensor_parallel_size
:
int
,
model
:
str
)
->
None
:
)
->
None
:
"""Test that AsyncLLM propagates a forward pass error and frees memory.
"""Test that AsyncLLM propagates a forward pass error and frees memory.
...
@@ -104,7 +116,11 @@ async def test_async_llm_model_error(
...
@@ -104,7 +116,11 @@ async def test_async_llm_model_error(
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
test_llm_model_error
(
def
test_llm_model_error
(
monkeypatch
,
tensor_parallel_size
:
int
,
enable_multiprocessing
:
bool
,
model
:
str
monkeypatch
,
rocm_evil_forward
,
tensor_parallel_size
:
int
,
enable_multiprocessing
:
bool
,
model
:
str
,
)
->
None
:
)
->
None
:
"""Test that LLM propagates a forward pass error and frees memory.
"""Test that LLM propagates a forward pass error and frees memory.
TODO(andy) - LLM without multiprocessing; LLM with multiprocessing
TODO(andy) - LLM without multiprocessing; LLM with multiprocessing
...
...
tests/v1/shutdown/test_startup_error.py
View file @
15422ed3
...
@@ -2,6 +2,8 @@
...
@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test that we handle a startup Error and shutdown."""
"""Test that we handle a startup Error and shutdown."""
import
inspect
import
pytest
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
@@ -28,12 +30,28 @@ def evil_method(self, *args, **kwargs):
...
@@ -28,12 +30,28 @@ def evil_method(self, *args, **kwargs):
return
self
.
model
(
*
args
,
**
kwargs
,
intermediate_tensors
=
None
)
return
self
.
model
(
*
args
,
**
kwargs
,
intermediate_tensors
=
None
)
@pytest.fixture
def rocm_evil_method(rocm_sitecustomize_factory, request):
    """Install a sitecustomize payload that rebinds one ``LlamaForCausalLM`` method.

    The attribute to replace is the value of the ``failing_method`` test
    parameter, looked up through ``request.getfixturevalue``. The payload
    consists of the two vLLM imports, the source text of ``evil_method``
    (obtained with ``inspect.getsource``), and an assignment of that function
    to ``LlamaForCausalLM.<failing_method>``. It is handed to the callable
    provided by ``rocm_sitecustomize_factory``.
    """
    target_attr = request.getfixturevalue("failing_method")
    patch_source = inspect.getsource(evil_method)
    rebind_stmt = f"LlamaForCausalLM.{target_attr} = {evil_method.__name__}"
    rocm_sitecustomize_factory(
        [
            "from vllm.distributed import get_tensor_model_parallel_rank",
            "from vllm.model_executor.models.llama import LlamaForCausalLM",
            patch_source,
            rebind_stmt,
        ]
    )
@
pytest
.
mark
.
timeout
(
SHUTDOWN_TEST_TIMEOUT_SEC
)
@
pytest
.
mark
.
timeout
(
SHUTDOWN_TEST_TIMEOUT_SEC
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
,
1
])
@
pytest
.
mark
.
parametrize
(
"failing_method"
,
[
"forward"
,
"load_weights"
])
@
pytest
.
mark
.
parametrize
(
"failing_method"
,
[
"forward"
,
"load_weights"
])
def
test_async_llm_startup_error
(
def
test_async_llm_startup_error
(
monkeypatch
,
model
:
str
,
tensor_parallel_size
:
int
,
failing_method
:
str
monkeypatch
,
rocm_evil_method
,
model
:
str
,
tensor_parallel_size
:
int
,
failing_method
:
str
,
)
->
None
:
)
->
None
:
"""Test that AsyncLLM propagates an __init__ error & frees memory.
"""Test that AsyncLLM propagates an __init__ error & frees memory.
Test profiling (forward()) and load weights failures.
Test profiling (forward()) and load weights failures.
...
@@ -67,6 +85,7 @@ def test_async_llm_startup_error(
...
@@ -67,6 +85,7 @@ def test_async_llm_startup_error(
@
pytest
.
mark
.
parametrize
(
"failing_method"
,
[
"forward"
,
"load_weights"
])
@
pytest
.
mark
.
parametrize
(
"failing_method"
,
[
"forward"
,
"load_weights"
])
def
test_llm_startup_error
(
def
test_llm_startup_error
(
monkeypatch
,
monkeypatch
,
rocm_evil_method
,
model
:
str
,
model
:
str
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
enable_multiprocessing
:
bool
,
enable_multiprocessing
:
bool
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment