Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7fb2a5be
Unverified
Commit
7fb2a5be
authored
Sep 17, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 17, 2025
Browse files
[V0 Deprecation] Skip PP test (#25128)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
6c036615
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
86 deletions
+28
-86
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+28
-86
No files found.
tests/distributed/test_pipeline_parallel.py
View file @
7fb2a5be
...
...
@@ -26,23 +26,10 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
For PP, we fall back to V0 by default. This means
that the TP baseline runs with V1 while the PP engine
runs with V0. This gives divergent results with dummy
weights. Once we enable V1 by default for PP, we can
remove this.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
class
ParallelSetup
(
NamedTuple
):
tp_size
:
int
pp_size
:
int
eager_mode
:
bool
chunked_prefill
:
bool
class
PPTestOptions
(
NamedTuple
):
...
...
@@ -53,23 +40,10 @@ class PPTestOptions(NamedTuple):
@
dataclass
class
PPTestSettings
:
parallel_setups
:
list
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
list
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
list
[
str
]
runner
:
RunnerOption
test_options
:
PPTestOptions
def
__post_init__
(
self
):
if
len
(
self
.
distributed_backends
)
!=
len
(
self
.
vllm_major_versions
):
raise
ValueError
(
f
"Length mismatch: distributed_backends "
f
"(
{
len
(
self
.
distributed_backends
)
}
) != "
f
"vllm_major_versions (
{
len
(
self
.
vllm_major_versions
)
}
)"
)
@
staticmethod
def
detailed
(
*
,
...
...
@@ -83,27 +57,21 @@ class PPTestSettings:
parallel_setups
=
[
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
False
),
eager_mode
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
True
),
eager_mode
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
eager_mode
=
True
),
ParallelSetup
(
tp_size
=
2
*
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
False
,
chunked_prefill
=
True
),
eager_mode
=
False
),
ParallelSetup
(
tp_size
=
2
*
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
eager_mode
=
True
),
],
distributed_backends
=
[
"mp"
,
"mp"
,
"ray"
,
"ray"
],
vllm_major_versions
=
[
"0"
,
"1"
,
"0"
,
"1"
],
distributed_backends
=
[
"mp"
,
"ray"
],
runner
=
runner
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
),
...
...
@@ -118,17 +86,14 @@ class PPTestSettings:
multi_node_only
:
bool
=
False
,
load_format
:
Optional
[
str
]
=
None
,
):
vllm_major_versions
=
[
"1"
]
if
runner
==
"pooling"
else
[
"0"
]
return
PPTestSettings
(
parallel_setups
=
[
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
eager_mode
=
True
,
chunked_prefill
=
False
),
eager_mode
=
True
),
],
distributed_backends
=
[
"mp"
],
vllm_major_versions
=
vllm_major_versions
,
runner
=
runner
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
),
...
...
@@ -138,10 +103,8 @@ class PPTestSettings:
opts
=
self
.
test_options
for
parallel_setup
in
self
.
parallel_setups
:
for
backend
,
vllm_major_version
in
zip
(
self
.
distributed_backends
,
self
.
vllm_major_versions
):
yield
(
model_id
,
parallel_setup
,
backend
,
vllm_major_version
,
self
.
runner
,
opts
)
for
backend
in
self
.
distributed_backends
:
yield
(
model_id
,
parallel_setup
,
backend
,
self
.
runner
,
opts
)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...
...
@@ -269,7 +232,6 @@ def _compare_tp(
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
test_options
:
PPTestOptions
,
num_gpus_available
:
int
,
...
...
@@ -281,7 +243,6 @@ def _compare_tp(
tp_size
,
pp_size
,
eager_mode
,
chunked_prefill
,
)
=
parallel_setup
multi_node_only
,
load_format
=
test_options
...
...
@@ -334,8 +295,6 @@ def _compare_tp(
"--max-num-seqs"
,
"8"
,
]
if
chunked_prefill
:
common_args
.
append
(
"--enable-chunked-prefill"
)
if
eager_mode
:
common_args
.
append
(
"--enforce-eager"
)
if
runner
!=
"auto"
:
...
...
@@ -353,14 +312,10 @@ def _compare_tp(
if
max_num_seqs
:
common_args
.
extend
([
"--max-num-seqs"
,
f
"
{
max_num_seqs
}
"
])
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
testing_ray_compiled_graph
=
False
if
distributed_backend
==
"ray"
and
(
vllm_major_version
==
"1"
or
specific_case
):
if
distributed_backend
==
"ray"
:
# For V1, test Ray Compiled Graph for all the tests
# For V0, test Ray Compiled Graph for a subset of the tests
pp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
"VLLM_USE_V1"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
...
...
@@ -368,17 +323,15 @@ def _compare_tp(
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
testing_ray_compiled_graph
=
True
elif
distributed_backend
==
"mp"
:
# Both V0/V1 of multiprocessing executor support PP
pp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
"VLLM_USE_V1"
:
"1"
,
}
else
:
pp_env
=
None
tp_env
=
{
"VLLM_USE_V1"
:
vllm_major_version
,
"VLLM_USE_V1"
:
"1"
,
}
pp_args
=
[
...
...
@@ -404,25 +357,17 @@ def _compare_tp(
"mp"
,
]
try
:
compare_two_settings
(
model_id
,
pp_args
,
tp_args
,
pp_env
,
tp_env
,
method
=
method
)
except
Exception
:
if
testing_ray_compiled_graph
and
vllm_major_version
==
"0"
:
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger
.
exception
(
"Ray Compiled Graph tests failed"
)
else
:
raise
compare_two_settings
(
model_id
,
pp_args
,
tp_args
,
pp_env
,
tp_env
,
method
=
method
)
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"runner"
,
"test_options"
),
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
runner
"
,
"test_options"
),
[
params
for
model_id
,
settings
in
TEXT_GENERATION_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
...
...
@@ -433,15 +378,14 @@ def test_tp_language_generation(
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
pytest
.
skip
(
"Skipping the test until V1 passes it."
)
_compare_tp
(
model_id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
runner
,
test_options
,
num_gpus_available
,
...
...
@@ -450,8 +394,8 @@ def test_tp_language_generation(
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"runner"
,
"test_options"
),
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
runner
"
,
"test_options"
),
[
params
for
model_id
,
settings
in
EMBEDDING_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
...
...
@@ -462,15 +406,14 @@ def test_tp_language_embedding(
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
pytest
.
skip
(
"Skipping the test until V1 passes it."
)
_compare_tp
(
model_id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
runner
,
test_options
,
num_gpus_available
,
...
...
@@ -479,8 +422,8 @@ def test_tp_language_embedding(
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
vllm_major_version
"
,
"runner"
,
"test_options"
),
(
"model_id"
,
"parallel_setup"
,
"distributed_backend"
,
"
runner
"
,
"test_options"
),
[
params
for
model_id
,
settings
in
MULTIMODAL_MODELS
.
items
()
for
params
in
settings
.
iter_params
(
model_id
)
if
model_id
in
TEST_MODELS
...
...
@@ -491,15 +434,14 @@ def test_tp_multimodal_generation(
model_id
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
vllm_major_version
:
str
,
runner
:
RunnerOption
,
test_options
:
PPTestOptions
,
num_gpus_available
,
):
pytest
.
skip
(
"Skipping the test until V1 passes it."
)
_compare_tp
(
model_id
,
parallel_setup
,
distributed_backend
,
vllm_major_version
,
runner
,
test_options
,
num_gpus_available
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment