Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1206 additions
and
321 deletions
+1206
-321
tests/detokenizer/test_min_tokens.py
tests/detokenizer/test_min_tokens.py
+0
-1
tests/detokenizer/test_stop_string_while_stop_model_terminates.py
...tokenizer/test_stop_string_while_stop_model_terminates.py
+0
-1
tests/distributed/eplb_utils.py
tests/distributed/eplb_utils.py
+7
-2
tests/distributed/test_ca_buffer_sharing.py
tests/distributed/test_ca_buffer_sharing.py
+3
-3
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+113
-6
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+11
-11
tests/distributed/test_dcp_a2a.py
tests/distributed/test_dcp_a2a.py
+192
-0
tests/distributed/test_elastic_ep.py
tests/distributed/test_elastic_ep.py
+202
-0
tests/distributed/test_eplb_execute.py
tests/distributed/test_eplb_execute.py
+252
-232
tests/distributed/test_eplb_fused_moe_layer.py
tests/distributed/test_eplb_fused_moe_layer.py
+1
-1
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
+1
-1
tests/distributed/test_mq_connect_ip.py
tests/distributed/test_mq_connect_ip.py
+79
-0
tests/distributed/test_multiproc_executor.py
tests/distributed/test_multiproc_executor.py
+4
-2
tests/distributed/test_nccl_symm_mem_allreduce.py
tests/distributed/test_nccl_symm_mem_allreduce.py
+5
-3
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+3
-0
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+30
-28
tests/distributed/test_quick_all_reduce.py
tests/distributed/test_quick_all_reduce.py
+14
-17
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+285
-8
tests/distributed/test_symm_mem_allreduce.py
tests/distributed/test_symm_mem_allreduce.py
+3
-3
tests/distributed/test_torchrun_example.py
tests/distributed/test_torchrun_example.py
+1
-2
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
tests/detokenizer/test_min_tokens.py
View file @
3fb4b5fa
...
...
@@ -39,7 +39,6 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
mm_features
=
None
,
sampling_params
=
params
,
pooling_params
=
None
,
eos_token_id
=
None
,
arrival_time
=
0.0
,
lora_request
=
None
,
cache_salt
=
None
,
...
...
tests/detokenizer/test_stop_string_while_stop_model_terminates.py
View file @
3fb4b5fa
...
...
@@ -35,7 +35,6 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
mm_features
=
None
,
sampling_params
=
params
,
pooling_params
=
None
,
eos_token_id
=
None
,
arrival_time
=
0.0
,
lora_request
=
None
,
cache_salt
=
None
,
...
...
tests/distributed/eplb_utils.py
View file @
3fb4b5fa
...
...
@@ -7,6 +7,7 @@ import random
import
torch
import
torch.multiprocessing
as
mp
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
)
...
...
@@ -41,8 +42,12 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables
(
env
)
local_rank
=
os
.
environ
[
"LOCAL_RANK"
]
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_distributed_environment
()
torch
.
accelerator
.
set_device_index
(
device
)
# Create a minimal vllm config for init_distributed_environment
vllm_config
=
VllmConfig
()
with
set_current_vllm_config
(
vllm_config
):
init_distributed_environment
()
# Ensure each worker process has the same random seed
random
.
seed
(
42
)
...
...
tests/distributed/test_ca_buffer_sharing.py
View file @
3fb4b5fa
...
...
@@ -32,7 +32,7 @@ pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
print
(
f
"Rank
{
rank
}
has pointers
{
pointers
}
"
)
dist
.
barrier
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
if
rank
==
0
:
# the first rank tries to write to all buffers
...
...
@@ -41,7 +41,7 @@ if rank == 0:
lib
.
cudaMemset
(
pointer
,
byte_value
,
buffer_size_in_bytes
)
dist
.
barrier
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
host_data
=
(
ctypes
.
c_char
*
buffer_size_in_bytes
)()
...
...
@@ -59,6 +59,6 @@ for p in pointers:
print
(
f
"Rank
{
rank
}
verified all buffers"
)
dist
.
barrier
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
CustomAllreduce
.
free_shared_buffer
(
pointers
)
tests/distributed/test_comm_ops.py
View file @
3fb4b5fa
...
...
@@ -19,6 +19,8 @@ from vllm.distributed import (
tensor_model_parallel_all_reduce
,
tensor_model_parallel_reduce_scatter
,
)
from
vllm.distributed.parallel_state
import
GroupCoordinator
,
TensorMetadata
from
vllm.v1.worker.gpu_worker
import
AsyncIntermediateTensors
from
..utils
import
(
init_test_distributed_environment
,
...
...
@@ -41,7 +43,7 @@ def all_reduce_test_worker(
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
num_elements
=
8
all_tensors
=
[
...
...
@@ -67,7 +69,7 @@ def reduce_scatter_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
num_elements
=
8
...
...
@@ -98,7 +100,7 @@ def all_gather_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
num_dimensions
=
3
tensor_size
=
list
(
range
(
2
,
num_dimensions
+
2
))
...
...
@@ -132,7 +134,7 @@ def broadcast_tensor_dict_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
test_dict
=
{
# device tensor
...
...
@@ -169,7 +171,7 @@ def send_recv_tensor_dict_test_worker(
):
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
test_dict
=
{
...
...
@@ -200,6 +202,111 @@ def send_recv_tensor_dict_test_worker(
torch
.
testing
.
assert_close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
class
_DummyWork
:
def
__init__
(
self
)
->
None
:
self
.
wait_calls
=
0
def
wait
(
self
)
->
None
:
self
.
wait_calls
+=
1
class
_DummyAllGatherGroup
:
def
__init__
(
self
,
world_size
:
int
,
rank_in_group
:
int
)
->
None
:
self
.
world_size
=
world_size
self
.
rank_in_group
=
rank_in_group
def
all_gather
(
self
,
t
:
torch
.
Tensor
,
dim
:
int
=
0
)
->
torch
.
Tensor
:
# duplicate local slice across ranks.
assert
dim
==
0
return
torch
.
cat
([
t
for
_
in
range
(
self
.
world_size
)],
dim
=
0
)
def
_make_group_for_unit_test
(
rank_in_group
:
int
=
0
,
world_size
:
int
=
2
)
->
GroupCoordinator
:
# avoid running GroupCoordinator.__init__ (it wires up real process groups).
g
=
GroupCoordinator
.
__new__
(
GroupCoordinator
)
g
.
world_size
=
world_size
g
.
rank_in_group
=
rank_in_group
g
.
ranks
=
list
(
range
(
world_size
))
g
.
use_cpu_custom_send_recv
=
False
g
.
device_group
=
None
g
.
cpu_group
=
None
return
g
def
test_irecv_tensor_dict_send_allgather_postprocess_binds_keys
(
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
def
fake_irecv
(
t
:
torch
.
Tensor
,
*
args
:
Any
,
**
kwargs
:
Any
)
->
_DummyWork
:
t
.
fill_
(
1
)
return
_DummyWork
()
monkeypatch
.
setattr
(
torch
.
distributed
,
"is_initialized"
,
lambda
:
True
)
monkeypatch
.
setattr
(
torch
.
distributed
,
"irecv"
,
fake_irecv
)
g
=
_make_group_for_unit_test
(
rank_in_group
=
0
,
world_size
=
2
)
# 2 tensors so we can catch late-binding bugs in postprocess closures.
metadata_list
=
[
(
"a"
,
TensorMetadata
(
"cpu"
,
torch
.
int32
,
torch
.
Size
([
4
]))),
(
"b"
,
TensorMetadata
(
"cpu"
,
torch
.
int32
,
torch
.
Size
([
4
]))),
]
g
.
recv_object
=
lambda
src
=
None
:
metadata_list
# type: ignore[method-assign]
ag
=
_DummyAllGatherGroup
(
world_size
=
2
,
rank_in_group
=
0
)
td
,
handles
,
postprocess
=
g
.
irecv_tensor_dict
(
all_gather_group
=
ag
)
assert
td
is
not
None
assert
len
(
handles
)
==
2
assert
len
(
postprocess
)
==
2
# before postprocess, dict holds the TP slice (shape 2).
assert
td
[
"a"
].
shape
==
torch
.
Size
([
2
])
assert
td
[
"b"
].
shape
==
torch
.
Size
([
2
])
# simulate worker-side "defer wait": wait + postprocess later.
for
handle
in
handles
:
handle
.
wait
()
for
fn
in
postprocess
:
fn
()
# after postprocess, dict values are reconstructed to full shape (shape 4),
# and each key should be updated independently
assert
td
[
"a"
].
shape
==
torch
.
Size
([
4
])
assert
td
[
"b"
].
shape
==
torch
.
Size
([
4
])
torch
.
testing
.
assert_close
(
td
[
"a"
],
torch
.
ones
(
4
,
dtype
=
torch
.
int32
))
torch
.
testing
.
assert_close
(
td
[
"b"
],
torch
.
ones
(
4
,
dtype
=
torch
.
int32
))
def
test_async_intermediate_tensors_lazy_wait
()
->
None
:
work
=
_DummyWork
()
post_calls
=
{
"n"
:
0
}
def
post
()
->
None
:
post_calls
[
"n"
]
+=
1
it
=
AsyncIntermediateTensors
(
{
"x"
:
torch
.
tensor
([
1
])},
comm_handles
=
[
work
],
comm_postprocess
=
[
post
],
)
# accessing non-tensor attributes should not trigger wait.
assert
it
.
kv_connector_output
is
None
assert
work
.
wait_calls
==
0
assert
post_calls
[
"n"
]
==
0
# first access of `.tensors` triggers wait + postprocess.
_
=
it
.
tensors
assert
work
.
wait_calls
==
1
assert
post_calls
[
"n"
]
==
1
# subsequent access should not re-wait.
_
=
it
.
tensors
assert
work
.
wait_calls
==
1
assert
post_calls
[
"n"
]
==
1
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_test_worker
(
monkeypatch
:
pytest
.
MonkeyPatch
,
...
...
@@ -210,7 +317,7 @@ def send_recv_test_worker(
):
monkeypatch
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
size
=
64
...
...
tests/distributed/test_custom_all_reduce.py
View file @
3fb4b5fa
...
...
@@ -33,8 +33,9 @@ def graph_allreduce(
):
with
monkeypatch
.
context
()
as
m
:
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
m
.
delenv
(
"HIP_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
ensure_model_parallel_initialized
(
tp_size
,
pp_size
)
group
=
get_tp_group
().
device_group
...
...
@@ -47,7 +48,7 @@ def graph_allreduce(
data
=
torch
.
zeros
(
1
)
data
=
data
.
to
(
device
=
device
)
torch
.
distributed
.
all_reduce
(
data
,
group
=
group
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
del
data
# we use the first group to communicate once
...
...
@@ -61,13 +62,11 @@ def graph_allreduce(
for
dtype
in
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
]:
with
graph_capture
(
device
=
device
)
as
graph_capture_context
:
# use integers so result matches NCCL exactly
inp1
=
torch
.
randint
(
1
,
16
,
(
sz
,),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
inp2
=
torch
.
randint
(
1
,
16
,
(
sz
,),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
torch
.
cuda
.
synchronize
()
device_idx
=
torch
.
accelerator
.
current_device_index
()
inp1
=
torch
.
randint
(
1
,
16
,
(
sz
,),
dtype
=
dtype
,
device
=
device_idx
)
inp2
=
torch
.
randint
(
1
,
16
,
(
sz
,),
dtype
=
dtype
,
device
=
device_idx
)
torch
.
accelerator
.
synchronize
()
graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
graph
,
stream
=
graph_capture_context
.
stream
):
for
i
in
range
(
num_communication
):
...
...
@@ -92,8 +91,9 @@ def eager_allreduce(
):
with
monkeypatch
.
context
()
as
m
:
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
m
.
delenv
(
"HIP_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
# we use the first group to communicate once
...
...
@@ -127,6 +127,6 @@ def test_custom_allreduce(
test_target
,
):
world_size
=
tp_size
*
pipeline_parallel_size
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
multi_process_parallel
(
monkeypatch
,
tp_size
,
pipeline_parallel_size
,
test_target
)
tests/distributed/test_dcp_a2a.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for DCP A2A communication backend (no GPU required).
Tests cover:
1. DCP A2A config validation (--dcp-comm-backend)
2. KVP group function exists
3. LSE-weighted combination correctness
"""
import
math
import
pytest
import
torch
from
vllm.config.parallel
import
ParallelConfig
class
TestDCPCommBackendConfig
:
"""Test --dcp-comm-backend config validation."""
def
test_default_is_ag_rs
(
self
):
"""Default comm backend is ag_rs."""
config
=
ParallelConfig
()
assert
config
.
dcp_comm_backend
==
"ag_rs"
def
test_a2a_requires_dcp_greater_than_1
(
self
):
"""A2A backend requires decode_context_parallel_size > 1."""
with
pytest
.
raises
(
ValueError
,
match
=
"requires decode_context_parallel_size > 1"
):
ParallelConfig
(
dcp_comm_backend
=
"a2a"
,
decode_context_parallel_size
=
1
,
)
def
test_a2a_with_dcp_valid
(
self
):
"""A2A backend is valid when DCP > 1."""
config
=
ParallelConfig
(
dcp_comm_backend
=
"a2a"
,
tensor_parallel_size
=
8
,
decode_context_parallel_size
=
4
,
)
assert
config
.
dcp_comm_backend
==
"a2a"
def
test_invalid_backend_rejected
(
self
):
"""Invalid backend values are rejected."""
with
pytest
.
raises
(
ValueError
,
match
=
"must be one of"
):
ParallelConfig
(
dcp_comm_backend
=
"invalid"
,
)
def
test_ag_rs_with_dcp_1_valid
(
self
):
"""ag_rs backend is valid with DCP=1 (no DCP)."""
config
=
ParallelConfig
(
dcp_comm_backend
=
"ag_rs"
,
decode_context_parallel_size
=
1
,
)
assert
config
.
dcp_comm_backend
==
"ag_rs"
class
TestLSEWeightedCombine
:
"""Test LSE-weighted combination logic (CPU only, no GPU).
The _lse_weighted_combine function is the reference implementation
that verifies the Triton kernel's correctness. It computes:
result[b,h,d] = sum_n(w_n * output_n[b,h,d])
where w_n = softmax(lse_n) = exp(lse_n) / sum_k(exp(lse_k))
"""
def
test_importable
(
self
):
"""Verify _lse_weighted_combine is importable."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
assert
callable
(
_lse_weighted_combine
)
def
test_single_rank
(
self
):
"""Single rank: output unchanged."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
# N=1, B=2, H=4, D=8
outputs
=
torch
.
randn
(
1
,
2
,
4
,
8
)
lses
=
torch
.
randn
(
1
,
2
,
4
)
result
=
_lse_weighted_combine
(
outputs
,
lses
)
assert
result
.
shape
==
(
2
,
4
,
8
)
torch
.
testing
.
assert_close
(
result
,
outputs
.
squeeze
(
0
),
rtol
=
1e-5
,
atol
=
1e-5
)
def
test_equal_lse
(
self
):
"""Equal LSE values: outputs averaged equally."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
_N
,
B
,
H
,
D
=
2
,
1
,
1
,
4
outputs
=
torch
.
tensor
(
[
[[[
1.0
,
2.0
,
3.0
,
4.0
]]],
# Rank 0
[[[
5.0
,
6.0
,
7.0
,
8.0
]]],
# Rank 1
]
)
lses
=
torch
.
tensor
(
[
[[
0.0
]],
# Rank 0
[[
0.0
]],
# Rank 1
]
)
result
=
_lse_weighted_combine
(
outputs
,
lses
)
expected
=
(
outputs
[
0
]
+
outputs
[
1
])
/
2
assert
result
.
shape
==
(
B
,
H
,
D
)
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-5
,
atol
=
1e-5
)
def
test_dominant_rank
(
self
):
"""Different LSE values: larger LSE gets more weight."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
B
,
H
,
D
=
1
,
1
,
2
outputs
=
torch
.
tensor
(
[
[[[
0.0
,
0.0
]]],
# Rank 0
[[[
1.0
,
1.0
]]],
# Rank 1
]
)
lses
=
torch
.
tensor
(
[
[[
-
100.0
]],
# Rank 0: negligible contribution
[[
0.0
]],
# Rank 1: dominant
]
)
result
=
_lse_weighted_combine
(
outputs
,
lses
)
assert
result
.
shape
==
(
B
,
H
,
D
)
torch
.
testing
.
assert_close
(
result
,
outputs
[
1
].
squeeze
(
0
),
atol
=
1e-5
,
rtol
=
1e-5
)
def
test_mathematically_correct
(
self
):
"""Verify mathematical correctness of LSE combination."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
outputs
=
torch
.
tensor
(
[
[[[
2.0
,
4.0
]]],
[[[
6.0
,
8.0
]]],
]
)
lses
=
torch
.
tensor
(
[
[[
1.0
]],
# exp(1) ≈ 2.718
[[
2.0
]],
# exp(2) ≈ 7.389
]
)
result
=
_lse_weighted_combine
(
outputs
,
lses
)
w0
=
math
.
exp
(
1
)
/
(
math
.
exp
(
1
)
+
math
.
exp
(
2
))
w1
=
math
.
exp
(
2
)
/
(
math
.
exp
(
1
)
+
math
.
exp
(
2
))
expected
=
torch
.
tensor
([[[
w0
*
2.0
+
w1
*
6.0
,
w0
*
4.0
+
w1
*
8.0
]]])
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-4
,
atol
=
1e-4
)
def
test_return_lse
(
self
):
"""return_lse=True returns global LSE (logsumexp of inputs)."""
from
vllm.v1.attention.ops.dcp_alltoall
import
_lse_weighted_combine
B
,
H
,
D
=
1
,
1
,
2
outputs
=
torch
.
tensor
(
[
[[[
1.0
,
2.0
]]],
[[[
3.0
,
4.0
]]],
]
)
lses
=
torch
.
tensor
(
[
[[
1.0
]],
[[
2.0
]],
]
)
result
,
global_lse
=
_lse_weighted_combine
(
outputs
,
lses
,
return_lse
=
True
)
expected_global_lse
=
math
.
log
(
math
.
exp
(
1
)
+
math
.
exp
(
2
))
assert
result
.
shape
==
(
B
,
H
,
D
)
assert
global_lse
.
shape
==
(
B
,
H
)
assert
abs
(
global_lse
.
item
()
-
expected_global_lse
)
<
1e-5
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
,
"-v"
])
tests/distributed/test_elastic_ep.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
subprocess
import
time
import
pytest
import
requests
from
..evals.gsm8k.gsm8k_eval
import
evaluate_gsm8k
from
..utils
import
RemoteOpenAIServer
,
multi_gpu_test
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup_ray_between_tests
():
"""Force-stop any lingering Ray processes between tests."""
subprocess
.
run
([
"ray"
,
"stop"
,
"--force"
],
timeout
=
30
,
capture_output
=
True
)
time
.
sleep
(
5
)
yield
MODEL_NAME
=
"deepseek-ai/DeepSeek-V2-Lite-Chat"
NUM_GSM8K_QUESTIONS
=
256
EXPECTED_ACCURACY
=
0.58
ACCURACY_TOL
=
0.08
MAX_NUM_SEQS
=
32
def
_send_scale_command
(
server
:
RemoteOpenAIServer
,
new_dp_size
:
int
)
->
bool
:
url
=
server
.
url_for
(
"scale_elastic_ep"
)
payload
=
{
"new_data_parallel_size"
:
new_dp_size
}
headers
=
{
"Content-Type"
:
"application/json"
}
try
:
response
=
requests
.
post
(
url
,
json
=
payload
,
headers
=
headers
,
timeout
=
300
)
return
response
.
status_code
==
200
except
requests
.
exceptions
.
RequestException
:
return
False
def
_run_gsm8k_eval
(
server
:
RemoteOpenAIServer
,
stage
:
str
)
->
float
:
assert
server
.
port
is
not
None
result
=
evaluate_gsm8k
(
num_questions
=
NUM_GSM8K_QUESTIONS
,
host
=
f
"http://
{
server
.
host
}
"
,
port
=
server
.
port
,
)
accuracy
=
result
[
"accuracy"
]
print
(
f
"[
{
stage
}
] GSM8K accuracy:
{
accuracy
:.
3
f
}
"
f
"(
{
result
[
'num_questions'
]
}
questions)"
)
assert
accuracy
>=
EXPECTED_ACCURACY
,
(
f
"[
{
stage
}
] GSM8K accuracy
{
accuracy
:.
3
f
}
is below "
f
"expected threshold
{
EXPECTED_ACCURACY
}
"
)
return
accuracy
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_elastic_ep_scaling
():
vllm_serve_args
=
[
"--trust-remote-code"
,
"--tensor-parallel-size"
,
"1"
,
"--gpu-memory-utilization"
,
"0.8"
,
"--max-model-len"
,
"4096"
,
"--max-num-seqs"
,
str
(
MAX_NUM_SEQS
),
"--enable-expert-parallel"
,
"--all2all-backend"
,
"allgather_reducescatter"
,
"--enable-elastic-ep"
,
"--enable-eplb"
,
"--eplb-config.num_redundant_experts"
,
"0"
,
"--data-parallel-backend"
,
"ray"
,
"--data-parallel-size"
,
"2"
,
"--api-server-count"
,
"1"
,
]
leader_address
=
os
.
environ
.
get
(
"LEADER_ADDRESS"
)
if
leader_address
:
vllm_serve_args
.
extend
([
"--data-parallel-address"
,
leader_address
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
vllm_serve_args
,
env_dict
=
{},
max_wait_seconds
=
1200
)
as
server
:
initial_accuracy
=
_run_gsm8k_eval
(
server
,
"Initial (2 GPUs)"
)
assert
_send_scale_command
(
server
,
4
)
time
.
sleep
(
10
)
scale_up_accuracy
=
_run_gsm8k_eval
(
server
,
"After scale up (4 GPUs)"
)
assert
scale_up_accuracy
>=
initial_accuracy
-
ACCURACY_TOL
,
(
f
"Scale up accuracy
{
scale_up_accuracy
:.
3
f
}
dropped more than "
f
"
{
ACCURACY_TOL
}
below initial accuracy
{
initial_accuracy
:.
3
f
}
"
)
assert
_send_scale_command
(
server
,
2
)
time
.
sleep
(
5
)
scale_down_accuracy
=
_run_gsm8k_eval
(
server
,
"After scale down (2 GPUs)"
)
assert
scale_down_accuracy
>=
initial_accuracy
-
ACCURACY_TOL
,
(
f
"Scale down accuracy
{
scale_down_accuracy
:.
3
f
}
dropped more than "
f
"
{
ACCURACY_TOL
}
below initial accuracy
{
initial_accuracy
:.
3
f
}
"
)
print
(
"
\n
Accuracy Summary:"
)
print
(
f
" Initial:
{
initial_accuracy
:.
3
f
}
"
)
print
(
f
" Scale up:
{
scale_up_accuracy
:.
3
f
}
"
f
"(diff:
{
scale_up_accuracy
-
initial_accuracy
:
+
.
3
f
}
)"
)
print
(
f
" Scale down:
{
scale_down_accuracy
:.
3
f
}
"
f
"(diff:
{
scale_down_accuracy
-
initial_accuracy
:
+
.
3
f
}
)"
)
print
(
f
" Tolerance:
{
ACCURACY_TOL
:.
3
f
}
"
)
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_elastic_ep_scaling_uneven
():
"""Test scale up with uneven worker distribution.
This tests the case where num_new_workers % old_dp_size != 0,
specifically 2 -> 3 where remainder = 1 % 2 = 1.
This exercises the remainder handling in sender-receiver pairing.
"""
vllm_serve_args
=
[
"--trust-remote-code"
,
"--tensor-parallel-size"
,
"1"
,
"--gpu-memory-utilization"
,
"0.8"
,
"--max-model-len"
,
"4096"
,
"--max-num-seqs"
,
str
(
MAX_NUM_SEQS
),
"--enable-expert-parallel"
,
"--all2all-backend"
,
"allgather_reducescatter"
,
"--enable-elastic-ep"
,
"--enable-eplb"
,
"--eplb-config.num_redundant_experts"
,
"0"
,
"--data-parallel-backend"
,
"ray"
,
"--data-parallel-size"
,
"2"
,
"--api-server-count"
,
"1"
,
]
leader_address
=
os
.
environ
.
get
(
"LEADER_ADDRESS"
)
if
leader_address
:
vllm_serve_args
.
extend
([
"--data-parallel-address"
,
leader_address
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
vllm_serve_args
,
env_dict
=
{},
max_wait_seconds
=
1200
)
as
server
:
initial_accuracy
=
_run_gsm8k_eval
(
server
,
"Initial (2 GPUs)"
)
# Scale 2 -> 3: This has remainder = 1 % 2 = 1
# Tests uneven sender-receiver pairing
assert
_send_scale_command
(
server
,
3
)
time
.
sleep
(
10
)
scale_up_accuracy
=
_run_gsm8k_eval
(
server
,
"After scale up (3 GPUs)"
)
assert
scale_up_accuracy
>=
initial_accuracy
-
ACCURACY_TOL
,
(
f
"Scale up accuracy
{
scale_up_accuracy
:.
3
f
}
dropped more than "
f
"
{
ACCURACY_TOL
}
below initial accuracy
{
initial_accuracy
:.
3
f
}
"
)
# Scale back down to 2
assert
_send_scale_command
(
server
,
2
)
time
.
sleep
(
5
)
scale_down_accuracy
=
_run_gsm8k_eval
(
server
,
"After scale down (2 GPUs)"
)
assert
scale_down_accuracy
>=
initial_accuracy
-
ACCURACY_TOL
,
(
f
"Scale down accuracy
{
scale_down_accuracy
:.
3
f
}
dropped more than "
f
"
{
ACCURACY_TOL
}
below initial accuracy
{
initial_accuracy
:.
3
f
}
"
)
print
(
"
\n
Accuracy Summary (Uneven Scaling):"
)
print
(
f
" Initial:
{
initial_accuracy
:.
3
f
}
"
)
print
(
f
" Scale up:
{
scale_up_accuracy
:.
3
f
}
"
f
"(diff:
{
scale_up_accuracy
-
initial_accuracy
:
+
.
3
f
}
)"
)
print
(
f
" Scale down:
{
scale_down_accuracy
:.
3
f
}
"
f
"(diff:
{
scale_down_accuracy
-
initial_accuracy
:
+
.
3
f
}
)"
)
print
(
f
" Tolerance:
{
ACCURACY_TOL
:.
3
f
}
"
)
tests/distributed/test_eplb_execute.py
View file @
3fb4b5fa
...
...
@@ -8,6 +8,7 @@ import pytest
import
torch
import
torch.distributed
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.distributed.eplb.rebalance_execute
import
(
move_from_buffer
,
rearrange_expert_weights_inplace
,
...
...
@@ -244,91 +245,95 @@ def _test_async_transfer_layer_without_mtp_worker(
num_logical_experts
:
int
,
)
->
None
:
set_env_vars_and_device
(
env
)
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
tp_group
=
get_tp_group
()
ep_group
=
tp_group
.
device_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
vllm_config
=
VllmConfig
()
vllm_config
.
parallel_config
.
tensor_parallel_size
=
world_size
total_physical_experts
=
world_size
*
num_local_experts
hidden_sizes
=
[
16
,
32
]
with
set_current_vllm_config
(
vllm_config
):
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
,
)
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
,
)
tp_group
=
get_tp_group
()
ep_group
=
tp_group
.
device_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
new_redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
,
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy_config
,
)
total_physical_experts
=
world_size
*
num_local_experts
hidden_sizes
=
[
16
,
32
]
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
,
)
old_indices_cpu
=
old_indices
.
cpu
()
new_indices_cpu
=
new_indices
.
cpu
()
expert_buffer
=
[
torch
.
empty_like
(
w
)
for
w
in
expert_weights
[
0
]]
cuda_stream
=
torch
.
cuda
.
Stream
(
device
=
device
)
for
layer_idx
in
range
(
num_layers
):
is_unchanged
,
is_received_locally
,
recv_metadata
=
asyncio
.
run
(
transfer_layer
(
old_global_expert_indices
=
old_indices_cpu
,
new_global_expert_indices
=
new_indices_cpu
,
expert_weights
=
expert_weights
,
expert_weights_buffer
=
expert_buffer
,
ep_group
=
ep_group
,
layer
=
layer_idx
,
cuda_stream
=
cuda_stream
,
)
redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
,
)
cuda_stream
.
synchronize
()
move_from_buffer
(
expert_weights
=
expert_weights
[
layer_idx
],
expert_weights_buffers
=
expert_buffer
,
is_unchanged
=
is_unchanged
,
is_received_locally
=
is_received_locally
,
recv_metadata
=
recv_metadata
,
new_indices
=
new_indices_cpu
[
layer_idx
].
numpy
(),
ep_rank
=
ep_rank
,
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
,
)
verify_expert_weights_after_shuffle
(
expert_weights
,
new_indices
,
hidden_sizes
,
ep_rank
,
num_local_experts
,
)
verify_redundant_experts_have_same_weights
(
expert_weights
,
new_indices
,
hidden_sizes
,
world_size
,
num_local_experts
,
)
new_redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
,
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy_config
,
)
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
,
)
old_indices_cpu
=
old_indices
.
cpu
()
new_indices_cpu
=
new_indices
.
cpu
()
expert_buffer
=
[
torch
.
empty_like
(
w
)
for
w
in
expert_weights
[
0
]]
cuda_stream
=
torch
.
cuda
.
Stream
(
device
=
device
)
for
layer_idx
in
range
(
num_layers
):
is_unchanged
,
is_received_locally
,
recv_metadata
=
asyncio
.
run
(
transfer_layer
(
old_layer_indices
=
old_indices_cpu
[
layer_idx
],
new_layer_indices
=
new_indices_cpu
[
layer_idx
],
expert_weights
=
expert_weights
[
layer_idx
],
expert_weights_buffer
=
expert_buffer
,
ep_group
=
ep_group
,
cuda_stream
=
cuda_stream
,
)
)
cuda_stream
.
synchronize
()
move_from_buffer
(
expert_weights
=
expert_weights
[
layer_idx
],
expert_weights_buffers
=
expert_buffer
,
is_unchanged
=
is_unchanged
,
is_received_locally
=
is_received_locally
,
recv_metadata
=
recv_metadata
,
new_indices
=
new_indices_cpu
[
layer_idx
].
numpy
(),
ep_rank
=
ep_rank
,
)
verify_expert_weights_after_shuffle
(
expert_weights
,
new_indices
,
hidden_sizes
,
ep_rank
,
num_local_experts
,
)
verify_redundant_experts_have_same_weights
(
expert_weights
,
new_indices
,
hidden_sizes
,
world_size
,
num_local_experts
,
)
def
_test_rearrange_expert_weights_with_redundancy
(
...
...
@@ -337,71 +342,76 @@ def _test_rearrange_expert_weights_with_redundancy(
# Initialize model parallel (using tensor parallel as an entrypoint
# to expert parallel)
set_env_vars_and_device
(
env
)
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
vllm_config
=
VllmConfig
()
vllm_config
.
parallel_config
.
tensor_parallel_size
=
world_size
# Test parameters
total_physical_experts
=
world_size
*
num_local_experts
hidden_sizes
=
[
32
,
64
]
# Two different weight matrices
with
set_current_vllm_config
(
vllm_config
):
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
# Create old expert indices (with redundancy)
redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
,
)
# Test parameters
total_physical_experts
=
world_size
*
num_local_experts
hidden_sizes
=
[
32
,
64
]
# Two different weight matrices
# Create new expert indices (with redundancy)
new_redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy_config
,
)
# Create old expert indices (with redundancy)
redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
# Create expert weights
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
)
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
,
)
# Execute weight rearrangement
rearrange_expert_weights_inplace
(
old_indices
,
new_indices
,
expert_weights
,
ep_group
,
is_profile
=
False
,
)
# Create new expert indices (with redundancy)
new_redundancy_config
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy_config
,
)
# Verify the rearrangement result
verify_expert_weights_after_shuffle
(
expert_weights
,
new_indices
,
hidden_sizes
,
ep_rank
,
num_local_experts
,
)
# Create expert weights
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
)
verify_redundant_experts_have_same_weights
(
expert_weights
,
new_indices
,
hidden_sizes
,
world_size
,
num_local_experts
,
)
# Execute weight rearrangement
rearrange_expert_weights_inplace
(
old_indices
,
new_indices
,
expert_weights
,
ep_group
,
is_profile
=
False
,
)
# Verify the rearrangement result
verify_expert_weights_after_shuffle
(
expert_weights
,
new_indices
,
hidden_sizes
,
ep_rank
,
num_local_experts
,
)
verify_redundant_experts_have_same_weights
(
expert_weights
,
new_indices
,
hidden_sizes
,
world_size
,
num_local_experts
,
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -432,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy(
):
"""Test the functionality of rearranging expert weights with redundancy."""
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
distributed_run
(
_test_rearrange_expert_weights_with_redundancy
,
...
...
@@ -445,58 +455,63 @@ def test_rearrange_expert_weights_with_redundancy(
def
_test_rearrange_expert_weights_no_change
(
env
,
world_size
)
->
None
:
set_env_vars_and_device
(
env
)
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
vllm_config
=
VllmConfig
()
vllm_config
.
parallel_config
.
tensor_parallel_size
=
world_size
num_layers
=
2
num_local_experts
=
2
total_physical_experts
=
world_size
*
num_local_experts
num_logical_experts
=
total_physical_experts
//
2
# Some redundancy
hidden_sizes
=
[
32
,
64
]
with
set_current_vllm_config
(
vllm_config
):
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
# Create redundancy configuration
redundancy_config
=
[
2
]
*
num_logical_experts
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
# Same indices - no change
indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
)
num_layers
=
2
num_local_experts
=
2
total_physical_experts
=
world_size
*
num_local_experts
num_logical_experts
=
total_physical_experts
//
2
# Some redundancy
hidden_sizes
=
[
32
,
64
]
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
indices
)
# Create redundancy configuration
redundancy_config
=
[
2
]
*
num_logical_experts
# Save original weights
original_weights
=
[]
for
layer_weights
in
expert_weights
:
layer_copy
=
[]
for
weight
in
layer_weights
:
layer_copy
.
append
(
weight
.
clone
())
original_weights
.
append
(
layer_copy
)
# Execute rearrangement (should be no change)
rearrange_expert_weights_inplace
(
indices
,
indices
,
# Same indices
expert_weights
,
ep_group
,
is_profile
=
False
,
)
# Same indices - no change
indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
redundancy_config
)
# Verify that the weights have not changed
for
layer
in
range
(
num_layers
):
for
weight_idx
in
range
(
len
(
hidden_sizes
)):
torch
.
testing
.
assert_close
(
expert_weights
[
layer
][
weight_idx
],
original_weights
[
layer
][
weight_idx
],
msg
=
f
"""Layer
{
layer
}
, weight
{
weight_idx
}
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
indices
)
# Save original weights
original_weights
=
[]
for
layer_weights
in
expert_weights
:
layer_copy
=
[]
for
weight
in
layer_weights
:
layer_copy
.
append
(
weight
.
clone
())
original_weights
.
append
(
layer_copy
)
# Execute rearrangement (should be no change)
rearrange_expert_weights_inplace
(
indices
,
indices
,
# Same indices
expert_weights
,
ep_group
,
is_profile
=
False
,
)
# Verify that the weights have not changed
for
layer
in
range
(
num_layers
):
for
weight_idx
in
range
(
len
(
hidden_sizes
)):
torch
.
testing
.
assert_close
(
expert_weights
[
layer
][
weight_idx
],
original_weights
[
layer
][
weight_idx
],
msg
=
f
"""Layer
{
layer
}
, weight
{
weight_idx
}
should remain unchanged"""
,
)
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -513,7 +528,7 @@ def test_async_transfer_layer_without_mtp(
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
distributed_run
(
...
...
@@ -532,77 +547,82 @@ def test_rearrange_expert_weights_no_change(world_size):
unchanged.
"""
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
distributed_run
(
_test_rearrange_expert_weights_no_change
,
world_size
)
def
_test_rearrange_expert_weights_profile_mode
(
env
,
world_size
)
->
None
:
set_env_vars_and_device
(
env
)
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
vllm_config
=
VllmConfig
()
vllm_config
.
parallel_config
.
tensor_parallel_size
=
world_size
num_layers
=
1
num_local_experts
=
2
total_physical_experts
=
world_size
*
num_local_experts
num_logical_experts
=
total_physical_experts
//
2
hidden_sizes
=
[
32
]
with
set_current_vllm_config
(
vllm_config
):
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
world_size
,
pipeline_model_parallel_size
=
1
)
# Create different index distributions
old_redundancy
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
new_redundancy
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
ep_group
=
get_tp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
old_redundancy
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy
)
num_layers
=
1
num_local_experts
=
2
total_physical_experts
=
world_size
*
num_local_experts
num_logical_experts
=
total_physical_experts
//
2
hidden_sizes
=
[
32
]
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
)
# Create different index distributions
old_redundancy
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
new_redundancy
=
create_redundancy_config
(
num_logical_experts
,
total_physical_experts
)
# Save original weights
original_weights
=
[]
for
layer_weights
in
expert_weights
:
layer_copy
=
[]
for
weight
in
layer_weights
:
layer_copy
.
append
(
weight
.
clone
())
original_weights
.
append
(
layer_copy
)
# Execute profile mode rearrangement
rearrange_expert_weights_inplace
(
old_indices
,
new_indices
,
expert_weights
,
ep_group
,
is_profile
=
True
,
# Profile mode
)
old_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
old_redundancy
)
new_indices
=
create_expert_indices_with_redundancy
(
num_layers
,
num_logical_experts
,
total_physical_experts
,
new_redundancy
)
# In profile mode, the weights should remain unchanged
for
layer
in
range
(
num_layers
):
for
weight_idx
in
range
(
len
(
hidden_sizes
)):
torch
.
testing
.
assert_close
(
expert_weights
[
layer
][
weight_idx
],
original_weights
[
layer
][
weight_idx
],
msg
=
"In profile mode, the weights should remain unchanged"
,
)
expert_weights
=
create_expert_weights
(
num_layers
,
num_local_experts
,
hidden_sizes
,
ep_rank
,
device
,
old_indices
)
# Save original weights
original_weights
=
[]
for
layer_weights
in
expert_weights
:
layer_copy
=
[]
for
weight
in
layer_weights
:
layer_copy
.
append
(
weight
.
clone
())
original_weights
.
append
(
layer_copy
)
# Execute profile mode rearrangement
rearrange_expert_weights_inplace
(
old_indices
,
new_indices
,
expert_weights
,
ep_group
,
is_profile
=
True
,
# Profile mode
)
# In profile mode, the weights should remain unchanged
for
layer
in
range
(
num_layers
):
for
weight_idx
in
range
(
len
(
hidden_sizes
)):
torch
.
testing
.
assert_close
(
expert_weights
[
layer
][
weight_idx
],
original_weights
[
layer
][
weight_idx
],
msg
=
"In profile mode, the weights should remain unchanged"
,
)
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
2
,
4
])
def
test_rearrange_expert_weights_profile_mode
(
world_size
):
"""Test profile mode (should not copy actual weights)"""
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
distributed_run
(
_test_rearrange_expert_weights_profile_mode
,
world_size
)
tests/distributed/test_eplb_fused_moe_layer.py
View file @
3fb4b5fa
...
...
@@ -257,7 +257,7 @@ def test_eplb_fml(
intermediate_size
:
int
,
column_major_scales
:
bool
,
):
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
num_local_experts
=
num_experts
//
world_size
...
...
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
View file @
3fb4b5fa
...
...
@@ -253,7 +253,7 @@ def test_eplb_fml(
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
backend
)
if
torch
.
cuda
.
device_count
()
<
world_size
:
if
torch
.
accelerator
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
num_local_experts
=
num_experts
//
world_size
...
...
tests/distributed/test_mq_connect_ip.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test that MessageQueue uses the local node's IP for binding,
not a remote master_addr. This validates the fix for cross-node
data-parallel where each DP group leader must bind to its own IP.
The bug: multiproc_executor used `parallel_config.master_addr` as
`connect_ip` for every DP group's MessageQueue. For DP groups whose
leader is NOT on the master node, binding to master_addr fails with
"Cannot assign requested address".
The fix: use `get_ip()` (local node IP) instead of `master_addr`.
"""
import
pytest
import
zmq
from
vllm.distributed.device_communicators.shm_broadcast
import
MessageQueue
from
vllm.utils.network_utils
import
get_ip
def
test_mq_bind_with_local_ip
():
"""MessageQueue with remote readers should successfully bind
when connect_ip is the local node's IP."""
# n_reader=2, n_local_reader=1 means 1 remote reader,
# which triggers the remote ZMQ socket bind.
mq
=
MessageQueue
(
n_reader
=
2
,
n_local_reader
=
1
,
connect_ip
=
get_ip
(),
)
handle
=
mq
.
export_handle
()
assert
handle
.
remote_subscribe_addr
is
not
None
# The bound address should contain our local IP
local_ip
=
get_ip
()
assert
(
local_ip
in
handle
.
remote_subscribe_addr
or
f
"[
{
local_ip
}
]"
in
handle
.
remote_subscribe_addr
)
del
mq
def
test_mq_bind_with_non_local_ip_fails
():
"""MessageQueue should fail to bind when connect_ip is a
non-local IP address (simulating the bug where master_addr
from a different node was used)."""
# Use a non-local IP that we definitely can't bind to.
# 198.51.100.1 is from TEST-NET-2 (RFC 5737), never locally assigned.
non_local_ip
=
"198.51.100.1"
with
pytest
.
raises
(
zmq
.
error
.
ZMQError
,
match
=
"Cannot assign requested address"
):
MessageQueue
(
n_reader
=
2
,
n_local_reader
=
1
,
connect_ip
=
non_local_ip
,
)
def
test_mq_bind_defaults_to_local_ip
():
"""When connect_ip is None, MessageQueue should auto-detect
the local IP and bind successfully."""
mq
=
MessageQueue
(
n_reader
=
2
,
n_local_reader
=
1
,
connect_ip
=
None
,
# should fallback to get_ip()
)
handle
=
mq
.
export_handle
()
assert
handle
.
remote_subscribe_addr
is
not
None
del
mq
if
__name__
==
"__main__"
:
test_mq_bind_with_local_ip
()
print
(
"PASSED: test_mq_bind_with_local_ip"
)
test_mq_bind_with_non_local_ip_fails
()
print
(
"PASSED: test_mq_bind_with_non_local_ip_fails"
)
test_mq_bind_defaults_to_local_ip
()
print
(
"PASSED: test_mq_bind_defaults_to_local_ip"
)
print
(
"
\n
All tests passed!"
)
tests/distributed/test_multiproc_executor.py
View file @
3fb4b5fa
...
...
@@ -9,11 +9,11 @@ focusing on executor initialization, RPC calls, and distributed execution.
import
multiprocessing
import
os
import
socket
from
tests.utils
import
multi_gpu_test
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.utils
import
get_open_port
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.executor.multiproc_executor
import
MultiprocExecutor
...
...
@@ -333,7 +333,9 @@ def test_multiproc_executor_multi_node():
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
Total world_size = 4, nnodes = 2
"""
port
=
get_open_port
()
with
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
as
s
:
s
.
bind
((
""
,
0
))
port
=
s
.
getsockname
()[
1
]
# symm_mem does not work for simulating multi instance in single node
os
.
environ
[
"VLLM_ALLREDUCE_USE_SYMM_MEM"
]
=
"0"
...
...
tests/distributed/test_nccl_symm_mem_allreduce.py
View file @
3fb4b5fa
...
...
@@ -10,6 +10,7 @@ import torch.distributed as dist
import
torch.multiprocessing
as
mp
import
vllm.envs
as
envs
from
tests.utils
import
ensure_current_vllm_config
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed.device_communicators.cuda_communicator
import
CudaCommunicator
from
vllm.distributed.device_communicators.pynccl
import
register_nccl_symmetric_ops
...
...
@@ -37,7 +38,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
dtype
=
torch
.
bfloat16
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
torch
.
set_default_device
(
device
)
torch
.
set_default_dtype
(
dtype
)
update_environment_variables
(
...
...
@@ -51,7 +52,8 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
)
init_distributed_environment
()
initialize_model_parallel
(
tensor_model_parallel_size
=
world_size
)
with
ensure_current_vllm_config
():
initialize_model_parallel
(
tensor_model_parallel_size
=
world_size
)
cuda_communicator
=
typing
.
cast
(
CudaCommunicator
,
get_tp_group
().
device_communicator
...
...
@@ -82,7 +84,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
2
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
],
reason
=
"Only test on CUDA"
)
def
test_nccl_symm_mem_allreduce
(
monkeypatch
:
pytest
.
MonkeyPatch
,
world_size
):
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
# Enable SymmMemCommunicator
...
...
tests/distributed/test_pipeline_parallel.py
View file @
3fb4b5fa
...
...
@@ -247,6 +247,7 @@ def _compare_tp(
hf_config
=
get_config
(
model_id
,
trust_remote_code
)
require_embed_inputs
=
model_info
.
require_embed_inputs
max_num_seqs
=
model_info
.
max_num_seqs
enable_prefix_caching
=
model_info
.
enable_prefix_caching
dtype
=
"float16"
if
hf_config
.
model_type
in
_FLOAT16_NOT_SUPPORTED_MODELS
:
...
...
@@ -300,6 +301,8 @@ def _compare_tp(
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
if
not
enable_prefix_caching
:
common_args
.
append
(
"--no-enable-prefix-caching"
)
if
require_embed_inputs
:
common_args
.
extend
(
[
...
...
tests/distributed/test_pynccl.py
View file @
3fb4b5fa
...
...
@@ -9,6 +9,7 @@ import pytest
import
torch
import
torch.distributed
from
tests.utils
import
ensure_current_vllm_config
from
vllm.distributed.communication_op
import
tensor_model_parallel_all_reduce
# noqa
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.device_communicators.pynccl_wrapper
import
NCCLLibrary
...
...
@@ -53,7 +54,7 @@ def worker_fn_wrapper(fn):
update_environment_variables
(
env
)
local_rank
=
os
.
environ
[
"LOCAL_RANK"
]
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_distributed_environment
()
fn
()
...
...
@@ -67,12 +68,12 @@ def worker_fn():
)
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
tensor
=
pynccl_comm
.
all_reduce
(
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
pynccl_comm
.
world_size
).
cpu
().
item
()
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl
():
distributed_run
(
worker_fn
,
2
)
...
...
@@ -92,16 +93,16 @@ def multiple_allreduce_worker_fn():
if
torch
.
distributed
.
get_rank
()
in
[
0
,
1
]:
tensor
=
pynccl_comm
.
all_reduce
(
tensor
)
tensor
=
pynccl_comm
.
all_reduce
(
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
4
).
cpu
().
item
()
else
:
tensor
=
pynccl_comm
.
all_reduce
(
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
2
).
cpu
().
item
()
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
def
test_pynccl_multiple_allreduce
():
# this tests pynccl for multiple tp groups, in a standalone way
...
...
@@ -112,23 +113,24 @@ def test_pynccl_multiple_allreduce():
@
worker_fn_wrapper
def
multiple_allreduce_with_vllm_worker_fn
():
device
=
torch
.
device
(
f
"cuda:
{
torch
.
distributed
.
get_rank
()
}
"
)
ensure_model_parallel_initialized
(
2
,
2
)
with
ensure_current_vllm_config
():
ensure_model_parallel_initialized
(
2
,
2
)
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
,
device
=
device
)
with
graph_capture
(
device
=
device
):
# two tp groups can communicate independently
if
torch
.
distributed
.
get_rank
()
in
[
0
,
1
]:
tensor
=
tensor_model_parallel_all_reduce
(
tensor
)
tensor
=
tensor_model_parallel_all_reduce
(
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
4
).
cpu
().
item
()
else
:
tensor
=
tensor_model_parallel_all_reduce
(
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
2
).
cpu
().
item
()
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
def
test_pynccl_multiple_allreduce_with_vllm
():
# this tests pynccl for multiple tp groups, together with vllm
...
...
@@ -145,12 +147,12 @@ def worker_fn_with_cudagraph():
)
# run something in the default stream to initialize torch engine
a
=
torch
.
ones
((
4
,
4
),
device
=
f
"cuda:
{
pynccl_comm
.
rank
}
"
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
with
torch
.
cuda
.
graph
(
graph
):
a_out
=
pynccl_comm
.
all_reduce
(
a
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
graph
.
replay
()
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
a_out
==
pynccl_comm
.
world_size
).
cpu
().
item
()
...
...
@@ -178,12 +180,12 @@ def all_gather_worker_fn():
).
to
(
device
)
pynccl_comm
.
all_gather
(
result
,
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-5
,
atol
=
1e-8
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_all_gather
():
distributed_run
(
all_gather_worker_fn
,
2
)
...
...
@@ -213,12 +215,12 @@ def all_gatherv_worker_fn():
).
to
(
device
)
pynccl_comm
.
all_gatherv
(
result
,
tensor
,
sizes
=
sizes
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-5
,
atol
=
1e-8
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_all_gatherv
():
distributed_run
(
all_gatherv_worker_fn
,
2
)
...
...
@@ -253,12 +255,12 @@ def reduce_scatter_worker_fn():
).
to
(
device
)
pynccl_comm
.
reduce_scatter
(
result
,
tensor
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-5
,
atol
=
1e-8
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_reduce_scatter
():
distributed_run
(
reduce_scatter_worker_fn
,
2
)
...
...
@@ -291,19 +293,19 @@ def reduce_scatterv_worker_fn():
expected
=
sum
(
tensor
[
start
:
end
]
for
tensor
in
all_tensors
).
to
(
device
)
pynccl_comm
.
reduce_scatterv
(
result
,
tensor
,
sizes
=
sizes
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
torch
.
testing
.
assert_close
(
result
,
expected
,
rtol
=
1e-5
,
atol
=
1e-8
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_reduce_scatterv
():
distributed_run
(
reduce_scatterv_worker_fn
,
2
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_with_cudagraph
():
distributed_run
(
worker_fn_with_cudagraph
,
2
)
...
...
@@ -323,12 +325,12 @@ def send_recv_worker_fn():
pynccl_comm
.
send
(
tensor
,
dst
=
(
pynccl_comm
.
rank
+
1
)
%
pynccl_comm
.
world_size
)
else
:
pynccl_comm
.
recv
(
tensor
,
src
=
(
pynccl_comm
.
rank
-
1
)
%
pynccl_comm
.
world_size
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
tensor
==
1
).
cpu
().
item
()
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
def
test_pynccl_send_recv
():
distributed_run
(
send_recv_worker_fn
,
2
)
...
...
@@ -353,7 +355,7 @@ def multiple_send_recv_worker_fn():
pynccl_comm
.
send
(
tensor
,
dst
=
(
pynccl_comm
.
rank
+
1
)
%
pynccl_comm
.
world_size
)
else
:
pynccl_comm
.
recv
(
tensor
,
src
=
(
pynccl_comm
.
rank
-
1
)
%
pynccl_comm
.
world_size
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
if
torch
.
distributed
.
get_rank
()
in
[
0
,
2
]:
assert
torch
.
all
(
tensor
==
1
).
cpu
().
item
()
else
:
...
...
@@ -361,14 +363,14 @@ def multiple_send_recv_worker_fn():
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
def
test_pynccl_multiple_send_recv
():
distributed_run
(
multiple_send_recv_worker_fn
,
4
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
torch
.
accelerator
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
def
test_pynccl_broadcast
():
distributed_run
(
broadcast_worker_fn
,
4
)
...
...
@@ -394,7 +396,7 @@ def broadcast_worker_fn():
pynccl_comm
.
broadcast
(
recv_tensors
[
i
],
src
=
i
)
# the broadcast op might be launched in a different stream
# need to synchronize to make sure the tensor is ready
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
assert
torch
.
all
(
recv_tensors
[
i
]
==
i
).
cpu
().
item
()
...
...
tests/distributed/test_quick_all_reduce.py
View file @
3fb4b5fa
...
...
@@ -39,7 +39,7 @@ def graph_quickreduce(
with
monkeypatch
.
context
()
as
m
:
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
ensure_model_parallel_initialized
(
tp_size
,
pp_size
)
group
=
get_tp_group
().
device_group
...
...
@@ -52,7 +52,7 @@ def graph_quickreduce(
data
=
torch
.
zeros
(
1
)
data
=
data
.
to
(
device
=
device
)
torch
.
distributed
.
all_reduce
(
data
,
group
=
group
)
torch
.
cuda
.
synchronize
()
torch
.
accelerator
.
synchronize
()
del
data
# we use the first group to communicate once
...
...
@@ -65,13 +65,11 @@ def graph_quickreduce(
for
sz
in
test_sizes
:
for
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
]:
with
graph_capture
(
device
=
device
)
as
graph_capture_context
:
inp1
=
torch
.
randint
(
1
,
23
,
(
sz
,),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
inp2
=
torch
.
randint
(
-
23
,
1
,
(
sz
,),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
torch
.
cuda
.
synchronize
()
device_idx
=
torch
.
accelerator
.
current_device_index
()
inp1
=
torch
.
randint
(
1
,
23
,
(
sz
,),
dtype
=
dtype
,
device
=
device_idx
)
inp2
=
torch
.
randint
(
-
23
,
1
,
(
sz
,),
dtype
=
dtype
,
device
=
device_idx
)
torch
.
accelerator
.
synchronize
()
graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
graph
,
stream
=
graph_capture_context
.
stream
):
for
_
in
range
(
num_communication
):
...
...
@@ -95,7 +93,7 @@ def eager_quickreduce(
with
monkeypatch
.
context
()
as
m
:
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
...
...
@@ -130,7 +128,7 @@ def test_custom_quick_allreduce(
quant_mode
,
):
world_size
=
tp_size
*
pipeline_parallel_size
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
monkeypatch
.
setenv
(
"VLLM_ROCM_QUICK_REDUCE_QUANTIZATION"
,
quant_mode
)
...
...
@@ -145,7 +143,7 @@ def qr_variable_input(rank, world_size):
has been observed with the gpt_oss model).
"""
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
qr_max_size
=
None
# MB
_ptr
=
ops
.
init_custom_qr
(
rank
,
world_size
,
qr_max_size
)
ranks
=
[]
...
...
@@ -169,14 +167,13 @@ def qr_variable_input(rank, world_size):
s1
=
1024
while
num
<
50000
:
# 50000 is sufficient to identify issues.
dtype
=
torch
.
float16
device_idx
=
torch
.
accelerator
.
current_device_index
()
if
num
%
2
==
0
:
s2
=
1024
inp1
=
torch
.
zeros
(
(
s1
,
s2
),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
inp1
=
torch
.
zeros
((
s1
,
s2
),
dtype
=
dtype
,
device
=
device_idx
)
else
:
s2
=
2048
inp1
=
torch
.
ones
((
s1
,
s2
),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_
device
()
)
inp1
=
torch
.
ones
((
s1
,
s2
),
dtype
=
dtype
,
device
=
device
_idx
)
result
=
torch
.
empty_like
(
inp1
)
# FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
ops
.
qr_all_reduce
(
_ptr
,
inp1
,
result
,
3
,
cast_bf2half
=
True
)
...
...
@@ -198,7 +195,7 @@ def qr_variable_input(rank, world_size):
@
pytest
.
mark
.
parametrize
(
"pipeline_parallel_size"
,
[
1
])
def
test_custom_quick_allreduce_variable_input
(
tp_size
,
pipeline_parallel_size
):
world_size
=
tp_size
*
pipeline_parallel_size
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
multiprocessing
.
set_start_method
(
"spawn"
,
force
=
True
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
multiprocessing
import
random
import
threading
import
time
from
unittest
import
mock
import
multiprocess
as
mp
import
numpy
as
np
import
pytest
import
torch.distributed
as
dist
from
vllm.distributed.device_communicators.shm_broadcast
import
MessageQueue
...
...
@@ -22,7 +25,14 @@ def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
return
[
np
.
random
.
randint
(
1
,
100
,
i
)
for
i
in
sizes
]
def
distributed_run
(
fn
,
world_size
):
def
distributed_run
(
fn
,
world_size
,
timeout
=
60
):
"""Run a function in multiple processes with proper error handling.
Args:
fn: Function to run in each process
world_size: Number of processes to spawn
timeout: Maximum time in seconds to wait for processes (default: 60)
"""
number_of_processes
=
world_size
processes
=
[]
for
i
in
range
(
number_of_processes
):
...
...
@@ -33,19 +43,45 @@ def distributed_run(fn, world_size):
env
[
"LOCAL_WORLD_SIZE"
]
=
str
(
number_of_processes
)
env
[
"MASTER_ADDR"
]
=
"localhost"
env
[
"MASTER_PORT"
]
=
"12345"
p
=
m
ultiprocessing
.
Process
(
target
=
fn
,
args
=
(
env
,))
p
=
m
p
.
Process
(
target
=
fn
,
args
=
(
env
,))
processes
.
append
(
p
)
p
.
start
()
for
p
in
processes
:
p
.
join
()
# Monitor processes and fail fast if any process fails
start_time
=
time
.
time
()
failed_processes
=
[]
# Wait for all processes, checking for failures
while
time
.
time
()
-
start_time
<
timeout
:
all_done
=
True
for
i
,
p
in
enumerate
(
processes
):
if
p
.
is_alive
():
all_done
=
False
elif
p
.
exitcode
!=
0
:
# Process failed
failed_processes
.
append
((
i
,
p
.
exitcode
))
break
if
failed_processes
or
all_done
:
break
time
.
sleep
(
0.1
)
# Check every 100ms
for
p
in
processes
:
assert
p
.
exitcode
==
0
# Check for timeout if no failures detected yet
for
i
,
p
in
enumerate
(
processes
):
if
p
.
is_alive
():
p
.
kill
()
p
.
join
()
# Report failures
if
failed_processes
:
error_msg
=
"Distributed test failed:
\n
"
for
rank
,
status
in
failed_processes
:
error_msg
+=
f
" Rank
{
rank
}
: Exit code
{
status
}
\n
"
raise
AssertionError
(
error_msg
)
def
worker_fn_wrapper
(
fn
):
# `m
ultiprocessing
.Process` cannot accept environment variables directly
# `m
p
.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def
wrapped_fn
(
env
):
...
...
@@ -115,3 +151,244 @@ def worker_fn():
def
test_shm_broadcast
():
distributed_run
(
worker_fn
,
4
)
@
worker_fn_wrapper
def
worker_fn_test_shutdown_busy
():
rank
=
dist
.
get_rank
()
writer_rank
=
2
message_queue
=
MessageQueue
.
create_from_process_group
(
dist
.
group
.
WORLD
,
40
*
1024
,
2
,
writer_rank
)
if
not
message_queue
.
_is_writer
:
# Put into busy mode
message_queue
.
_spin_condition
.
busy_loop_s
=
9999
shutdown_event
=
threading
.
Event
()
def
shutdown_thread
(
mq
,
shutdown_event
):
shutdown_event
.
wait
()
mq
.
shutdown
()
threading
.
Thread
(
target
=
shutdown_thread
,
args
=
(
message_queue
,
shutdown_event
)
).
start
()
with
pytest
.
raises
(
TimeoutError
):
message_queue
.
dequeue
(
timeout
=
0.01
)
shutdown_event
.
set
()
with
pytest
.
raises
(
RuntimeError
,
match
=
"cancelled"
):
message_queue
.
dequeue
(
timeout
=
1
)
assert
message_queue
.
shutting_down
print
(
f
"torch distributed passed the test! Rank
{
rank
}
"
)
dist
.
barrier
()
def
test_message_queue_shutdown_busy
(
caplog_vllm
):
distributed_run
(
worker_fn_test_shutdown_busy
,
4
)
print
(
caplog_vllm
.
text
)
@
worker_fn_wrapper
def
worker_fn_test_shutdown_idle
():
rank
=
dist
.
get_rank
()
writer_rank
=
2
message_queue
=
MessageQueue
.
create_from_process_group
(
dist
.
group
.
WORLD
,
40
*
1024
,
2
,
writer_rank
)
if
not
message_queue
.
_is_writer
:
# Put into idle mode
message_queue
.
_spin_condition
.
last_read
=
0
shutdown_event
=
threading
.
Event
()
def
shutdown_thread
(
mq
,
shutdown_event
):
shutdown_event
.
wait
()
mq
.
shutdown
()
threading
.
Thread
(
target
=
shutdown_thread
,
args
=
(
message_queue
,
shutdown_event
)
).
start
()
with
pytest
.
raises
(
TimeoutError
):
message_queue
.
dequeue
(
timeout
=
0.01
)
shutdown_event
.
set
()
with
pytest
.
raises
(
RuntimeError
,
match
=
"cancelled"
):
message_queue
.
dequeue
(
timeout
=
1
)
assert
message_queue
.
shutting_down
print
(
f
"torch distributed passed the test! Rank
{
rank
}
"
)
dist
.
barrier
()
def
test_message_queue_shutdown_idle
():
distributed_run
(
worker_fn_test_shutdown_idle
,
4
)
@
worker_fn_wrapper
def
worker_fn_test_idle_to_busy
():
rank
=
dist
.
get_rank
()
writer_rank
=
2
message_queue
=
MessageQueue
.
create_from_process_group
(
dist
.
group
.
WORLD
,
40
*
1024
,
2
,
writer_rank
)
message1
=
"hello world"
message2
=
np
.
random
.
randint
(
1
,
100
,
100
)
with
mock
.
patch
.
object
(
message_queue
.
_spin_condition
,
"wait"
,
wraps
=
message_queue
.
_spin_condition
.
wait
)
as
wrapped_wait
:
if
not
message_queue
.
_is_writer
:
# Put into idle mode
message_queue
.
_spin_condition
.
last_read
=
0
# no messages, so expect a TimeoutError
with
pytest
.
raises
(
TimeoutError
):
message_queue
.
dequeue
(
timeout
=
0.01
)
# wait should only be called once while idle
assert
wrapped_wait
.
call_count
==
1
# sync with the writer and wait for message1
dist
.
barrier
()
recv_message
=
message_queue
.
dequeue
(
timeout
=
5
)
assert
recv_message
==
message1
# second call to wait, with a message read, this puts in a busy spin
assert
wrapped_wait
.
call_count
==
2
# sync with the writer and wait for message2
dist
.
barrier
()
recv_message
=
message_queue
.
dequeue
(
timeout
=
1
)
assert
np
.
array_equal
(
recv_message
,
message2
)
# in busy mode, we expect wait to have been called multiple times
assert
wrapped_wait
.
call_count
>
3
else
:
# writer writes two messages in sync with the reader
dist
.
barrier
()
# sleep delays the send to ensure reader enters the read loop
time
.
sleep
(
0.1
)
message_queue
.
enqueue
(
message1
)
dist
.
barrier
()
time
.
sleep
(
0.1
)
message_queue
.
enqueue
(
message2
)
message_queue
.
shutdown
()
assert
message_queue
.
shutting_down
print
(
f
"torch distributed passed the test! Rank
{
rank
}
"
)
def
test_message_queue_idle_wake
():
distributed_run
(
worker_fn_test_idle_to_busy
,
4
)
@
worker_fn_wrapper
def
worker_fn_test_busy_to_idle
():
rank
=
dist
.
get_rank
()
writer_rank
=
2
message_queue
=
MessageQueue
.
create_from_process_group
(
dist
.
group
.
WORLD
,
40
*
1024
,
2
,
writer_rank
)
message1
=
12345
message2
=
list
(
range
(
3
))
with
mock
.
patch
.
object
(
message_queue
.
_spin_condition
,
"wait"
,
wraps
=
message_queue
.
_spin_condition
.
wait
)
as
wrapped_wait
:
if
not
message_queue
.
_is_writer
:
# Put into busy mode
message_queue
.
_spin_condition
.
busy_loop_s
=
9999
# sync with the writer and wait for message1
dist
.
barrier
()
recv_message
=
message_queue
.
dequeue
(
timeout
=
1
)
assert
recv_message
==
message1
# in busy mode, we expect wait to have been called many times
assert
wrapped_wait
.
call_count
>
1
# simulate busy loop ending
message_queue
.
_spin_condition
.
busy_loop_s
=
0
# ensure we enter idle mode, then record call count
with
pytest
.
raises
(
TimeoutError
):
message_queue
.
dequeue
(
timeout
=
0.01
)
call_count
=
wrapped_wait
.
call_count
# sync with the writer and wait for message2
dist
.
barrier
()
recv_message
=
message_queue
.
dequeue
(
timeout
=
1
)
assert
recv_message
==
message2
# call to wait after idle should only happen once
assert
wrapped_wait
.
call_count
==
call_count
+
1
else
:
# writer writes two messages in sync with the reader
dist
.
barrier
()
# sleep delays the send to ensure reader enters the read loop
time
.
sleep
(
0.1
)
message_queue
.
enqueue
(
message1
)
dist
.
barrier
()
time
.
sleep
(
0.1
)
message_queue
.
enqueue
(
message2
)
message_queue
.
shutdown
()
assert
message_queue
.
shutting_down
print
(
f
"torch distributed passed the test! Rank
{
rank
}
"
)
def
test_message_queue_busy_to_idle
():
distributed_run
(
worker_fn_test_busy_to_idle
,
4
)
def
test_warning_logs
(
caplog_vllm
):
"""
Test that warning logs are emitted at VLLM_RINGBUFFER_WARNING_INTERVAL intervals
when indefinite=False, and are not emitted when indefinite=True.
"""
# Patch the warning log interval to every 1 ms during reads
with
mock
.
patch
(
"vllm.distributed.device_communicators.shm_broadcast.VLLM_RINGBUFFER_WARNING_INTERVAL"
,
new
=
0.001
,
# 1 ms
):
writer
=
MessageQueue
(
n_reader
=
1
,
n_local_reader
=
1
,
max_chunk_bytes
=
1024
*
1024
,
# 1MB chunks
max_chunks
=
10
,
)
reader
=
MessageQueue
.
create_from_handle
(
writer
.
export_handle
(),
rank
=
0
)
writer
.
wait_until_ready
()
reader
.
wait_until_ready
()
# We should have at least one warning log here
# "0 seconds" expected due to rounding of 1ms test interval
with
pytest
.
raises
(
TimeoutError
):
reader
.
dequeue
(
timeout
=
0.01
,
indefinite
=
False
)
assert
any
(
"No available shared memory broadcast block found in 0 seconds"
in
record
.
message
for
record
in
caplog_vllm
.
records
)
caplog_vllm
.
clear
()
# We should have no warnings this time
with
pytest
.
raises
(
TimeoutError
):
reader
.
dequeue
(
timeout
=
0.01
,
indefinite
=
True
)
assert
all
(
"No available shared memory broadcast block found in 0 seconds"
not
in
record
.
message
for
record
in
caplog_vllm
.
records
)
# Clean up when done
writer
.
shutdown
()
reader
.
shutdown
()
tests/distributed/test_symm_mem_allreduce.py
View file @
3fb4b5fa
...
...
@@ -39,7 +39,7 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
m
.
delenv
(
"CUDA_VISIBLE_DEVICES"
,
raising
=
False
)
dtype
=
torch
.
bfloat16
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
accelerator
.
set_device
_index
(
device
)
torch
.
set_default_device
(
device
)
torch
.
set_default_dtype
(
dtype
)
update_environment_variables
(
...
...
@@ -105,7 +105,7 @@ def test_symm_mem_allreduce(
monkeypatch
:
pytest
.
MonkeyPatch
,
tp_size
,
pipeline_parallel_size
):
world_size
=
tp_size
*
pipeline_parallel_size
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
q
=
mp
.
get_context
(
"spawn"
).
Queue
()
mp
.
spawn
(
symm_mem_allreduce_worker
,
args
=
(
world_size
,
q
),
nprocs
=
world_size
)
...
...
@@ -126,7 +126,7 @@ def test_symm_mem_allreduce(
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
],
reason
=
"Only test on CUDA"
)
def
test_dp_with_symm_mem_allreduce
(
monkeypatch
:
pytest
.
MonkeyPatch
):
world_size
=
4
if
world_size
>
torch
.
cuda
.
device_count
():
if
world_size
>
torch
.
accelerator
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
# Verify that the DataParallel runs without error
engine_args
=
EngineArgs
(
...
...
tests/distributed/test_torchrun_example.py
View file @
3fb4b5fa
...
...
@@ -22,7 +22,7 @@ prompts = [
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# set different `gpu_memory_utilization`
and `swap_space`
for different ranks,
# set different `gpu_memory_utilization` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
...
...
@@ -30,7 +30,6 @@ llm = LLM(
pipeline_parallel_size
=
int
(
os
.
getenv
(
"PP_SIZE"
,
1
)),
distributed_executor_backend
=
"external_launcher"
,
gpu_memory_utilization
=
random
.
uniform
(
0.7
,
0.9
),
swap_space
=
random
.
randint
(
1
,
4
),
seed
=
0
,
)
...
...
Prev
1
…
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment