Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ab79863e
Unverified
Commit
ab79863e
authored
Apr 03, 2026
by
Jeffrey Wang
Committed by
GitHub
Apr 03, 2026
Browse files
Remove MQ multi-node tests (#38934)
Signed-off-by:
Jeffrey Wang
<
jeffreywang@anyscale.com
>
parent
5f1de2b1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
133 deletions
+0
-133
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/distributed.yaml
+0
-14
tests/distributed/test_mq_tcp_multinode.py
tests/distributed/test_mq_tcp_multinode.py
+0
-119
No files found.
.buildkite/test_areas/distributed.yaml
View file @
ab79863e
...
...
@@ -224,20 +224,6 @@ steps:
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
-
label
:
MessageQueue TCP Multi-Node (2 GPUs)
timeout_in_minutes
:
10
working_dir
:
"
/vllm-workspace/tests"
num_devices
:
1
num_nodes
:
2
no_plugin
:
true
optional
:
true
source_file_dependencies
:
-
vllm/distributed/device_communicators/shm_broadcast.py
-
vllm/distributed/parallel_state.py
-
tests/distributed/test_mq_tcp_multinode.py
commands
:
-
./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 1 $IMAGE_TAG "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py" "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py"
-
label
:
Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes
:
30
working_dir
:
"
/vllm-workspace/tests"
...
...
tests/distributed/test_mq_tcp_multinode.py
deleted
100644 → 0
View file @
5f1de2b1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-node integration test for MessageQueue TCP fallback.
Verifies that when writer and readers span separate nodes (Docker containers
with isolated /dev/shm), `create_from_process_group` correctly detects
cross-node ranks via `in_the_same_node_as()` and falls back to ZMQ TCP
transport — and that data actually arrives.
"""
import
numpy
as
np
import
torch.distributed
as
dist
from
vllm.distributed.device_communicators.shm_broadcast
import
MessageQueue
from
vllm.distributed.parallel_state
import
in_the_same_node_as
def main():
    """Run the MessageQueue TCP multi-node integration checks on this rank.

    Intended to be launched once per rank (e.g. via ``torchrun``) with at
    least two ranks that do not all share a node with rank 0. Each rank:

    1. Joins a ``gloo`` process group.
    2. Checks that ``in_the_same_node_as`` reports at least one rank as
       not being on rank 0's node.
    3. Builds a ``MessageQueue`` from the world group with rank 0 as the
       writer and checks the writer/reader transport bookkeeping.
    4. Exchanges a string, 100 random integer arrays, and one array that
       is larger than ``max_chunk_bytes``, verifying each on the readers.

    Raises:
        AssertionError: if any topology, transport, or payload check fails.

    The process group is always destroyed before returning or propagating
    an error, so a failed check does not leave the group initialized.
    """
    dist.init_process_group(backend="gloo")
    try:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        assert world_size >= 2, (
            f"Need at least 2 ranks across nodes, got world_size={world_size}"
        )

        # Verify that in_the_same_node_as detects cross-node correctly
        status = in_the_same_node_as(dist.group.WORLD, source_rank=0)
        local_count = sum(status)
        print(
            f"[Rank {rank}] in_the_same_node_as(source=0): {status} "
            f"(local={local_count}/{world_size})"
        )
        # With 2 Docker containers (1 proc each), rank 0 and rank 1
        # should be on different nodes.
        assert local_count < world_size, (
            f"Expected cross-node ranks but all {world_size} ranks appear local."
        )

        # Create MessageQueue
        writer_rank = 0
        mq = MessageQueue.create_from_process_group(
            dist.group.WORLD,
            max_chunk_bytes=1024 * 1024,  # 1 MiB
            max_chunks=10,
            writer_rank=writer_rank,
        )

        # Verify the transport path selection
        if rank == writer_rank:
            print(
                f"[Rank {rank}] Writer: n_local_reader={mq.n_local_reader}, "
                f"n_remote_reader={mq.n_remote_reader}"
            )
            assert mq.n_remote_reader > 0, (
                "Writer should have at least 1 remote (TCP) reader in a multi-node setup."
            )
        else:
            # status[rank] tells whether this rank shares a node with rank 0,
            # which is also the writer here.
            if status[rank]:
                assert mq._is_local_reader, (
                    f"Rank {rank} is on the same node as writer but is not a local reader."
                )
                print(f"[Rank {rank}] Reader: local (shared memory)")
            else:
                assert mq._is_remote_reader, (
                    f"Rank {rank} is on a different node but is not a remote (TCP) reader."
                )
                print(f"[Rank {rank}] Reader: remote (TCP)")

        # Test data transfer: simple objects
        dist.barrier()
        if rank == writer_rank:
            mq.enqueue("hello_from_node0")
        else:
            msg = mq.dequeue(timeout=10)
            assert msg == "hello_from_node0"
        dist.barrier()
        print(f"[Rank {rank}] Simple object test passed")

        # Test data transfer: numpy arrays
        # Every rank seeds identically so readers can regenerate the exact
        # arrays the writer sends and compare against them.
        np.random.seed(42)
        arrays = [
            np.random.randint(0, 100, size=np.random.randint(100, 5000))
            for _ in range(100)
        ]
        dist.barrier()
        if rank == writer_rank:
            for arr in arrays:
                mq.enqueue(arr)
        else:
            for i, expected in enumerate(arrays):
                received = mq.dequeue(timeout=10)
                assert np.array_equal(expected, received), (
                    f"Array mismatch at index {i}: "
                    f"expected shape {expected.shape}, got shape {received.shape}"
                )
        dist.barrier()
        print(f"[Rank {rank}] Numpy array test passed")

        # Test data transfer: large payload (> max_chunk_bytes)
        dist.barrier()
        # 200_000 * 8 bytes = 1.6 MB (~1.5 MiB) > 1 MiB chunk
        big_array = np.zeros(200_000, dtype=np.int64)
        if rank == writer_rank:
            mq.enqueue(big_array)
        else:
            received = mq.dequeue(timeout=10)
            assert np.array_equal(big_array, received)
        dist.barrier()
        print(f"[Rank {rank}] Large payload test passed")

        # Done -- cleanup
        dist.barrier()
        print(f"[Rank {rank}] All MessageQueue TCP multi-node tests passed!")
    finally:
        # Tear the group down on failure paths too; the original exception
        # (if any) still propagates after this runs.
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment