Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
49bfc538
Unverified
Commit
49bfc538
authored
Sep 15, 2025
by
Sage Moore
Committed by
GitHub
Sep 15, 2025
Browse files
Update num_tokens_across_dp to use nccl instead of gloo (#24105)
Signed-off-by:
Sage Moore
<
sage@neuralmagic.com
>
parent
a0b26701
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
4 deletions
+25
-4
vllm/envs.py
vllm/envs.py
+8
-0
vllm/forward_context.py
vllm/forward_context.py
+17
-4
No files found.
vllm/envs.py
View file @
49bfc538
...
...
@@ -92,6 +92,7 @@ if TYPE_CHECKING:
VLLM_ALLOW_RUNTIME_LORA_UPDATING
:
bool
=
False
VLLM_SKIP_P2P_CHECK
:
bool
=
False
VLLM_DISABLED_KERNELS
:
list
[
str
]
=
[]
VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION
:
bool
=
False
VLLM_USE_V1
:
bool
=
True
VLLM_ROCM_USE_AITER
:
bool
=
False
VLLM_ROCM_USE_AITER_PAGED_ATTN
:
bool
=
False
...
...
@@ -745,6 +746,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
[]
if
"VLLM_DISABLED_KERNELS"
not
in
os
.
environ
else
os
.
environ
[
"VLLM_DISABLED_KERNELS"
].
split
(
","
),
# Swaps the all reduce backend that we use to coordinate the DP padding
# information from NCCL to gloo.
"VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION"
:
lambda
:
(
os
.
getenv
(
"VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set, use the V1 code path.
"VLLM_USE_V1"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_V1"
,
"1"
))),
...
...
vllm/forward_context.py
View file @
49bfc538
...
...
@@ -13,6 +13,7 @@ import torch.distributed as dist
import
vllm.envs
as
envs
from
vllm.config
import
CUDAGraphMode
,
ParallelConfig
,
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
if
TYPE_CHECKING
:
from
vllm.attention.backends.abstract
import
AttentionMetadata
...
...
@@ -75,14 +76,26 @@ class DPMetadata:
Gather the num_tokens across all DP ranks and return results in a
CPU tensor of size dp_size.
"""
from
vllm.distributed.parallel_state
import
get_dp_group
device
=
current_platform
.
device_type
group
=
get_dp_group
().
device_group
# Transfering this tensor from GPU to CPU will introduce a GPU sync
# point that could adversely affect performance of vllm with asynch
# scheduling. This environment variable exists to quickly disable
# this optimization if we run into this case.
if
envs
.
VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION
:
logger
.
info_once
(
"Using CPU all reduce to syncronize DP padding between ranks."
)
device
=
"cpu"
group
=
get_dp_group
().
cpu_group
num_tokens_across_dp
=
[
0
]
*
dp_size
num_tokens_across_dp
[
dp_rank
]
=
num_tokens
num_tokens_tensor
=
torch
.
tensor
(
num_tokens_across_dp
,
device
=
"cpu"
,
device
=
device
,
dtype
=
torch
.
int32
)
from
vllm.distributed.parallel_state
import
get_dp_group
dist
.
all_reduce
(
num_tokens_tensor
,
group
=
get_dp_group
().
cpu_group
)
return
num_tokens_tensor
dist
.
all_reduce
(
num_tokens_tensor
,
group
=
group
)
return
num_tokens_tensor
.
cpu
()
@
staticmethod
def
make
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment