Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7ea22e42
Unverified
Commit
7ea22e42
authored
Aug 26, 2025
by
nvjullin
Committed by
GitHub
Aug 26, 2025
Browse files
[Misc] Add override for allreduce fusion thresholds (#23639)
Signed-off-by:
Julien Lin
<
jullin@nvidia.com
>
parent
9d4183dd
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
24 additions
and
0 deletions
+24
-0
vllm/compilation/collective_fusion.py
vllm/compilation/collective_fusion.py
+13
-0
vllm/envs.py
vllm/envs.py
+11
-0
No files found.
vllm/compilation/collective_fusion.py
View file @
7ea22e42
...
...
@@ -10,6 +10,7 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
torch.distributed._symmetric_memory
import
enable_symm_mem_for_group
import
vllm.envs
as
envs
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tp_group
,
tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
...
...
@@ -401,6 +402,18 @@ if flashinfer_comm is not None:
6
:
MiB
//
2
,
# 512KB
8
:
MiB
//
2
,
# 512KB
}
# Layer the user-supplied overrides on top of the built-in thresholds.
# The environment variable maps <world size> -> <max size in MB>; each
# value is converted to a byte count before being stored. The complete
# override dict is built before update() runs, so a malformed entry
# leaves _FI_MAX_SIZES unchanged.
try:
    _FI_MAX_SIZES.update({
        int(world_size): int(float(max_size_mb) * MiB)
        for world_size, max_size_mb in
        envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items()
    })
except Exception as err:
    # Re-raise as a ValueError that names the offending variable, keeping
    # the original exception chained as the cause.
    raise ValueError(
        "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: "
        f"{err}") from err

# Conservative fallback threshold (half a MiB) used when the world size
# has no entry in _FI_MAX_SIZES.
_DEFAULT_FI_MAX_SIZE = MiB // 2
...
...
vllm/envs.py
View file @
7ea22e42
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
hashlib
import
json
import
os
import
sys
import
tempfile
...
...
@@ -1046,6 +1047,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE"
,
"163840"
)),
# Specifies the thresholds of the communicated tensor sizes under which
# vLLM should use the FlashInfer fused allreduce. The variable should be a
# JSON object with the following format:
#     { <world size>: <max size in MB> }
# JSON object keys must be quoted strings, e.g. '{"2": 32, "8": 0.25}'.
# Unspecified world sizes will fall back to
#     { 2: 64, 4: 1, <everything else>: 0.5 }
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB"
:
lambda
:
json
.
loads
(
os
.
getenv
(
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB"
,
"{}"
)),
# MoE routing strategy selector.
# See `RoutingSimulator.get_available_strategies()` for available
# strategies.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment