Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a55cf41a
Unverified
Commit
a55cf41a
authored
Sep 09, 2025
by
Wentao Ye
Committed by
GitHub
Sep 09, 2025
Browse files
[Compilation][WideEP] Enable Piecewise CUDAGraph for DeepEPHT (#24123)
parent
6fb27881
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
21 additions
and
10 deletions
+21
-10
vllm/config/compilation.py
vllm/config/compilation.py
+14
-1
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+7
-9
No files found.
vllm/config/compilation.py
View file @
a55cf41a
...
@@ -546,7 +546,8 @@ class CompilationConfig:
...
@@ -546,7 +546,8 @@ class CompilationConfig:
# full cudagraph outside the fx graph. This reduces some cpu
# full cudagraph outside the fx graph. This reduces some cpu
# overhead when the runtime batch_size is not cudagraph captured.
# overhead when the runtime batch_size is not cudagraph captured.
# see https://github.com/vllm-project/vllm/pull/20059 for details.
# see https://github.com/vllm-project/vllm/pull/20059 for details.
self
.
splitting_ops
=
self
.
_attention_ops
# make a copy to avoid mutating the class-level list via reference.
self
.
splitting_ops
=
list
(
self
.
_attention_ops
)
elif
len
(
self
.
splitting_ops
)
==
0
:
elif
len
(
self
.
splitting_ops
)
==
0
:
logger
.
warning_once
(
"Using piecewise compilation with empty "
logger
.
warning_once
(
"Using piecewise compilation with empty "
"splitting_ops."
)
"splitting_ops."
)
...
@@ -561,6 +562,18 @@ class CompilationConfig:
...
@@ -561,6 +562,18 @@ class CompilationConfig:
self
.
cudagraph_mode
=
CUDAGraphMode
.
FULL
self
.
cudagraph_mode
=
CUDAGraphMode
.
FULL
self
.
splitting_ops
=
[]
self
.
splitting_ops
=
[]
if
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
:
# exclude MoE dispatch/combine from capture by ensuring
# piecewise splitting includes them, so communication remains
# outside CUDA graphs while compute can still be graphed.
moe_ops
=
[
"vllm.moe_forward"
,
"vllm.moe_forward_shared"
,
]
for
op
in
moe_ops
:
if
op
not
in
self
.
splitting_ops
:
self
.
splitting_ops
.
append
(
op
)
def
splitting_ops_contain_attention
(
self
)
->
bool
:
def
splitting_ops_contain_attention
(
self
)
->
bool
:
return
self
.
splitting_ops
is
not
None
and
all
(
return
self
.
splitting_ops
is
not
None
and
all
(
op
in
self
.
splitting_ops
for
op
in
self
.
_attention_ops
)
op
in
self
.
splitting_ops
for
op
in
self
.
_attention_ops
)
vllm/platforms/cuda.py
View file @
a55cf41a
...
@@ -183,16 +183,14 @@ class CudaPlatformBase(Platform):
...
@@ -183,16 +183,14 @@ class CudaPlatformBase(Platform):
compilation_config
=
vllm_config
.
compilation_config
compilation_config
=
vllm_config
.
compilation_config
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
and
parallel_config
.
data_parallel_size
>
1
and
parallel_config
.
data_parallel_size
>
1
and
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
):
and
compilation_config
.
cudagraph_mode
not
in
[
CUDAGraphMode
.
NONE
,
CUDAGraphMode
.
PIECEWISE
]):
logger
.
info
(
logger
.
info
(
"Data Parallel: disabling cudagraphs since DP "
"Data Parallel with DeepEP high-throughput: using PIECEWISE "
"with DeepEP high-throughput kernels are not CUDA Graph "
"CUDA graphs and excluding MoE ops from capture. Set "
"compatible. The DeepEP low-latency kernels are CUDA Graph "
"VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
"compatible. Set the all_to_all backend to deepep_low_latency "
"graphs captured as well."
)
"to use those kernels instead."
)
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
NONE
if
model_config
is
not
None
:
model_config
.
enforce_eager
=
True
@
classmethod
@
classmethod
def
get_current_memory_usage
(
cls
,
def
get_current_memory_usage
(
cls
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment