Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2a30a2d
Unverified
Commit
d2a30a2d
authored
Sep 18, 2025
by
Wentao Ye
Committed by
GitHub
Sep 18, 2025
Browse files
[Bug] Fix torch Compilation Cache Hit Error (#25093)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
75fb112d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
10 additions
and
19 deletions
+10
-19
vllm/config/compilation.py
vllm/config/compilation.py
+0
-12
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+10
-7
No files found.
vllm/config/compilation.py
View file @
d2a30a2d
...
@@ -563,18 +563,6 @@ class CompilationConfig:
...
@@ -563,18 +563,6 @@ class CompilationConfig:
self
.
cudagraph_mode
=
CUDAGraphMode
.
FULL
self
.
cudagraph_mode
=
CUDAGraphMode
.
FULL
self
.
splitting_ops
=
[]
self
.
splitting_ops
=
[]
if
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
:
# exclude MoE dispatch/combine from capture by ensuring
# piecewise splitting includes them, so communication remains
# outside CUDA graphs while compute can still be graphed.
moe_ops
=
[
"vllm.moe_forward"
,
"vllm.moe_forward_shared"
,
]
for
op
in
moe_ops
:
if
op
not
in
self
.
splitting_ops
:
self
.
splitting_ops
.
append
(
op
)
def
splitting_ops_contain_attention
(
self
)
->
bool
:
def
splitting_ops_contain_attention
(
self
)
->
bool
:
return
self
.
splitting_ops
is
not
None
and
all
(
return
self
.
splitting_ops
is
not
None
and
all
(
op
in
self
.
splitting_ops
for
op
in
self
.
_attention_ops
)
op
in
self
.
splitting_ops
for
op
in
self
.
_attention_ops
)
vllm/platforms/cuda.py
View file @
d2a30a2d
...
@@ -191,14 +191,17 @@ class CudaPlatformBase(Platform):
...
@@ -191,14 +191,17 @@ class CudaPlatformBase(Platform):
compilation_config
=
vllm_config
.
compilation_config
compilation_config
=
vllm_config
.
compilation_config
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
and
parallel_config
.
data_parallel_size
>
1
and
parallel_config
.
data_parallel_size
>
1
and
compilation_config
.
cudagraph_mode
and
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
):
not
in
[
CUDAGraphMode
.
NONE
,
CUDAGraphMode
.
PIECEWISE
]):
# TODO: Piecewise Cuda graph might be enabled
# if torch compile cache key issue fixed
# See https://github.com/vllm-project/vllm/pull/25093
logger
.
info
(
logger
.
info
(
"Data Parallel with DeepEP high-throughput: using PIECEWISE "
"Data Parallel: disabling cudagraphs since DP "
"CUDA graphs and excluding MoE ops from capture. Set "
"with DeepEP high-throughput kernels are not CUDA Graph "
"VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
"compatible. The DeepEP low-latency kernels are CUDA Graph "
"graphs captured as well."
)
"compatible. Set the all_to_all backend to deepep_low_latency "
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
PIECEWISE
"to use those kernels instead."
)
compilation_config
.
cudagraph_mode
=
CUDAGraphMode
.
NONE
@
classmethod
@
classmethod
def
get_current_memory_usage
(
cls
,
def
get_current_memory_usage
(
cls
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment