Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7cfa420f
Unverified
Commit
7cfa420f
authored
Oct 14, 2025
by
Angela Yi
Committed by
GitHub
Oct 15, 2025
Browse files
[BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by:
angelayi
<
yiangela7@gmail.com
>
parent
a27b288e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
118 additions
and
0 deletions
+118
-0
vllm/env_override.py
vllm/env_override.py
+118
-0
No files found.
vllm/env_override.py
View file @
7cfa420f
...
...
@@ -3,6 +3,7 @@
import
os
import
torch
from
packaging
import
version
from
vllm.logger
import
init_logger
...
...
@@ -21,3 +22,120 @@ os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
os
.
environ
[
"TORCHINDUCTOR_COMPILE_THREADS"
]
=
"1"
# see https://github.com/vllm-project/vllm/issues/10619
torch
.
_inductor
.
config
.
compile_threads
=
1
# ========================================
# torch 2.9 Inductor Scheduler monkeypatch
# ========================================
# This change monkeypatches a function in Inductor to work around the following
# bug: https://github.com/vllm-project/vllm/issues/26678
#
# The bug occurs when `use_inductor_graph_partition` is turned on and there
# exists operators inside of `splitting_ops` that have an in-place mutation. In
# vllm, this specifically occurs on the operator
# vllm.unified_attention_with_output. In this case, inductor does not populate
# the inductor IR's `origin_node` field, causing an assertion error when trying
# to access the node's `origin_node` field.
#
# So, we will monkeypatch torch._inductor.scheduler.Scheduler.should_partition
# so that it does not access the inductor IR node's `origin_node` field and just
# returns True if a node is registered as having a custom partition function.
# This is ok for now since vllm's implementation of the custom partition
# functions just return True.
# ========================================
def
should_partition_patched
(
self
,
node
,
should_log
:
bool
=
False
)
->
bool
:
# This is a patched version of
# torch._inductor.scheduler.Scheduler.should_partition that modifies
# the following piece of code so that we always return True:
# https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724
"""Return True if we should partition the inductor graph on this node"""
import
torch._inductor.ir
as
ir
from
torch._inductor.scheduler
import
(
BaseSchedulerNode
,
FusedSchedulerNode
,
_custom_should_partition_fns
,
)
from
torch._inductor.utils
import
(
_unstable_customized_partition_wrapper
,
is_cudagraph_unsafe_op
,
maybe_log_cudagraph_partition
,
)
# Allow users to manually specify if a node should be partitioned
# Can only do this for FallbackKernels
ir_node
=
node
.
node
if
isinstance
(
ir_node
,
ir
.
FallbackKernel
):
operator
=
ir_node
.
op_overload
if
operator
is
not
None
and
operator
in
_custom_should_partition_fns
:
return
True
# When not using cudagraphs, keep all kernels in the `call` function
# instead of graph partition functions, since graph partition only brings
# benefit to cudagraph
if
(
not
torch
.
_inductor
.
config
.
triton
.
cudagraphs
and
_unstable_customized_partition_wrapper
.
wrapper
is
None
):
return
True
# avoid duplicating logs when should_partition is called multiple times
# on the same node
def
noop_log
(
msg
:
str
,
node
:
BaseSchedulerNode
|
None
)
->
None
:
return
log_partition_reason
=
maybe_log_cudagraph_partition
if
should_log
else
noop_log
if
isinstance
(
node
,
FusedSchedulerNode
):
return
any
(
self
.
should_partition
(
snode
)
for
snode
in
node
.
snodes
)
assert
node
.
node
is
not
None
if
not
node
.
is_gpu
():
log_partition_reason
(
"non gpu ops"
,
node
=
node
)
return
True
if
isinstance
(
node
.
node
,
ir
.
DeviceCopy
):
log_partition_reason
(
"DeviceCopy ops"
,
node
=
node
)
return
True
if
isinstance
(
node
.
node
,
ir
.
Conditional
):
log_partition_reason
(
"Conditional ops"
,
node
=
node
)
return
True
if
getattr
(
node
.
node
,
"unbacked_bindings"
,
None
):
log_partition_reason
(
"unbacked binding ops"
,
node
=
node
)
return
True
if
is_cudagraph_unsafe_op
(
node
.
node
):
log_partition_reason
(
"CUDAGraph-unsafe custom ops"
,
node
=
node
)
return
True
return
False
def
_update_scheduler_patched
(
self
)
->
None
:
# Copied from torch._inductor.graph.GrahLowering._update_scheduler. Patches
# this method so that we can patch Scheduler.should_partition with the
# function above
"""
(Re)initializes the scheduler member. When initializing the scheduler, no CUBIN
files should be generated (to avoid biasing any benchmarks and pessimizing
fusion decisions).
"""
import
torch._inductor.config
as
config
from
torch._inductor.scheduler
import
Scheduler
Scheduler
.
should_partition
=
should_partition_patched
with
config
.
patch
(
"triton.store_cubin"
,
False
):
self
.
scheduler
=
Scheduler
(
self
.
operations
)
if
version
.
parse
(
str
(
torch
.
__version__
))
==
version
.
parse
(
"2.9.0"
):
from
torch._inductor.graph
import
GraphLowering
GraphLowering
.
_update_scheduler
=
_update_scheduler_patched
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment