Remove redundant mutates_args and dispatch_key for direct_register_custom_op (#25512)

Signed-off-by: mgoin <mgoin64@gmail.com>

Remove redundant mutates_args and dispatch_key for direct_register_custom_op (#25512)
Signed-off-by: mgoin <mgoin64@gmail.com>
7361ab37 · Michael Goin · GitHub · 95bc60e4 · 7361ab37 · 7361ab37
Unverified commit 7361ab37, authored Sep 23, 2025 by Michael Goin; committed by GitHub on Sep 23, 2025.
20 changed files
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -575,9 +575,7 @@ def unified_attention_fake(
 direct_register_custom_op(
    op_name="unified_attention",
    op_func=unified_attention,
-    mutates_args=[],
    fake_impl=unified_attention_fake,
-    dispatch_key=current_platform.dispatch_key,
    tags=tag_cudagraph_unsafe,
 )
@@ -628,6 +626,5 @@ direct_register_custom_op(
    op_func=unified_attention_with_output,
    mutates_args=["output", "output_block_scale"],
    fake_impl=unified_attention_with_output_fake,
-    dispatch_key=current_platform.dispatch_key,
    tags=tag_cudagraph_unsafe,
 )
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -547,7 +547,6 @@ if flashinfer_comm is not None:
            "scale_out",
        ],
        fake_impl=call_trtllm_fused_allreduce_norm_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    flashinfer_trtllm_fused_allreduce_norm = (
        torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default)

--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -46,7 +46,6 @@ def register_nccl_symmetric_ops(pynccl_comm):
    direct_register_custom_op(
        op_name="all_reduce_symmetric_with_copy",
        op_func=all_reduce_symmetric_with_copy_impl,
-        mutates_args=[],
        fake_impl=all_reduce_symmetric_with_copy_fake,
    )

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -149,29 +149,22 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int,
 if supports_custom_op():
-    from vllm.platforms import current_platform
    direct_register_custom_op(
        op_name="all_reduce",
        op_func=all_reduce,
-        mutates_args=[],
        fake_impl=all_reduce_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
        op_name="reduce_scatter",
        op_func=reduce_scatter,
-        mutates_args=[],
        fake_impl=reduce_scatter_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
        op_name="all_gather",
        op_func=all_gather,
-        mutates_args=[],
        fake_impl=all_gather_fake,
-        dispatch_key=current_platform.dispatch_key,
    )

--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -11,7 +11,6 @@ import torch
 from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel
 from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op
@@ -283,7 +282,6 @@ try:
        op_func=_lora_expand,
        mutates_args=["output_tensor"],
        fake_impl=_lora_expand_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    lora_expand = torch.ops.vllm.lora_expand

--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -11,7 +11,6 @@ import torch
 from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel
 from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op
@@ -237,7 +236,6 @@ try:
        op_func=_lora_shrink,
        mutates_args=["output_tensor"],
        fake_impl=_lora_shrink_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    lora_shrink = torch.ops.vllm.lora_shrink

--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -92,7 +92,6 @@ def flashinfer_fused_moe_blockscale_fp8_fake(
 direct_register_custom_op(
    op_name="flashinfer_fused_moe_blockscale_fp8",
    op_func=flashinfer_fused_moe_blockscale_fp8,
-    mutates_args=[],
    fake_impl=flashinfer_fused_moe_blockscale_fp8_fake,
    tags=(torch.Tag.needs_fixed_stride_order, ),
 )

--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -235,6 +235,5 @@ def fused_marlin_moe_fake(hidden_states: torch.Tensor,
 direct_register_custom_op(
    op_name="fused_marlin_moe",
    op_func=fused_marlin_moe,
-    mutates_args=[],
    fake_impl=fused_marlin_moe_fake,
 )
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1256,7 +1256,6 @@ def outplace_fused_experts_fake(
 direct_register_custom_op(
    op_name="outplace_fused_experts",
    op_func=outplace_fused_experts,
-    mutates_args=[],
    fake_impl=outplace_fused_experts_fake,
    tags=(() if is_torch_equal_or_newer("2.7.0") else
          (torch.Tag.needs_fixed_stride_order, )),

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -2040,7 +2040,6 @@ direct_register_custom_op(
    op_func=moe_forward,
    mutates_args=["hidden_states"],
    fake_impl=moe_forward_fake,
-    dispatch_key=current_platform.dispatch_key,
    tags=(torch.Tag.needs_fixed_stride_order, ),
 )
@@ -2071,7 +2070,6 @@ direct_register_custom_op(
    op_func=moe_forward_shared,
    mutates_args=["hidden_states"],
    fake_impl=moe_forward_shared_fake,
-    dispatch_key=current_platform.dispatch_key,
    tags=(torch.Tag.needs_fixed_stride_order, ),
 )

--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -223,17 +223,13 @@ if current_platform.is_rocm():
    direct_register_custom_op(
        op_name="rocm_aiter_asm_moe_tkw1",
        op_func=rocm_aiter_asm_moe_tkw1_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_asm_moe_tkw1_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
        op_name="rocm_aiter_fused_moe",
        op_func=rocm_aiter_fused_moe_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_fused_moe_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
@@ -241,7 +237,6 @@ if current_platform.is_rocm():
        op_func=rocm_aiter_topk_softmax_impl,
        mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
        fake_impl=rocm_aiter_topk_softmax_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
@@ -249,7 +244,6 @@ if current_platform.is_rocm():
        op_func=rocm_aiter_biased_grouped_topk_impl,
        mutates_args=["topk_weights", "topk_ids"],
        fake_impl=rocm_aiter_biased_grouped_topk_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
@@ -257,7 +251,6 @@ if current_platform.is_rocm():
        op_func=rocm_aiter_grouped_topk_impl,
        mutates_args=["topk_weights", "topk_ids"],
        fake_impl=rocm_aiter_grouped_topk_fake,
-        dispatch_key=current_platform.dispatch_key,
    )

--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -103,17 +103,13 @@ if current_platform.is_rocm():
    direct_register_custom_op(
        op_name="rocm_aiter_rms_norm",
        op_func=rocm_aiter_rms_norm_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_rms_norm_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    direct_register_custom_op(
        op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
        op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake,
-        dispatch_key=current_platform.dispatch_key,
    )

--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
    MambaStateDtypeCalculator, MambaStateShapeCalculator)
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
@@ -401,5 +400,4 @@ direct_register_custom_op(
    op_func=linear_attention,
    mutates_args=["output"],
    fake_impl=linear_attention_fake,
-    dispatch_key=current_platform.dispatch_key,
 )
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
    selective_scan_fn, selective_state_update)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
@@ -464,5 +463,4 @@ direct_register_custom_op(
    op_func=mamba_mixer,
    mutates_args=["output"],
    fake_impl=mamba_mixer_fake,
-    dispatch_key=current_platform.dispatch_key,
 )
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import (
    LoaderFunction, composed_weight_loader, sharded_weight_loader)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
@@ -765,5 +764,4 @@ direct_register_custom_op(
    op_func=mamba_mixer2,
    mutates_args=["output"],
    fake_impl=mamba_mixer2_fake,
-    dispatch_key=current_platform.dispatch_key,
 )
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
    MambaStateDtypeCalculator, MambaStateShapeCalculator)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
    causal_conv1d_fn, causal_conv1d_update)
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 from vllm.v1.attention.backends.short_conv_attn import (
    ShortConvAttentionMetadata)
@@ -251,5 +250,4 @@ direct_register_custom_op(
    op_func=short_conv,
    mutates_args=["output"],
    fake_impl=short_conv_fake,
-    dispatch_key=current_platform.dispatch_key,
 )
--- a/vllm/model_executor/layers/quantization/deepgemm.py
+++ b/vllm/model_executor/layers/quantization/deepgemm.py
@@ -4,7 +4,6 @@ import logging
 import torch
-from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils import direct_register_custom_op
 from vllm.utils.deep_gemm import fp8_gemm_nt
@@ -75,7 +74,5 @@ def w8a8_deepgemm_block_scaled_mm_fake(
 direct_register_custom_op(
    op_name="w8a8_deepgemm_block_scaled_mm",
    op_func=w8a8_deepgemm_block_scaled_mm,
-    mutates_args=[],
    fake_impl=w8a8_deepgemm_block_scaled_mm_fake,
-    dispatch_key=current_platform.dispatch_key,
 )
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -161,7 +161,6 @@ try:
    direct_register_custom_op(
        op_name="_fused_mul_mat_gguf",
        op_func=_fused_mul_mat_gguf,
-        mutates_args=[],
        fake_impl=_fused_mul_mat_gguf_fake,
    )
    fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf
@@ -273,7 +272,6 @@ try:
    direct_register_custom_op(
        op_name="_fused_moe_gguf",
        op_func=_fused_moe_gguf,
-        mutates_args=[],
        fake_impl=_fused_moe_gguf_fake,
    )
    fused_moe_gguf = torch.ops.vllm._fused_moe_gguf
@@ -319,7 +317,6 @@ try:
    direct_register_custom_op(
        op_name="_apply_gguf_embedding",
        op_func=_apply_gguf_embedding,
-        mutates_args=[],
        fake_impl=_apply_gguf_embedding_fake,
    )
    apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding

--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -51,9 +51,7 @@ if current_platform.is_rocm():
    direct_register_custom_op(
        op_name="rocm_aiter_gemm_w8a8",
        op_func=rocm_aiter_gemm_w8a8_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_gemm_w8a8_fake,
-        dispatch_key=current_platform.dispatch_key,
    )

--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -91,9 +91,7 @@ if current_platform.is_rocm():
    direct_register_custom_op(
        op_name="rocm_aiter_gemm_w8a8_blockscale",
        op_func=rocm_aiter_gemm_w8a8_blockscale_impl,
-        mutates_args=[],
        fake_impl=rocm_aiter_gemm_w8a8_blockscale_fake,
-        dispatch_key=current_platform.dispatch_key,
    )
    if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_LINEAR
            and current_platform.is_fp8_fnuz()):
@@ -135,7 +133,6 @@ def _w8a8_triton_block_scaled_mm_fake(
 direct_register_custom_op(
    "w8a8_triton_block_scaled_mm_func",
    _w8a8_triton_block_scaled_mm_func,
-    mutates_args=[],
    fake_impl=_w8a8_triton_block_scaled_mm_fake,
    dispatch_key="CUDA",
 )