[torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>

[torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
f98548b9 · Luka Govedič · GitHub · 96846bb3 · f98548b9 · f98548b9
Unverified Commit f98548b9 authored Jun 12, 2025 by Luka Govedič Committed by GitHub Jun 12, 2025
13 changed files
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
+# SPDX-License-Identifier: Apache-2.0
+import torch
+import torch._inductor.pattern_matcher as pm
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from torch._subclasses.fake_tensor import (FakeTensorMode,
+                                           unset_fake_temporarily)
+from vllm.attention import Attention
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from .fusion import QUANT_OPS, GroupShape, QuantKey, empty_bf16, empty_fp32
+from .vllm_inductor_pass import VllmInductorPass
+logger = init_logger(__name__)
+ATTN_OP = torch.ops.vllm.unified_attention_with_output.default
+RESHAPE_OP = torch.ops.aten.reshape.default
+class AttentionStaticQuantPattern:
+    """
+    Pattern/replacement pair that folds a static, per-tensor output
+    quantization op into the attention op that precedes it.
+    Each instance is bound to one attention layer: the layer name is passed
+    to the attention op as a constant inside the traced pattern.
+    """
+    def __init__(
+        self,
+        layer_name: str,
+        num_heads: int,
+        head_size: int,
+        quant_dtype: torch.dtype,
+        symmetric=True,
+    ):
+        """
+        Args:
+            layer_name: value passed as `layer_name` to the attention op.
+            num_heads: number of heads, used in the reshape shapes.
+            head_size: size of each head, used in the reshape shapes.
+            quant_dtype: dtype of the quantized output tensor.
+            symmetric: forwarded to the QuantKey used to look up the quant op.
+        """
+        self.layer_name = layer_name
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.quant_dtype = quant_dtype
+        # Only static, per-tensor quantization is described by this pattern.
+        self.quant_key = QuantKey(dtype=quant_dtype,
+                                  static=True,
+                                  group_shape=GroupShape.PER_TENSOR,
+                                  symmetric=symmetric)
+        assert self.quant_key in QUANT_OPS, \
+            f"unsupported quantization scheme {self.quant_key}"
+        # The quantization op that the pattern will look for.
+        self.QUANT_OP = QUANT_OPS[self.quant_key]
+    def empty_quant(self, *args, **kwargs):
+        """
+        Create an uninitialized tensor with torch.empty, defaulting to the
+        quantized dtype and the "cuda" device. Explicitly passed kwargs
+        override these defaults.
+        """
+        kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
+        return torch.empty(*args, **kwargs)
+    def register_if_supported(self, pm_pass: PatternMatcherPass,
+                              layer: Attention):
+        """
+        Register this pattern on `pm_pass` only if the layer's attention
+        implementation reports support for fused output quantization with
+        this dtype, static flag and group shape. Otherwise do nothing.
+        """
+        if layer.impl.fused_output_quant_supported(self.quant_dtype,
+                                                   self.quant_key.static,
+                                                   self.quant_key.group_shape):
+            self._register(pm_pass)
+    def _register(self, pm_pass: PatternMatcherPass):
+        """
+        Trace the pattern and replacement below with example inputs and
+        register them on `pm_pass`.
+        """
+        # Pattern: attention writes unquantized output (output_scale=None),
+        # which is reshaped to 2D and then quantized into output_quant.
+        def pattern(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    output_attn: torch.Tensor, output_quant: torch.Tensor,
+                    scale: torch.Tensor):
+            view_7 = RESHAPE_OP(output_attn,
+                                [-1, self.num_heads, self.head_size])
+            at1 = auto_functionalized(ATTN_OP,
+                                      query=q,
+                                      key=k,
+                                      value=v,
+                                      output=view_7,
+                                      layer_name=self.layer_name,
+                                      output_scale=None)
+            attn_out_view = RESHAPE_OP(at1[1],
+                                       [-1, self.num_heads * self.head_size])
+            at2 = auto_functionalized(self.QUANT_OP,
+                                      result=output_quant,
+                                      input=attn_out_view,
+                                      scale=scale)
+            return at2[1]
+        # Replacement: attention writes directly into the (reshaped) quantized
+        # output buffer and receives the scale; the separate quant op is gone.
+        def replacement(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                        output_attn: torch.Tensor, output_quant: torch.Tensor,
+                        scale: torch.Tensor):
+            view_7 = RESHAPE_OP(output_quant,
+                                [-1, self.num_heads, self.head_size])
+            at1 = auto_functionalized(ATTN_OP,
+                                      query=q,
+                                      key=k,
+                                      value=v,
+                                      output=view_7,
+                                      layer_name=self.layer_name,
+                                      output_scale=scale)
+            return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
+        # Need custom fake mode, otherwise tracing happens with real tensors.
+        # That would not work for the unified_attention custom op.
+        with unset_fake_temporarily(), FakeTensorMode():
+            # Example inputs, in the same order as the pattern's parameters.
+            inputs = [
+                empty_bf16(5, self.num_heads, self.head_size),  # q
+                empty_bf16(5, self.num_heads, self.head_size),  # k
+                empty_bf16(5, self.num_heads, self.head_size),  # v
+                empty_bf16(5, self.num_heads * self.head_size),  # attn_output
+                self.empty_quant(5, self.num_heads *
+                                 self.head_size),  # quant_output
+                empty_fp32(1, 1)  # scale
+            ]
+            # Compose a trace function with a post-processing step that is
+            # applied to the traced graph module.
+            def wrap_trace_fn(process_fx, trace_fn):
+                def wrapped(*args, **kwargs):
+                    return process_fx(trace_fn(*args, **kwargs))
+                return wrapped
+            # Run inductor's view_to_reshape on the traced graph in place.
+            # NOTE(review): presumably so the traced pattern uses reshape
+            # nodes like the graphs it is matched against - confirm.
+            def fx_view_to_reshape(gm: torch.fx.GraphModule):
+                from torch._inductor.fx_passes.post_grad import view_to_reshape
+                view_to_reshape(gm)
+                return gm
+            pm.register_replacement(
+                pattern, replacement, inputs,
+                wrap_trace_fn(fx_view_to_reshape, pm.fwd_only), pm_pass)
+class AttnFusionPass(VllmInductorPass):
+    """
+    This pass fuses post-attention quantization onto attention if supported.
+    It uses the pattern matcher and matches each layer manually, as strings
+    cannot be wildcarded. This also lets us check support on attention layers
+    upon registration instead of during pattern matching.
+    Currently, only static fp8 quant is supported, but patterns could easily be
+    added for other quant schemes and dtypes. The bigger hurdle for wider
+    support are attention kernels, which need to support fusing output quant.
+    """
+    def __init__(self, config: VllmConfig):
+        """
+        Build one AttentionStaticQuantPattern per entry in the compilation
+        config's static forward context and register it if the layer supports
+        fused output quantization.
+        """
+        super().__init__(config)
+        self.static_fwd_ctx = config.compilation_config.static_forward_context
+        self.patterns = PatternMatcherPass(pass_name="attn_fusion_pass")
+        # The context key is used as the layer name in the pattern.
+        for key, layer in self.static_fwd_ctx.items():
+            pattern = AttentionStaticQuantPattern(key, layer.num_heads,
+                                                  layer.head_size,
+                                                  current_platform.fp8_dtype())
+            pattern.register_if_supported(self.patterns, layer)
+        # With an empty context the loop above registered nothing, so the
+        # pass cannot fuse anything; tell the user why.
+        if len(self.static_fwd_ctx) == 0:
+            logger.warning(
+                "Attention + quant fusion is enabled, but "
+                "CompilationConfig.static_forward_context is empty. "
+                "Cannot access attention layers so no fusion "
+                "patterns were registered.")
+    def __call__(self, graph: torch.fx.graph.Graph) -> None:
+        """
+        Apply the registered patterns to `graph` in place, dumping the graph
+        before and after and logging the number of matches.
+        """
+        # begin()/end_and_log() come from VllmInductorPass (not shown here).
+        self.begin()
+        self.dump_graph(graph, "before_attn_fusion")
+        count = self.patterns.apply(graph)
+        logger.debug("Fused quantization onto %s attention nodes", count)
+        self.dump_graph(graph, "after_attn_fusion")
+        self.end_and_log()
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import operator
-from collections.abc import Iterable
+from collections.abc import Iterable, Iterator
 from typing import Optional
 from torch import fx
@@ -14,6 +14,10 @@ def is_func(node: fx.Node, target) -> bool:
    return node.op == "call_function" and node.target == target
+def is_auto_func(node: fx.Node, op: OpOverload) -> bool:
+    """
+    Return True if `node` is a call to auto_functionalized whose first
+    positional argument is `op`.
+    """
+    return is_func(node, auto_functionalized) and node.args[0] == op
 # Returns the first specified node with the given op (if it exists)
 def find_specified_fn_maybe(nodes: Iterable[fx.Node],
                            op: OpOverload) -> Optional[fx.Node]:
@@ -60,3 +64,21 @@ def find_getitem(node: fx.Node, idx: int) -> fx.Node:
    ret = find_getitem_maybe(node, idx)
    assert ret is not None, f"Could not find getitem {idx} in node {node}"
    return ret
+# An auto-functionalization-aware utility for finding nodes with a specific op
+def find_op_nodes(op: OpOverload, graph: fx.Graph) -> Iterator[fx.Node]:
+    """
+    Yield the nodes in `graph` that call `op`, either directly or wrapped in
+    auto_functionalized.
+    """
+    # Direct call_function nodes are only searched for when the op's schema
+    # is not mutable.
+    if not op._schema.is_mutable:
+        yield from graph.find_nodes(op="call_function", target=op)
+    # auto_functionalized nodes carry the wrapped op as their first argument.
+    for n in graph.find_nodes(op="call_function", target=auto_functionalized):
+        if n.args[0] == op:
+            yield n
+# Asserts that the node only has one user and returns it
+# Even if a node has only 1 user, it might share storage with another node,
+# which might need to be taken into account.
+def get_only_user(node: fx.Node) -> fx.Node:
+    """
+    Return the single user of `node`.
+    Raises AssertionError if the node does not have exactly one user.
+    """
+    assert len(node.users) == 1
+    return next(iter(node.users))
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -23,7 +23,23 @@ class NoOpEliminationPass(VllmInductorPass):
    in the 2D-case. Additionally, torch internal no-op elimination pass does
    not handle certain slice variants.
+    Cases handled:
+      1. A chain of reshapes is equivalent to the last reshape called on the
+      base tensor (input of the first reshape).
+      2. A reshape that produces the shape of the input is redundant
+      3. A slice that produces the shape of the input is redundant
    Example graph 1:
+    mul_1: "f16[s0, 4096]" = ...
+    view_1: "f16[s0, 128, 32]" = torch.reshape(mul_1, [-1, 128, 32])
+    view_2: "f16[s0, 4096]" = torch.reshape(view_1, [-1, 4096])
+    view_3: "f16[s0, 128, 32]" = torch.reshape(view_2, [-1, 128, 32])
+    Can be replaced with:
+    mul_1: "f16[s0, 4096]" = ...
+    view_3: "f16[s0, 128, 32]" = torch.reshape(mul_1, [-1, 128, 32])
+    Example graph 2:
    getitem_1: "f16[s0, 4096]" = ...
    view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096])
    at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...)
@@ -34,7 +50,7 @@ class NoOpEliminationPass(VllmInductorPass):
    at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...)
    out: "f8e4m3fn[s0, 4096]" = at[1]
-    Example graph 2:
+    Example graph 3:
    arg0: "s0" = SymInt(s0)
    scaled_mm: "f16[s0, 4096]" = ...
    slice_1: "f16[s0, 4096]" = torch.slice(scaled_mm, -1, 0, arg0)
@@ -58,6 +74,18 @@ class NoOpEliminationPass(VllmInductorPass):
        # Remove no-op reshapes/views:
        for node in graph.nodes:
            if is_func(node, torch.ops.aten.reshape.default):
+                # Case 1: rewrite reshape chains to reshapes on the base tensor
+                input = node.args[0]
+                # If the input is a reshape, rebind to that node
+                if is_func(input, torch.ops.aten.reshape.default):
+                    # The new input is guaranteed not to be a reshape,
+                    # because we process nodes in order
+                    node.update_arg(0, input.args[0])
+                    if len(input.users) == 0:
+                        graph.erase_node(input)
+                        count += 1
+                # Case 2: remove this reshape if it produces the original shape
                input, shape = node.args[:2]
                input_shape = input.meta["val"].shape
                if len(shape) != len(input_shape):

--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -10,6 +10,7 @@ from .activation_quant_fusion import ActivationQuantFusionPass
 from .collective_fusion import AsyncTPPass
 from .fix_functionalization import FixFunctionalizationPass
 from .fusion import FusionPass
+from .fusion_attn import AttnFusionPass
 from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context
 from .noop_elimination import NoOpEliminationPass
 from .sequence_parallelism import SequenceParallelismPass
@@ -59,6 +60,9 @@ class PostGradPassManager(CustomGraphPass):
            if self.pass_config.enable_async_tp:
                self.passes += [AsyncTPPass(config)]
+        if self.pass_config.enable_attn_fusion:
+            self.passes += [AttnFusionPass(config)]
        self.fix_functionalization = FixFunctionalizationPass(config)
    def add(self, pass_: InductorPass):

--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -4,6 +4,7 @@
 import time
 import torch
+from torch._dynamo.utils import lazy_format_graph_code
 from vllm.config import PassConfig, VllmConfig
 # yapf: disable
@@ -34,6 +35,8 @@ class VllmInductorPass(InductorPass):
        self.pass_name = self.__class__.__name__
    def dump_graph(self, graph: torch.fx.Graph, stage: str, always=False):
+        lazy_format_graph_code(stage, graph.owning_module)
        if stage in self.pass_config.dump_graph_stages or always:
            # Make sure filename includes rank in the distributed setting
            parallel = p_is_init() and get_tp_world_size() > 1

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3804,9 +3804,10 @@ class PassConfig:
    its own stages (before, after, maybe in-between)."""
    dump_graph_dir: Path = Path(".")
    """Directory to dump the graphs."""
-    # TODO(luka) better pass enabling system.
    enable_fusion: bool = True
-    """Whether to enable the custom fusion pass."""
+    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
+    enable_attn_fusion: bool = False
+    """Whether to enable the custom attention+quant fusion pass."""
    enable_noop: bool = True
    """Whether to enable the custom no-op elimination pass."""
    enable_sequence_parallelism: bool = False
@@ -3814,6 +3815,8 @@ class PassConfig:
    enable_async_tp: bool = False
    """Whether to enable async TP."""
+    # TODO(luka) better pass enabling system.
    def uuid(self):
        """
        Produces a hash unique to the pass configuration.
@@ -3821,18 +3824,20 @@ class PassConfig:
        Do not include dump_graph_* in the hash - they don't affect
        compilation.
        """
-        include = {
+        exclude = {"dump_graph_stages", "dump_graph_dir"}
-            "enable_fusion", "enable_noop", "enable_sequence_parallelism",
+        dict_ = {k: v for k, v in asdict(self).items() if k not in exclude}
-            "enable_async_tp"
-        }
-        dict_ = {k: v for k, v in asdict(self).items() if k in include}
        return InductorPass.hash_dict(dict_)
    def __post_init__(self) -> None:
-        if not self.enable_noop and self.enable_fusion:
+        if not self.enable_noop:
-            logger.warning_once(
+            if self.enable_fusion:
-                "Fusion enabled but reshape elimination disabled. "
+                logger.warning_once(
-                "RMSNorm + quant (fp8) fusion might not work")
+                    "Fusion enabled but reshape elimination disabled. "
+                    "RMSNorm/SiluMul + quant (fp8) fusion might not work")
+            if self.enable_attn_fusion:
+                logger.warning_once(
+                    "Fusion enabled but reshape elimination disabled. "
+                    "Attention + quant (fp8) fusion might not work")
 @config

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
-    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_USE_TRITON_FLASH_ATTN: bool = True
    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
    LOCAL_RANK: int = 0

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -569,6 +569,7 @@ class FlashAttentionImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: FlashAttentionMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with FlashAttention.
@@ -586,6 +587,11 @@ class FlashAttentionImpl(AttentionImpl):
        """
        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlashAttentionImpl")
        if attn_metadata is None:
            # Profiling run.
            return output

--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -547,6 +547,7 @@ class FlashInferImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: FlashInferMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with FlashInfer.
@@ -561,6 +562,11 @@ class FlashInferImpl(AttentionImpl):
        """
        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlashInferImpl")
        if attn_metadata is None:
            # Profiling run.
            return output

--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -414,6 +414,7 @@ class FlexAttentionImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: FlexAttentionMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with FLexAttention.
@@ -427,6 +428,12 @@ class FlexAttentionImpl(AttentionImpl):
            shape = [num_tokens, num_heads * head_size]
        """
        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlexAttentionImpl")
        enable_gqa = self.num_kv_heads != self.num_heads
        if attn_metadata is None:

--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -865,10 +865,16 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        kv_cache: torch.Tensor,
        attn_metadata: M,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for MLACommonImpl")
        if attn_metadata is None:
            # The zero fill is required when used with DP + EP
            # to ensure all ranks within a DP group compute the

--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -161,6 +161,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: PallasMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with Pallas attention.
@@ -173,6 +174,11 @@ class PallasAttentionBackendImpl(AttentionImpl):
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for PallasAttentionBackendImpl")
        # For determine_available_memory case.
        if kv_cache.numel() == 0:
            if output is None:

--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -142,6 +142,7 @@ class TritonAttentionImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: FlashAttentionMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with FlashAttention.
@@ -156,6 +157,11 @@ class TritonAttentionImpl(AttentionImpl):
        """
        assert output is not None, "Output tensor must be provided."
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for TritonAttentionImpl")
        if attn_metadata is None:
            # Profiling run.
            return output