remove unused code

2af8e008 · zhuwenwen · 993c31c3 · 993c31c3 · 993c31c3 · 993c31c3
Commit 2af8e008 authored Jan 16, 2026 by zhuwenwen
6 changed files
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-import torch
-import vllm._custom_ops as ops
-from vllm._ipex_ops import ipex_ops
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadataBuilder,
-                                              AttentionType,
-                                              is_quantized_kv_cache)
-from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState
-from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata
-from vllm.utils import make_tensor_with_pad
-from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
-class CPUMLABackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "CPU_MLA"
-    @staticmethod
-    def get_metadata_cls() -> Type["CPUMLAMetadata"]:
-        return CPUMLAMetadata
-    @staticmethod
-    def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]:
-        return CPUMLAMetadataBuilder
-    @staticmethod
-    def get_state_cls() -> Type["MLACommonState"]:
-        return MLACommonState
-    @staticmethod
-    def get_impl_cls() -> Type["CPUMLAImpl"]:
-        return CPUMLAImpl
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,  # assumed to be 1 for MLA
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (num_blocks, block_size, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        ops.copy_blocks_mla(kv_caches, src_to_dists)
-    @staticmethod
-    def get_supported_head_sizes() -> List[int]:
-        return [576]
-@dataclass
-class CPUMLAMetadata(TorchSDPAMetadata):
-    # New for MLA
-    # Input positions for rotrary embeddings since for MLA the rotary
-    # position embeddings are applied inside the attention backend
-    input_positions: torch.Tensor = None
-    # required by MLACommonImpl
-    is_profile_run: bool = False
-class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]):
-    def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
-        self.chunked_prefill = input_builder.chunked_prefill
-        self.input_builder = input_builder
-        assert not self.chunked_prefill, \
-            "chunked prefill is currently not supported"
-    def prepare(self):
-        self.input_data = self.input_builder.input_data
-    def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size):
-        input_data = self.input_data
-        prefill_seq_lens = seq_lens[0:input_data.num_prefills]
-        prefill_query_lens = query_lens[0:input_data.num_prefills]
-        slot_mapping = torch.tensor(input_data.slot_mapping,
-                                    dtype=torch.long,
-                                    device="cpu")
-        # metadata for prefill
-        if input_data.num_prefills > 0:
-            query_lens_tensor = torch.tensor(prefill_query_lens,
-                                             dtype=torch.int32,
-                                             device="cpu")
-            kv_lens_tensor = torch.tensor(prefill_seq_lens,
-                                          dtype=torch.int32,
-                                          device="cpu")
-            query_start_loc = torch.zeros(input_data.num_prefills + 1,
-                                          dtype=torch.int32,
-                                          device="cpu")
-            kv_start_loc = torch.zeros(input_data.num_prefills + 1,
-                                       dtype=torch.int32,
-                                       device="cpu")
-            torch.cumsum(query_lens_tensor,
-                         dim=0,
-                         dtype=torch.int32,
-                         out=query_start_loc[1:])
-            torch.cumsum(kv_lens_tensor,
-                         dim=0,
-                         dtype=torch.int32,
-                         out=kv_start_loc[1:])
-            max_query_len = max(prefill_query_lens)
-            max_kv_len = max(prefill_seq_lens)
-            # for chunked-prefill
-            if self.chunked_prefill:
-                prefill_block_tables = make_tensor_with_pad(
-                    self.input_data.prefill_block_tables,
-                    pad=0,
-                    dtype=torch.int32,
-                    device="cpu",
-                )
-            else:
-                prefill_block_tables = None
-        else:
-            query_start_loc = None
-            kv_start_loc = None
-            max_query_len = None
-            max_kv_len = None
-            prefill_block_tables = None
-        # metadata for decode
-        if input_data.num_decode_tokens != 0:
-            seq_lens_tensor = torch.tensor(
-                input_data.seq_lens[input_data.num_prefills:],
-                dtype=torch.int32,
-                device="cpu",
-            )
-            block_tables = make_tensor_with_pad(
-                self.input_data.decode_block_tables,
-                pad=0,
-                dtype=torch.int32,
-                device="cpu",
-            )
-        else:
-            block_tables = torch.tensor([])
-            seq_lens_tensor = torch.tensor(
-                input_data.seq_lens[:input_data.num_prefills],
-                dtype=torch.int32,
-                device="cpu",
-            )
-        # For multi-modal models
-        placeholder_index_maps = None
-        if len(input_data.multi_modal_inputs_list) != 0:
-            placeholder_index_maps = {
-                modality: placeholder_map.index_map()
-                for modality, placeholder_map in
-                input_data.multi_modal_placeholder_maps.items()
-            }
-        return CPUMLAMetadata(
-            chunked_prefill=self.chunked_prefill,
-            seq_lens=prefill_seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=max_query_len,
-            max_kv_len=max_kv_len,
-            prefill_query_start_loc=query_start_loc,
-            kv_start_loc=kv_start_loc,
-            max_decode_seq_len=input_data.max_decode_seq_len,
-            num_prefills=input_data.num_prefills,
-            num_prefill_tokens=input_data.num_prefill_tokens,
-            num_decode_tokens=input_data.num_decode_tokens,
-            block_tables=block_tables,
-            prefill_block_tables=prefill_block_tables,
-            slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=placeholder_index_maps,
-            enable_kv_scales_calculation=False,
-            input_positions=torch.tensor([self.input_data.input_positions]))
-class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
-    def __init__(
-            self,
-            num_heads: int,
-            head_size: int,
-            scale: float,
-            num_kv_heads: int,
-            alibi_slopes: Optional[List[float]],
-            sliding_window: Optional[int],
-            kv_cache_dtype: str,
-            blocksparse_params: Optional[Dict[str, Any]],
-            logits_soft_cap: Optional[float],
-            attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
-            # MLA Specific Arguments
-            **mla_args) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads,
-                         alibi_slopes, sliding_window, kv_cache_dtype,
-                         blocksparse_params, logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
-        unsupported_features = [
-            alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
-        ]
-        if any(unsupported_features):
-            raise NotImplementedError(
-                "CPUMLAImpl does not support one of the following: "
-                "alibi_slopes, sliding_window, blocksparse_params, "
-                "logits_soft_cap")
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "CPUMLAImpl")
-        # states is implemented.
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "CPUMLAImpl with FP8 KV cache not yet supported")
-    def _forward_prefill(
-            self,
-            q: torch.Tensor,
-            kv_c_normed: torch.Tensor,
-            k_pe: torch.Tensor,
-            kv_c_and_k_pe_cache: torch.Tensor,
-            attn_metadata: CPUMLAMetadata,  # type: ignore[override]
-    ) -> torch.Tensor:
-        prefill_metadata = attn_metadata.prefill_metadata
-        assert prefill_metadata is not None
-        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
-            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-        k_nope, v = kv_nope\
-            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-        k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
-        # For MLA the v head dim is smaller than qk head dim so we pad out
-        # v with 0s to match the qk head dim
-        v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
-                                           value=0)
-        output = torch.empty_like(q)
-        ipex_ops.varlen_attention(
-            query=q,
-            key=k,
-            value=v_padded,
-            out=output,
-            seqlen_q=prefill_metadata.prefill_query_start_loc,
-            seqlen_k=prefill_metadata.prefill_query_start_loc,
-            max_seqlen_q=prefill_metadata.max_query_len,
-            max_seqlen_k=prefill_metadata.max_query_len,
-            pdropout=0.0,
-            softmax_scale=self.scale,
-            zero_tensors=False,
-            is_causal=True,
-            return_softmax=False,
-            gen_=None,
-            logits_soft_cap=0.0,
-            window_size_left=-1,
-            window_size_right=-1,
-            alibi_slopes=None,
-        )
-        # remove padding
-        output = output.view(-1, self.num_heads,
-                             q.shape[-1])[..., :v.shape[-1]]
-        return output.reshape(-1, self.num_heads * v.shape[-1])
-    def _forward_decode(
-            self,
-            q_nope: torch.Tensor,
-            q_pe: torch.Tensor,
-            kv_c_and_k_pe_cache: torch.Tensor,
-            attn_metadata: CPUMLAMetadata,  # type: ignore[override]
-    ) -> torch.Tensor:
-        assert kv_c_and_k_pe_cache.numel() > 0
-        decode_meta = attn_metadata.decode_metadata
-        assert decode_meta is not None
-        q = torch.cat([q_nope, q_pe], dim=-1)
-        o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank)
-        # Run MQA
-        ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale,
-                                   decode_meta.block_tables,
-                                   decode_meta.seq_lens_tensor)
-        return self._v_up_proj(o)
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-""" Attention layer with torch scaled_dot_product_attention
-    and PagedAttention."""
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-import torch
-from vllm._ipex_ops import ipex_ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.attention.ops.paged_attn import (PagedAttention,
-                                           PagedAttentionMetadata)
-from vllm.logger import init_logger
-logger = init_logger(__name__)
-_PARTITION_SIZE = 512
-class IpexAttnBackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "IPEX"
-    @staticmethod
-    def get_impl_cls() -> Type["IpexAttnBackendImpl"]:
-        return IpexAttnBackendImpl
-    @staticmethod
-    def get_metadata_cls() -> Type["IpexAttnMetadata"]:
-        return IpexAttnMetadata
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
-                                                 num_kv_heads, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        from vllm._ipex_ops import ipex_ops as ops
-        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        from vllm._ipex_ops import ipex_ops as ops
-        key_caches = [kv_cache[0] for kv_cache in kv_caches]
-        value_caches = [kv_cache[1] for kv_cache in kv_caches]
-        ops.copy_blocks(key_caches, value_caches, src_to_dists)
-@dataclass
-class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata):
-    """Metadata for IpexAttnBackend.
-    """
-    # Currently, input sequences can only contain all prompts
-    # or all decoding. True if all sequences are prompts.
-    is_prompt: bool
-    slot_mapping: torch.Tensor
-    seq_lens: Optional[List[int]]
-    seqlen_q: Optional[torch.Tensor]
-    max_seqlen: Optional[int]
-    def __post_init__(self):
-        # Set during the execution of the first attention op.
-        # It is a list because it is needed to set per prompt
-        # when alibi slopes is used. It is because of the limitation
-        # from xformer API.
-        # will not appear in the __repr__ and __init__
-        self.attn_bias: Optional[List[torch.Tensor]] = None
-    @property
-    def prefill_metadata(self) -> Optional["IpexAttnMetadata"]:
-        # Currently chunked prefill is not supported
-        if self.num_decode_tokens == 0:
-            assert self.num_prefills > 0
-            return self
-        return None
-    @property
-    def decode_metadata(self) -> Optional["IpexAttnMetadata"]:
-        # Currently chunked prefill is not supported
-        if self.num_prefills > 0:
-            assert self.num_decode_tokens == 0
-            return None
-        return self
-class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Ipex is not supported yet, it will fall"
-                " back to global attention for long context.")
-        if blocksparse_params is not None:
-            raise ValueError(
-                "IPEX backend does not support block-sparse attention.")
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.alibi_slopes = alibi_slopes
-        self.sliding_window = sliding_window
-        self.kv_cache_dtype = kv_cache_dtype
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.need_mask = (self.sliding_window is not None)
-        if logits_soft_cap is None:
-            logits_soft_cap = -1
-        self.logits_soft_cap = logits_soft_cap
-        supported_head_sizes = PagedAttention.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {supported_head_sizes}.")
-        if is_quantized_kv_cache(kv_cache_dtype):
-            raise NotImplementedError(
-                "IPEX backend does not support FP8 KV cache. "
-                "Please use xFormers backend instead.")
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "IpexAttnBackendImpl")
-    def split_kv_cache(
-        self,
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        x = 1
-        num_blocks = kv_cache.shape[1]
-        key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
-                                   -1, x)
-        value_cache = kv_cache[1]
-        value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
-        return key_cache, value_cache
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: IpexAttnMetadata,  # type: ignore
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with IPEX varlen_attention and PagedAttention.
-        Args:
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
-                NOTE: kv_cache will be an empty tensor with shape [0]
-                for profiling run.
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        if output_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for IpexAttentionImpl")
-        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
-        num_tokens, hidden_size = query.shape
-        # Reshape the query, key, and value tensors.
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-        if kv_cache.numel() > 0:
-            key_cache, value_cache = self.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-            ipex_ops.reshape_and_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping.flatten(),
-                self.kv_cache_dtype,
-                layer._k_scale_float,
-                layer._v_scale_float,
-            )
-        if attn_metadata.is_prompt:
-            assert attn_metadata.seq_lens is not None
-            if (kv_cache.numel() == 0
-                    or attn_metadata.block_tables.numel() == 0):
-                if self.num_kv_heads != self.num_heads:
-                    key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
-                    value = value.repeat_interleave(self.num_queries_per_kv,
-                                                    dim=1)
-                if attn_metadata.attn_bias is None:
-                    if self.sliding_window is not None:
-                        att_masks = _make_sliding_window_bias(
-                            attn_metadata.seq_lens, self.sliding_window,
-                            query.dtype)  # type: ignore
-                    else:
-                        att_masks = _make_sliding_window_bias(
-                            attn_metadata.seq_lens, None, dtype=query.dtype)
-                    attn_metadata.attn_bias = att_masks
-                output = torch.empty(
-                    (num_tokens, self.num_heads, self.head_size),
-                    dtype=query.dtype,
-                    device=query.device)
-                ipex_ops.varlen_attention(
-                    query,
-                    key,
-                    value,
-                    output,
-                    attn_metadata.seqlen_q,
-                    attn_metadata.seqlen_q,
-                    self.alibi_slopes,
-                    attn_metadata.max_seqlen,
-                    attn_metadata.max_seqlen,
-                    pdropout=0.0,
-                    softmax_scale=self.scale,
-                    zero_tensors=False,
-                    is_causal=True,
-                    return_softmax=False,
-                    gen_=None,
-                    window_size_left=-1,
-                    window_size_right=-1,
-                    logits_soft_cap=self.logits_soft_cap,
-                )
-            else:
-                # prefix-enabled attention
-                raise RuntimeError(
-                    "IPEX backend doesn't support prefix decoding.")
-        else:
-            # Decoding run.
-            max_seq_len = attn_metadata.max_decode_seq_len
-            output = torch.empty_like(query)
-            block_size = value_cache.shape[3]
-            num_seqs, num_heads, head_size = query.shape
-            max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
-                                  _PARTITION_SIZE)
-            # NOTE(woosuk): We use a simple heuristic to decide whether to use
-            # PagedAttention V1 or V2. If the number of partitions is 1, we use
-            # V1 to avoid the overhead of reduction. Also, if the number of
-            # sequences or heads is large, we use V1 since there is enough work
-            # to parallelize.
-            # TODO(woosuk): Tune this heuristic.
-            # For context len > 8192, use V2 kernel to avoid shared memory
-            # shortage.
-            use_v1 = (max_seq_len <= 8192 and
-                      (max_num_partitions == 1 or num_seqs * num_heads > 512))
-            if use_v1:
-                # Run PagedAttention V1.
-                ipex_ops.paged_attention_v1(
-                    output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    self.num_kv_heads,
-                    self.scale,
-                    attn_metadata.block_tables,
-                    attn_metadata.seq_lens_tensor,
-                    block_size,
-                    max_seq_len,
-                    self.alibi_slopes,
-                    self.kv_cache_dtype,
-                    layer._k_scale_float,
-                    layer._v_scale_float,
-                )
-            else:
-                # Run PagedAttention V2.
-                assert _PARTITION_SIZE % block_size == 0
-                tmp_output = torch.empty(
-                    size=(num_seqs, num_heads, max_num_partitions, head_size),
-                    dtype=output.dtype,
-                    device=output.device,
-                )
-                exp_sums = torch.empty(
-                    size=(num_seqs, num_heads, max_num_partitions),
-                    dtype=torch.float32,
-                    device=output.device,
-                )
-                max_logits = torch.empty_like(exp_sums)
-                ipex_ops.paged_attention_v2(
-                    output,
-                    exp_sums,
-                    max_logits,
-                    tmp_output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    self.num_kv_heads,
-                    self.scale,
-                    attn_metadata.block_tables,
-                    attn_metadata.seq_lens_tensor,
-                    block_size,
-                    max_seq_len,
-                    self.alibi_slopes,
-                    self.kv_cache_dtype,
-                    layer._k_scale_float,
-                    layer._v_scale_float,
-                )
-            # Reshape the output tensor.
-        return output.view(-1, self.num_heads * self.head_size)
-def _make_alibi_bias(
-    alibi_slopes: torch.Tensor,
-    dtype: torch.dtype,
-    seq_lens: List[int],
-) -> List[torch.Tensor]:
-    attn_biases = []
-    for seq_len in seq_lens:
-        bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
-        # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(seq_len, 1)`
-        # here. We find that both biases give the same results, but
-        # the bias below more accurately follows the original ALiBi
-        # paper.
-        bias = bias[None, :] - bias[:, None]
-        num_heads = alibi_slopes.shape[0]
-        bias = bias[None, :].repeat((num_heads, 1, 1))
-        bias.mul_(alibi_slopes[:, None, None])
-        inf_mask = torch.empty(
-            (1, seq_len, seq_len),
-            dtype=bias.dtype,
-            device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1)
-        attn_biases.append((bias + inf_mask).to(dtype))
-    return attn_biases
-def _make_sliding_window_bias(
-    seq_lens: List[int],
-    window_size: Optional[int],
-    dtype: torch.dtype,
-) -> List[torch.Tensor]:
-    attn_biases = []
-    for seq_len in seq_lens:
-        tensor = torch.full(
-            (1, seq_len, seq_len),
-            dtype=dtype,
-            fill_value=1,
-        )
-        shift = 0
-        mask = torch.tril(tensor, diagonal=shift).to(dtype)  # type: ignore
-        if window_size is not None:
-            mask = torch.triu(mask, diagonal=shift - window_size + 1)
-        mask = torch.log(mask)
-        attn_biases.append(mask.to(dtype))
-    return attn_biases
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-# MLA Common Components
-This file implements common components for MLA implementations.
-First we define:
-Sq      as Q sequence length
-Skv     as KV sequence length
-MLA has two possible ways of computing, a data-movement friendly approach and a
-compute friendly approach, we generally want to use the compute friendly
-approach for "prefill" (i.e. the ratio Sq / Skv is "small", is near 1)
-and the data-movement friendly approach for "decode" (i.e. the ratio
-Sq / Skv is "large").
-NOTE what we deem small and large is currently determined by if its labelled
-prefill or decode by the scheduler, but this is something we should probably
-tune.
-Main reference: DeepseekV2 paper, and FlashInfer Implementation
-(https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
-Deepseek's MLA attention works the following way:
-* Use a single latent vector to represent the per-token entry of the KV cache.
-* For decode (i.e. the memory friendly approach) the attention "simulates" a
-multi-head attention, while the compute is similar to multi-query attention.
-Below is example of both paths assuming batchsize = 1
-## More Extent Definitions:
-C           Context length, `Skv - Sq`
-H           hidden size
-N           number of attention heads
-Lq          latent dimension for Q              1536 in DSV3
-Lkv         latent dimension for K/V            512 in DSV3
-P           nope dimension, no rope.            128 in DSV3
-R           rope dimension, goes through rope.  64 in DSV3
-V           V head dim.                         128 in DSV3
-## Vector/Matrix Definitions
-h_t         hidden states (input to attention)  shape [Sq, H]
-q_c         latent/compressed Q                 shape [Sq, Lq]
-q_nope      uncompressed Q (no-rope)            shape [Sq, N, P]
-q_pe        uncompressed Q (rope)               shape [Sq, N, R]
-kv_c        latent/compressed KV                shape [Skv, Lkv]
-k_pe        decoupled k position embeddings     shape [Skv, R]
-new_kv_c    new kv_c from current iter          shape [Sq, Lkv]
-new_k_pe    new k_pe from current iter          shape [Sq, R]
-cache_kv_c  cached k_c from previous iters      shape [C, Lkv]
-cache_k_pe  cached k_pe from previous iters     shape [C, R]
-W_DQ        project h_t to q_c                  shape [H, Lq]
-W_UQ        project q_c to q_nope               shape [Lq, N * P]
-W_QR        project q_c to q_pe                 shape [Lq, N * R]
-W_DKV       project h_t to kv_c                 shape [H, Lkv]
-W_UK        project kv_c to k_nope              shape [Lkv, N, P]
-W_KR        project h_t to k_pe                 shape [H, R]
-W_UV        project kv_c to v                   shape [Lkv, N, V]
-W_O         project v to h_t                    shape [N * V, H]
-## Compute Friendly Approach (i.e. "_forward_prefill"):
-q_c      = h_t @ W_DQ
-q_nope   = (q_c @ W_UQ).view(Sq, N, P)
-q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c = h_t @ W_DKV
-new_k_pe = RoPE(h_t @ W_KR)
-kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
-k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
-k_nope   = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P)
-v        = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V)
-// MHA with QK headdim = P + R
-//           V headdim = V
-//      spda_o shape [Sq, N, V]
-spda_o = scaled_dot_product_attention(
-    torch.cat([q_nope, q_pe], dim=-1),
-    torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
-    v
-) 
-return spda_o @ W_O
-NOTE: in the actual code, 
-    `kv_b_proj` is [W_UK; W_UV] concatenated per head
-    `q_b_proj` is [W_UQ; W_QR] concatenated per head
-    `out_proj` is W_O
-## Data-Movement Friendly Approach (i.e. "_forward_decode"):
-Runtime
-q_c      = h_t @ W_DQ
-q_nope   = (q_c @ W_UQ).view(-1, N, P)
-ql_nope  = einsum("snh,lnh->snl", q, W_UK)
-q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c = h_t @ W_DKV
-new_k_pe = RoPE(h_t @ W_KR)
-kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
-k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
-// MQA with QK headdim = Lkv + R
-//           V headdim = Lkv
-//      spda_o shape [Sq, N, Lkv]
-// NOTE: this is less compute-friendly since Lkv > P
-//       but is more data-movement friendly since its MQA vs MHA
-spda_o = scaled_dot_product_attention(
-    torch.cat([ql_nope, q_pe], dim=-1),
-    torch.cat([kv_c, k_pe], dim=-1),
-    kv_c
-)
-o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV)
-return o.view(-1, N * V) @ self.num_heads @ W_O
-## Chunked Prefill
-For chunked prefill we want to use the compute friendly algorithm. We are 
-assuming sufficiently large Sq / Skv ratio, in the future may want to switch to 
-the data-movement friendly approach if the chunk (i.e. `Sq`) is small.
-However, the compute-friendly approach can potentially run out of memory if Skv
-is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)`
-To mitigate this, we chunk the computation of attention with respect to the 
-current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a 
-fixed workspace size.
-The chunked prefill approach is as follows:
-MCC        Max chunk of context to process per iter, computed dynamically, 
-           used to bound the memory usage
-q_c        = h_t @ W_DQ
-q_nope     = (q_c @ W_UQ).view(Sq, N, P)
-q_pe       = RoPE(q_c @ W_QR).view(Sq, N, R)
-new_kv_c   = h_t @ W_DKV
-new_k_pe   = RoPE(h_t @ W_KR)
-new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P)
-new_v      = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V)
-// MHA between queries and new KV
-//     with QK headdim = P + R
-//           V headdim = V
-//    curr_o   shape [Sq, N, V]
-//    curr_lse shape [N, Sq], this is just order FA returns
-curr_o, curr_lse = scaled_dot_product_attention(
-    torch.cat([q_nope, q_pe], dim=-1),
-    torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
-    new_v,
-    casual=True,
-    return_softmax_lse=True
-) 
-// Compute attention with the already existing context
-for chunk_idx in range(cdiv(C, MCC)):
-    chunk_start  = chunk_idx * MCC
-    chunk_end    = min(chunk_start + MCC, C)
-    Sc           = chunk_end - chunk_start
-    cache_kv_c_chunk   = cache_kv_c[chunk_start:chunk_end]
-    cache_k_pe_chunk   = cache_k_pe[chunk_start:chunk_end]
-    cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P)
-    cache_v_chunk      = (cache_kv_c_chunk @ W_UV).view(-1, N, V)
-    chunk_o, chunk_lse = scaled_dot_product_attention(
-        torch.cat([q_nope, q_pe], dim=-1),
-        torch.cat([cache_k_nope_chunk,
-                   cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)],
-                   dim=-1),
-        cache_v_chunk,
-        casual=False,
-        return_softmax_lse=True
-    )
-    curr_o, curr_lse = merge_attn_states(
-        suffix_output=curr_o,
-        suffix_lse=curr_lse,
-        prefix_output=chunk_o,
-        prefix_lse=chunk_lse,
-    )
-return curr_o @ W_O
-"""
-import os
-import functools
-from abc import abstractmethod
-from collections import defaultdict
-from contextlib import contextmanager
-from dataclasses import dataclass
-from itertools import accumulate
-from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
-                    Type, TypeVar)
-import torch
-from vllm import _custom_ops as ops
-from vllm import envs
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
-                                              AttentionMetadataBuilder,
-                                              AttentionState, MLAAttentionImpl)
-from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
-                                           compute_slot_mapping_start_idx,
-                                           is_block_tables_empty)
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import get_flash_attn_version
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase,
-                                               UnquantizedLinearMethod)
-from vllm.multimodal import MultiModalPlaceholderMap
-from vllm.platforms import current_platform
-from vllm.triton_utils import HAS_TRITON
-from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
-if HAS_TRITON:
-    from vllm.attention.ops.triton_flash_attention import triton_attention
-else:
-    triton_attention = None
-try:
-    from vllm.vllm_flash_attn import flash_attn_varlen_func
-    is_vllm_fa = True
-except ImportError:
-    is_vllm_fa = False
-    try:
-        # For rocm use upstream flash attention
-        from flash_attn import flash_attn_varlen_func
-    except ImportError:
-        flash_attn_varlen_func = None
-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-is_hip = current_platform.is_rocm()
-class MLACommonBackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "TRITON_MLA"
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return MLACommonMetadata
-    @staticmethod
-    def get_builder_cls() -> Type["MLACommonMetadataBuilder"]:
-        return MLACommonMetadataBuilder
-    @staticmethod
-    def get_state_cls() -> Type["MLACommonState"]:
-        return MLACommonState
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,  # assumed to be 1 for MLA
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (num_blocks, block_size, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        ops.copy_blocks_mla(kv_caches, src_to_dists)
-    @staticmethod
-    def get_supported_head_sizes() -> List[int]:
-        return [576]
-T = TypeVar("T", bound="MLACommonMetadata")
-class MLACommonState(AttentionState, Generic[T]):
-    def __init__(self, runner):
-        self.runner = runner
-        self._is_graph_capturing = False
-        scheduler_config = runner.scheduler_config
-        self.model_config = runner.model_config
-        cache_config = runner.cache_config
-        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
-        self.enable_prefix_caching = cache_config.enable_prefix_caching
-        if self.chunked_prefill_enabled or self.enable_prefix_caching:
-            self.context_chunk_workspace_size = min(
-                # Max sure there is enough for 8 full length request or at least
-                # 4 pages of cache per request
-                max(
-                    8 * self.model_config.max_model_len, 4 *
-                    scheduler_config.max_num_seqs * cache_config.block_size),
-                # For long-context models try not to over-allocate limiting
-                # kv-cache space, limiting it to 64k tokens,
-                # which would result in the workspace being:
-                #   2*(576)*(64*1024) = 144mb
-                # (assuming 576 MLA head dim, and fp16)
-                # which would result in up-projected context being
-                #   2*(192*128)*(64*1024) = 3gb
-                # (assuming 192 QK head dim, 128 heads, and fp16)
-                128 * 1024)
-            assert self.context_chunk_workspace_size >= \
-                scheduler_config.max_num_seqs * cache_config.block_size
-    @contextmanager
-    def graph_capture(self, max_batch_size: int):
-        self._is_graph_capturing = True
-        self._graph_slot_mapping = torch.full((max_batch_size, ),
-                                              PAD_SLOT_ID,
-                                              dtype=torch.long,
-                                              device=self.runner.device)
-        self._graph_seq_lens = torch.ones(max_batch_size,
-                                          dtype=torch.int32,
-                                          device=self.runner.device)
-        self._graph_block_tables = torch.from_numpy(
-            self.runner.graph_block_tables).to(device=self.runner.device)
-        self._positions = torch.zeros((max_batch_size, ),
-                                      dtype=torch.long,
-                                      device=self.runner.device)
-        yield
-        self._is_graph_capturing = False
-        del self._graph_slot_mapping
-        del self._graph_seq_lens
-        del self._graph_block_tables
-        del self._positions
-    def graph_clone(self, batch_size: int):
-        assert self._is_graph_capturing
-        return self.__class__(self.runner)
-    def graph_capture_get_metadata_for_batch(
-            self,
-            batch_size: int,
-            is_encoder_decoder_model: bool = False) -> T:
-        assert self._is_graph_capturing
-        attn_metadata = self.runner.attn_backend.make_metadata(
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=False,
-            use_cuda_graph=True,
-            num_prefills=0,
-            num_prefill_tokens=0,
-            num_decode_tokens=batch_size,
-            slot_mapping=self._graph_slot_mapping[:batch_size],
-            seq_lens=None,
-            seq_lens_tensor=self._graph_seq_lens[:batch_size],
-            max_query_len=1,
-            max_decode_query_len=1,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=self.runner.max_seq_len_to_capture,
-            query_start_loc=None,
-            seq_start_loc=None,
-            context_lens_tensor=None,
-            block_tables=self._graph_block_tables[:batch_size],
-            head_dim=self.runner.model_config.get_head_size())
-        if is_encoder_decoder_model:
-            raise NotImplementedError(
-                "MLACommonState does not support encoder/decoder yet")
-        return attn_metadata
-    def get_graph_input_buffers(self,
-                                attn_metadata,
-                                is_encoder_decoder_model: bool = False):
-        input_buffers = {
-            "slot_mapping": attn_metadata.slot_mapping,
-            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
-            "block_tables": attn_metadata.decode_metadata.block_tables,
-        }
-        if is_encoder_decoder_model:
-            raise NotImplementedError(
-                "MLACommonState does not support encoder/decoder yet")
-        return input_buffers
-    def prepare_graph_input_buffers(self,
-                                    input_buffers,
-                                    attn_metadata,
-                                    is_encoder_decoder_model: bool = False):
-        input_buffers["seq_lens_tensor"].copy_(
-            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
-        input_buffers["block_tables"].copy_(
-            attn_metadata.decode_metadata.block_tables, non_blocking=True)
-        if is_encoder_decoder_model:
-            raise NotImplementedError(
-                "TritonMLAState does not support encoder/decoder yet")
-    def begin_forward(self, model_input):
-        if self.chunked_prefill_enabled or self.enable_prefix_caching:
-            if not hasattr(self, "context_chunk_workspace"):
-                # not self.runner.device does not return the correct device
-                # for this process, (init_device sets the correct device but
-                # only on the Worker). The only way Ive figured out to get the
-                # correct device is to allocate the workspace on the first call
-                # to begin_forward and use the device of the input tokens
-                assert model_input.input_tokens is not None
-                self.context_chunk_workspace = torch.empty(
-                    (self.context_chunk_workspace_size,
-                     self.model_config.get_head_size()),
-                    dtype=self.model_config.dtype,
-                    device=model_input.input_tokens.device,
-                )
-            model_input.attn_metadata.context_chunk_workspace = \
-                self.context_chunk_workspace
-@dataclass
-class MLACommonMetadata(AttentionMetadata):
-    """Metadata for MLACommon. 
-    NOTE: Please read the comment at the top of the file before trying to 
-    understand this class
-    NOTE: Any python object stored here is not updated when it is
-    cuda-graph replayed. If you have values that need to be changed
-    dynamically, it should be stored in tensor. The tensor has to be
-    updated from `CUDAGraphRunner.forward` API.
-    """
-    # Whether or not if cuda graph is enabled.
-    # Cuda-graph is currently enabled for decoding only.
-    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
-    use_cuda_graph: bool
-    # NOTE(sang): Definition of context_len, query_len, and seq_len.
-    # |---------- N-1 iteration --------|
-    # |---------------- N iteration ---------------------|
-    # |- tokenA -|......................|-- newTokens ---|
-    # |---------- context_len ----------|
-    # |-------------------- seq_len ---------------------|
-    #                                   |-- query_len ---|
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens None if it is a decoding.
-    seq_lens: Optional[List[int]]
-    # seq_lens stored as a tensor.
-    seq_lens_tensor: Optional[torch.Tensor]
-    # Maximum sequence length among prefill batch. 0 if there are decoding
-    # requests only.
-    max_prefill_seq_len: int
-    # Maximum sequence length among decode batch. 0 if there are prefill
-    # requests only.
-    max_decode_seq_len: int
-    # (batch_size,) A tensor of context lengths (tokens that are computed
-    # so far).
-    context_lens_tensor: Optional[torch.Tensor]
-    # (batch_size, max_blocks_per_seq).
-    # Block addresses per sequence. (Seq id -> list of physical block)
-    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
-    # in the kv cache. Each block can contain up to block_size tokens.
-    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
-    # captured.
-    block_tables: Optional[torch.Tensor]
-    # Maximum query length in the batch.
-    max_query_len: Optional[int] = None
-    # Max number of query tokens among request in the batch.
-    max_decode_query_len: Optional[int] = None
-    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
-    # the batch, used to index into subquery. E.g., if the subquery length
-    # is [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor] = None
-    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
-    # the batch, used to index into sequence. E.g., if the sequence length is
-    # [4, 6], it is [0, 4, 10].
-    seq_start_loc: Optional[torch.Tensor] = None
-    _cached_prefill_metadata: Optional[Any] = None
-    _cached_decode_metadata: Optional[Any] = None
-    num_prefill_tokens: int
-    # The dimension of the attention heads
-    head_dim: Optional[int] = None
-    # Used when chunked prefill is enabled to simulate worst case workspace
-    # allocations, hopefully to avoid going OOM
-    is_profile_run: bool = False
-    # New for MLA (compared to FlashAttention)
-    # For chunked prefill
-    context_chunk_cu_seq_lens: Optional[torch.Tensor] = None
-    context_chunk_starts: Optional[torch.Tensor] = None
-    context_chunk_seq_tot: Optional[List[int]] = None
-    context_chunk_max_seq_lens: Optional[List[int]] = None
-    # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted
-    context_chunk_workspace: Optional[torch.Tensor] = None
-    def __post_init__(self):
-        supported_head_sizes = MLACommonBackend.get_supported_head_sizes()
-        if self.head_dim is not None and self.head_dim \
-                not in supported_head_sizes:
-            raise ValueError(
-                f"Only {supported_head_sizes} are supported for head_dim,",
-                f" received {self.head_dim}.")
-    @property
-    def prefill_metadata(self):
-        if self.num_prefills == 0:
-            return None
-        if self._cached_prefill_metadata is not None:
-            return self._cached_prefill_metadata
-        assert self.seq_lens is not None
-        assert self.seq_lens_tensor is not None
-        # Compute some attn_metadata fields which default to None
-        query_start_loc = (None if self.query_start_loc is None else
-                           self.query_start_loc[:self.num_prefills + 1])
-        slot_mapping = (None if self.slot_mapping is None else
-                        self.slot_mapping[:self.num_prefill_tokens])
-        seq_lens = (None if self.seq_lens is None else
-                    self.seq_lens[:self.num_prefills])
-        seq_lens_tensor = (None if self.seq_lens_tensor is None else
-                           self.seq_lens_tensor[:self.num_prefills])
-        seq_start_loc = (None if self.seq_start_loc is None else
-                         self.seq_start_loc[:self.num_prefills + 1])
-        context_lens_tensor = (None if self.context_lens_tensor is None else
-                               self.context_lens_tensor[:self.num_prefills])
-        block_tables = (None if self.block_tables is None else
-                        self.block_tables[:self.num_prefills])
-        self._cached_prefill_metadata = self.__class__(
-            # Required by ModelRunner
-            use_cuda_graph=False,  # Not Attention Related
-            # Required by Attention Metadata
-            num_prefills=self.num_prefills,
-            num_prefill_tokens=self.num_prefill_tokens,
-            num_decode_tokens=0,
-            slot_mapping=slot_mapping,
-            # Required by Attention Metadata (not used)
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=False,
-            # MLACommonMetadata
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=self.max_query_len,
-            max_prefill_seq_len=self.max_prefill_seq_len,
-            max_decode_query_len=0,
-            max_decode_seq_len=0,
-            query_start_loc=query_start_loc,
-            seq_start_loc=seq_start_loc,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=block_tables,
-            head_dim=self.head_dim,
-            is_profile_run=self.is_profile_run,
-            # MLACommonMetadata Chunk prefill specific
-            context_chunk_cu_seq_lens=self.context_chunk_cu_seq_lens,
-            context_chunk_starts=self.context_chunk_starts,
-            context_chunk_seq_tot=self.context_chunk_seq_tot,
-            context_chunk_max_seq_lens=self.context_chunk_max_seq_lens,
-        )
-        return self._cached_prefill_metadata
-    @property
-    def decode_metadata(self):
-        if self.num_decode_tokens == 0:
-            return None
-        if self._cached_decode_metadata is not None:
-            return self._cached_decode_metadata
-        assert self.seq_lens_tensor is not None
-        # Compute some attn_metadata fields which default to None
-        slot_mapping = (None if self.slot_mapping is None else
-                        self.slot_mapping[self.num_prefill_tokens:])
-        seq_lens_tensor = (None if self.seq_lens_tensor is None else
-                           self.seq_lens_tensor[self.num_prefills:])
-        block_tables = (None if self.block_tables is None else
-                        self.block_tables[self.num_prefills:])
-        self._cached_decode_metadata = self.__class__(
-            # Required by ModelRunner
-            use_cuda_graph=self.use_cuda_graph,  # Not Attention Related
-            # Required by Attention Metadata
-            num_prefills=0,
-            num_prefill_tokens=0,
-            num_decode_tokens=self.num_decode_tokens,
-            slot_mapping=slot_mapping,
-            # Required by Attention Metadata (not used)
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=False,
-            # MLACommonMetadata
-            seq_lens=None,
-            seq_lens_tensor=seq_lens_tensor,
-            max_decode_query_len=self.max_decode_query_len,
-            max_query_len=self.max_query_len,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=self.max_decode_seq_len,
-            # Batch may be composed of prefill|decodes, adjust query start
-            # indices to refer to the start of decodes. E.g.
-            # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
-            query_start_loc=(self.query_start_loc[self.num_prefills:] -
-                             self.query_start_loc[self.num_prefills])
-            if self.query_start_loc is not None else None,
-            seq_start_loc=self.seq_start_loc[self.num_prefills:]
-            if self.seq_start_loc is not None else None,
-            context_lens_tensor=None,
-            block_tables=block_tables,
-            head_dim=self.head_dim,
-            is_profile_run=self.is_profile_run)
-        return self._cached_decode_metadata
-class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
-    """
-    NOTE: Please read the comment at the top of the file before trying to 
-    understand this class
-    """
-    BLOCK_TABLE_EXTENDER: list[list[int]] = []
-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
-        self.input_builder = input_builder
-        self.runner = input_builder.runner
-        self.sliding_window = input_builder.sliding_window
-        self.block_size = input_builder.block_size
-        self.chunked_prefill_enabled = \
-            self.runner.scheduler_config.chunked_prefill_enabled
-        self.enable_prefix_caching = \
-            self.runner.cache_config.enable_prefix_caching
-        if self.chunked_prefill_enabled or self.enable_prefix_caching:
-            attn_state = self.input_builder.runner.attn_state
-            self.context_chunk_workspace_size = \
-                attn_state.context_chunk_workspace_size
-            self.page_size = self.runner.block_size
-    def prepare(self):
-        self.slot_mapping: List[int] = []
-        self.prefill_seq_lens: List[int] = []
-        self.context_lens: List[int] = []
-        self.block_tables: List[List[int]] = []
-        self.curr_seq_lens: List[int] = []
-        self.multimodal_placeholder_maps: Dict[
-            str,
-            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
-        self.num_prefills = 0
-        self.num_prefill_tokens = 0
-        self.num_decode_tokens = 0
-        self.has_prefix_cache_hit = False
-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
-        """Add a sequence group to the metadata. Specifically update/append
-        1. context length.
-        2. block table.
-        3. slot mapping.
-        """
-        is_prompt = inter_data.is_prompt
-        block_tables = inter_data.block_tables
-        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
-             curr_sliding_window_block) in zip(
-                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
-                 inter_data.orig_seq_lens, inter_data.seq_lens,
-                 inter_data.query_lens, inter_data.context_lens,
-                 inter_data.curr_sliding_window_blocks):
-            self.context_lens.append(context_len)
-            if is_prompt:
-                self.num_prefills += 1
-                self.num_prefill_tokens += token_len
-                self.prefill_seq_lens.append(seq_len)
-            else:
-                self.num_decode_tokens += query_len
-                self.curr_seq_lens.append(curr_seq_len)
-            # Compute block table.
-            # TODO(sang): Combine chunked prefill and prefix caching by
-            # only allowing multiple of block_size chunk size.
-            # NOTE: This only works for oooooooxxx style attention.
-            block_table = []
-            if prefix_cache_hit:
-                # NOTE(woosuk): For flash-attn, the block table should
-                # include the entries for the incoming prefill tokens.
-                block_table = block_tables[seq_id]
-            elif ((chunked_prefill_enabled or not is_prompt)
-                  and block_tables is not None):
-                if curr_sliding_window_block == 0:
-                    block_table = block_tables[seq_id]
-                else:
-                    block_table = block_tables[seq_id][
-                        -curr_sliding_window_block:]
-            self.block_tables.append(block_table)
-            # Compute slot mapping.
-            is_profile_run = is_block_tables_empty(block_tables)
-            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
-                                                       context_len,
-                                                       self.sliding_window)
-            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
-                                 seq_len, context_len, start_idx,
-                                 self.block_size, inter_data.block_tables)
-    def _get_graph_runner_block_tables(
-            self, num_seqs: int,
-            block_tables: List[List[int]]) -> torch.Tensor:
-        # The shape of graph_block_tables is
-        # [max batch size, max context len // block size].
-        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
-        assert max_batch_size >= num_seqs
-        graph_block_tables = self.runner.graph_block_tables[:num_seqs]
-        for i, block_table in enumerate(block_tables):
-            if block_table:
-                num_blocks = len(block_table)
-                if num_blocks <= max_blocks:
-                    graph_block_tables[i, :num_blocks] = block_table
-                else:
-                    # It may be possible to have more blocks allocated due
-                    # to lookahead slots of multi-step, however, they are
-                    # not used anyway, so can be safely ignored.
-                    graph_block_tables[
-                        i, :max_blocks] = block_table[:max_blocks]
-        return torch.from_numpy(graph_block_tables).to(
-            device=self.runner.device, non_blocking=True)
-    def build(self, seq_lens: List[int], query_lens: List[int],
-              cuda_graph_pad_size: int, batch_size: int):
-        """Build attention metadata with on-device tensors.
-        Args:
-            seq_lens: The maybe padded sequence lengths of the input sequences.
-            query_lens: The query lengths of the input sequences.
-            cuda_graph_pad_size: The padding size for cuda graph.
-                                 -1 if cuda graph is not used.
-            batch_size: The maybe padded batch size.
-        """
-        prefix_cache_hit = any([
-            inter_data.prefix_cache_hit
-            for inter_data in self.input_builder.inter_data_list
-        ])
-        for inter_data in self.input_builder.inter_data_list:
-            self._add_seq_group(inter_data,
-                                self.input_builder.chunked_prefill_enabled,
-                                prefix_cache_hit)
-        device = self.runner.device
-        use_captured_graph = cuda_graph_pad_size != -1
-        max_query_len = max(query_lens)
-        decode_query_lens = query_lens[self.num_prefills:]
-        if len(decode_query_lens) > 0:
-            max_decode_query_len = max(decode_query_lens)
-        else:
-            max_decode_query_len = 1
-        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
-        max_decode_seq_len = max(self.curr_seq_lens, default=0)
-        num_decode_tokens = self.num_decode_tokens
-        query_start_loc = list(accumulate(query_lens, initial=0))
-        seq_start_loc = list(accumulate(seq_lens, initial=0))
-        num_seqs = len(seq_lens)
-        if use_captured_graph:
-            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
-            self.block_tables.extend(self.__class__.BLOCK_TABLE_EXTENDER *
-                                     cuda_graph_pad_size)
-            num_decode_tokens = batch_size - self.num_prefill_tokens
-            block_tables = self._get_graph_runner_block_tables(
-                num_seqs, self.block_tables)
-        else:
-            block_tables = make_tensor_with_pad(
-                self.block_tables,
-                pad=0,
-                dtype=torch.int,
-                device=device,
-            )
-        assert max_query_len > 0, ("query_lens: {}".format(query_lens))
-        assert device is not None
-        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
-                                               device, self.runner.pin_memory)
-        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
-                                           self.runner.pin_memory)
-        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
-                                               device, self.runner.pin_memory)
-        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
-                                                  device,
-                                                  self.runner.pin_memory)
-        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
-                                                device, self.runner.pin_memory)
-        context_chunk_cu_seq_lens = None
-        context_chunk_starts = None
-        context_chunk_seq_tot = None
-        context_chunk_max_seq_lens = None
-        if (self.chunked_prefill_enabled or self.enable_prefix_caching) \
-            and self.num_prefills > 0 \
-            and context_lens_tensor is not None \
-            and context_lens_tensor[:self.num_prefills].max() > 0:
-            # NOTE: it is recommended you read the `Chunked Prefill` section in
-            # the comment at the top of the file before trying to understand
-            # the following code
-            num_prefills_with_context = \
-                (context_lens_tensor[:self.num_prefills] > 0).sum().item()
-            # currently we allocate an equal amount of workspace for each
-            # prefill in the batch, we could probably use a more advanced
-            # algorithm here and allocate more workspace to prefills with
-            # longer context lengths
-            max_context_chunk = \
-                self.context_chunk_workspace_size // num_prefills_with_context
-            # align max_context_chunk to page_size by rounding down,
-            # currently the `gather_and_maybe_dequant_cache` kernel cannot
-            # handle `context_chunk_starts` that are not aligned to page_size
-            max_context_chunk = round_down(max_context_chunk, self.page_size)
-            assert max_context_chunk > 0
-            num_chunks = cdiv(context_lens_tensor.max(), max_context_chunk)
-            # if `max_context_chunk = 256`, `num_chunks = 3`, and
-            #   `num_prefills_with_context = 4`, create a tensor that looks like
-            #  [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]
-            context_chunk_starts = \
-                torch.arange(num_chunks, device=device, dtype=torch.int32)\
-                .unsqueeze(1).expand(-1, self.num_prefills)\
-                * max_context_chunk
-            chunk_ends = torch.min(context_lens_tensor[:self.num_prefills]\
-                .unsqueeze(0), context_chunk_starts + max_context_chunk)
-            chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0)
-            _context_chunk_cu_seq_lens = chunk_seq_lens.cumsum(dim=1).to(
-                torch.int32)
-            zero = torch.zeros(num_chunks, dtype=torch.int32, device=device)\
-                .unsqueeze(-1)
-            context_chunk_cu_seq_lens = \
-                torch.cat([zero, _context_chunk_cu_seq_lens], dim=1)
-            context_chunk_max_seq_lens = \
-                chunk_seq_lens.max(dim=1).values.tolist()
-            context_chunk_seq_tot = chunk_seq_lens.sum(dim=1).tolist()
-            assert max(context_chunk_seq_tot) <= \
-                self.context_chunk_workspace_size
-        return self.runner.attn_backend.make_metadata(
-            # Required by ModelRunner
-            use_cuda_graph=use_captured_graph,  # Not Attention Related
-            # Required by Attention Metadata
-            num_prefills=self.num_prefills,
-            slot_mapping=slot_mapping_tensor,
-            num_prefill_tokens=self.num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            # Required by Attention Metadata (not used)
-            multi_modal_placeholder_index_maps=None,  # Not Attention Related
-            enable_kv_scales_calculation=False,
-            # MLACommonMetadata
-            seq_lens=seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=max_query_len,
-            max_decode_query_len=max_decode_query_len,
-            max_prefill_seq_len=max_prefill_seq_len,
-            max_decode_seq_len=max_decode_seq_len,
-            query_start_loc=query_start_loc_tensor,
-            seq_start_loc=seq_start_loc_tensor,
-            context_lens_tensor=context_lens_tensor,
-            block_tables=block_tables,
-            head_dim=self.runner.model_config.get_head_size(),
-            is_profile_run=self.runner.in_profile_run,
-            # MLACommonMetadata Chunk prefill specific
-            context_chunk_cu_seq_lens=context_chunk_cu_seq_lens,
-            context_chunk_starts=context_chunk_starts,
-            context_chunk_seq_tot=context_chunk_seq_tot,
-            context_chunk_max_seq_lens=context_chunk_max_seq_lens,
-        )
-class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
-    """
-    NOTE: Please read the comment at the top of the file before trying to 
-    understand this class
-    """
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
-        attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
-        # MLA Specific Arguments
-        q_lora_rank: Optional[int],
-        kv_lora_rank: int,
-        qk_nope_head_dim: int,
-        qk_rope_head_dim: int,
-        qk_head_dim: int,
-        v_head_dim: int,
-        kv_b_proj: ColumnParallelLinear,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing not supported in V0.")
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        self.kv_cache_dtype = kv_cache_dtype
-        self.q_lora_rank = q_lora_rank
-        self.kv_lora_rank = kv_lora_rank
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.qk_head_dim = qk_head_dim
-        self.v_head_dim = v_head_dim
-        self.kv_b_proj = kv_b_proj
-        self.triton_fa_func = triton_attention
-        # Handle the differences between the flash_attn_varlen from flash_attn
-        # and the one from vllm_flash_attn. The former is used on RoCM and the
-        # latter has an additional parameter to control FA2 vs FA3
-        self.flash_attn_varlen_func = flash_attn_varlen_func
-        # self.vllm_flash_attn_version = get_flash_attn_version()
-        # if self.vllm_flash_attn_version is not None:
-        #     self.flash_attn_varlen_func = \
-        #         functools.partial(flash_attn_varlen_func,
-        #                           fa_version=self.vllm_flash_attn_version)
-        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-        # For MLA the v head dim is smaller than qk head dim so we pad out
-        # v with 0s to match the qk head dim for attention backends that do
-        # not support different headdims
-        # We don't need to pad V if we are on a hopper system with FA3
-        self._pad_v = self.vllm_flash_attn_version is None or not (
-            self.vllm_flash_attn_version == 3
-            and current_platform.get_device_capability()[0] == 9)
-    def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale,
-                                         return_softmax_lse, **kwargs):
-        maybe_padded_v = v
-        if self._pad_v:
-            maybe_padded_v = torch.nn.functional.pad(
-                v, [0, q.shape[-1] - v.shape[-1]], value=0)
-        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN \
-            and not return_softmax_lse:
-            attn_out = self.triton_fa_func(
-                q,
-                k,
-                maybe_padded_v,
-                None,  # output
-                kwargs["cu_seqlens_q"],
-                kwargs["cu_seqlens_k"],
-                kwargs["max_seqlen_q"],
-                kwargs["max_seqlen_k"],
-                kwargs["causal"],
-                softmax_scale,
-                None,  # bias
-            )
-        elif is_vllm_fa:
-            attn_out = self.flash_attn_varlen_func(
-                q=q,
-                k=k,
-                v=maybe_padded_v,
-                return_softmax_lse=return_softmax_lse,
-                softmax_scale=softmax_scale,
-                **kwargs,
-            )
-        else:
-            # Use return_attn_probs instead of return_softmax_lse for RoCM
-            attn_out = self.flash_attn_varlen_func(
-                q=q,
-                k=k,
-                v=maybe_padded_v,
-                return_attn_probs=return_softmax_lse,
-                softmax_scale=softmax_scale,
-                **kwargs,
-            )
-        # Unpack the output if there is multiple results,
-        # triton always returns (output, softmax_lse),
-        # vllm_flash_attn returns (output, softmax_lse) when
-        #  `return_softmax_lse = True`
-        # flash_attn (RoCM) returns (output, softmax_lse, ...) when
-        #  `return_attn_probs = True`
-        rest = None
-        if isinstance(attn_out, tuple):
-            attn_out, *rest = attn_out
-        # Remain consistent with old `flash_attn_varlen_func` where there
-        # is only one output tensor if `return_softmax_lse` is False.
-        if return_softmax_lse:
-            assert rest is not None
-            return attn_out, rest[0]
-        return attn_out
-    def _v_up_proj(self, x):
-        # Convert from (B, N, L) to (N, B, L)
-        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
-        # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
-        x = torch.bmm(x, self.W_UV)
-        # Convert from (N, B, V) to (B, N * V)
-        return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)
-    def process_weights_after_loading(self, act_dtype: torch.dtype):
-        def get_layer_weight(layer):
-            WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
-            for attr in WEIGHT_NAMES:
-                if hasattr(layer, attr):
-                    return getattr(layer, attr)
-            raise AttributeError(
-                f"Layer '{layer}' has no recognized weight attribute:"
-                f" {WEIGHT_NAMES}.")
-        def get_and_maybe_dequant_weights(layer: LinearBase):
-            if not isinstance(layer.quant_method, UnquantizedLinearMethod):
-                # NOTE: This should only be used offline, since it's O(N^3)
-                eye = torch.eye(layer.input_size_per_partition,
-                                dtype=act_dtype,
-                                device=get_layer_weight(layer).device)
-                dequant_weights = layer.quant_method.apply(layer,
-                                                           eye,
-                                                           bias=None)
-                del eye
-                # standardize to (output, input)
-                return dequant_weights.T
-            return layer.weight
-        # we currently do not have quantized bmm's which are needed for
-        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
-        # the bmm's in 16-bit, the extra memory overhead of this is fairly low
-        if self.use_llama_nn and self.kv_b_proj.quant_method is None:
-            kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj)
-            assert kv_b_proj_weight.shape == (
-                self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-                self.kv_lora_rank,), (
-                    f"{kv_b_proj_weight.shape=}, "
-                    f"{self.kv_lora_rank=}, "
-                    f"{self.num_heads=}, "
-                    f"{self.qk_nope_head_dim=}, "
-                    f"{self.v_head_dim=}")
-        else:
-            kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
-            assert kv_b_proj_weight.shape == (
-                self.kv_lora_rank,
-                self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
-                    f"{kv_b_proj_weight.shape=}, "
-                    f"{self.kv_lora_rank=}, "
-                    f"{self.num_heads=}, "
-                    f"{self.qk_nope_head_dim=}, "
-                    f"{self.v_head_dim=}")
-        kv_b_proj_weight = kv_b_proj_weight.view(
-            self.kv_lora_rank,
-            self.num_heads,
-            self.qk_nope_head_dim + self.v_head_dim,
-        )
-        W_UK, W_UV = kv_b_proj_weight.split(
-            [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-        # Convert from (L, N, V) to (N, L, V)
-        self.W_UV = W_UV.transpose(0, 1)
-        # Convert from (L, N, P) to (N, P, L)
-        self.W_UK_T = W_UK.permute(1, 2, 0)
-    def _compute_prefill_context(
-        self,
-        q: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
-        attn_metadata: MLACommonMetadata,
-        k_scale: torch.Tensor,
-    ):
-        prefill_metadata = attn_metadata.prefill_metadata
-        assert prefill_metadata is not None
-        assert prefill_metadata.context_chunk_seq_tot is not None
-        assert prefill_metadata.context_chunk_cu_seq_lens is not None
-        assert prefill_metadata.context_chunk_starts is not None
-        assert prefill_metadata.context_chunk_max_seq_lens is not None
-        assert prefill_metadata.context_lens_tensor is not None
-        output = None
-        iters = len(prefill_metadata.context_chunk_seq_tot)
-        # Fetch from attn_metadata directly, since it late bound by
-        # MLAAttentionState, grabbing it directly `attn_metadata` can avoid
-        # any weirdness around prefill_metadata caching
-        assert attn_metadata.context_chunk_workspace is not None
-        workspace = attn_metadata.context_chunk_workspace
-        for i in range(iters):
-            toks = prefill_metadata.context_chunk_seq_tot[i]
-            ops.gather_and_maybe_dequant_cache(
-                src_cache=kv_c_and_k_pe_cache,
-                dst=workspace,
-                block_table=prefill_metadata.block_tables,
-                cu_seq_lens=prefill_metadata.context_chunk_cu_seq_lens[i],
-                batch_size=prefill_metadata.num_prefills,
-                kv_cache_dtype=self.kv_cache_dtype,
-                scale=k_scale,
-                seq_starts=prefill_metadata.context_chunk_starts[i],
-            )
-            kv_c_normed = workspace[:toks]\
-                [..., :self.kv_lora_rank]
-            k_pe = workspace[:toks]\
-                [..., self.kv_lora_rank:].unsqueeze(1)
-            kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
-                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            k_nope, v = kv_nope\
-                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
-                          dim=-1)
-            attn_output, attn_softmax_lse = \
-                self._flash_attn_varlen_diff_headdims(
-                q=q,
-                k=k,
-                v=v,
-                cu_seqlens_q=prefill_metadata.query_start_loc,
-                cu_seqlens_k=prefill_metadata.context_chunk_cu_seq_lens[i],
-                max_seqlen_q=prefill_metadata.max_query_len,
-                max_seqlen_k=prefill_metadata.context_chunk_max_seq_lens[i],
-                softmax_scale=self.scale,
-                causal=False,  # Context is unmasked
-                return_softmax_lse=True,
-            )
-            if output is None:
-                output = attn_output
-                output_lse = attn_softmax_lse
-            else:
-                output_tmp = torch.empty_like(output)
-                output_lse_tmp = torch.empty_like(output_lse)
-                merge_attn_states(
-                    output=output_tmp,
-                    output_lse=output_lse_tmp,
-                    prefix_output=output,
-                    prefix_lse=output_lse,
-                    suffix_output=attn_output,
-                    suffix_lse=attn_softmax_lse,
-                )
-                output = output_tmp
-                output_lse = output_lse_tmp
-        return output, output_lse
-    def _forward_prefill(
-        self,
-        q: torch.Tensor,
-        kv_c_normed: torch.Tensor,
-        k_pe: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
-        attn_metadata: MLACommonMetadata,
-        k_scale: torch.Tensor,
-    ) -> torch.Tensor:
-        prefill_metadata = attn_metadata.prefill_metadata
-        assert prefill_metadata is not None
-        has_context = prefill_metadata.context_lens_tensor is not None \
-            and prefill_metadata.context_lens_tensor.max() > 0
-        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
-            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-        k_nope, v = kv_nope\
-            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-        k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
-        output = self._flash_attn_varlen_diff_headdims(
-            q=q,
-            k=k,
-            v=v,
-            cu_seqlens_q=prefill_metadata.query_start_loc,
-            cu_seqlens_k=prefill_metadata.query_start_loc,
-            max_seqlen_q=prefill_metadata.max_prefill_seq_len,
-            max_seqlen_k=prefill_metadata.max_prefill_seq_len,
-            softmax_scale=self.scale,
-            causal=True,
-            return_softmax_lse=has_context,
-        )
-        if has_context:
-            # ROCm flash_attn_varlen_func will return 3 objects instead of 2
-            suffix_output, suffix_lse = output
-            context_output, context_lse = self._compute_prefill_context( \
-                q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
-            output = torch.empty_like(suffix_output)
-            merge_attn_states(
-                output=output,
-                prefix_output=context_output,
-                prefix_lse=context_lse,
-                suffix_output=suffix_output,
-                suffix_lse=suffix_lse,
-            )
-        # unpad if necessary
-        if self._pad_v:
-            output = output[..., :v.shape[-1]]
-        return output.flatten(start_dim=-2)
-    @abstractmethod
-    def _forward_decode(
-        self,
-        ql_nope: torch.Tensor,
-        q_pe: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
-        attn_metadata: T,
-    ) -> torch.Tensor:
-        raise NotImplementedError
-    def forward(
-        self,
-        layer: AttentionLayer,
-        q: torch.Tensor,  # query in unified attn
-        k_c_normed: torch.Tensor,  # key in unified attn
-        k_pe: torch.Tensor,  # value in unified attn
-        kv_cache: torch.Tensor,
-        attn_metadata: T,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if output is not None:
-            raise NotImplementedError(
-                "output is not yet supported for MLAImplBase")
-        if output_scale is not None or output_block_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for MLAImplBase")
-        if attn_metadata.is_profile_run and \
-            attn_metadata.context_chunk_workspace is not None:
-            # During the profile run try to simulate to worse case output size
-            # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context`
-            # since this can be large
-            _ = torch.empty(
-                (attn_metadata.context_chunk_workspace.shape[0],
-                 self.num_heads, self.qk_nope_head_dim + self.v_head_dim),
-                device=k_c_normed.device,
-                dtype=k_c_normed.dtype,
-            )
-        has_decode = attn_metadata.decode_metadata is not None
-        has_prefill = attn_metadata.prefill_metadata is not None
-        num_prefill_tokens: int = attn_metadata.num_prefill_tokens
-        q = q.view(-1, self.num_heads, self.qk_head_dim)
-        decode_q = q[num_prefill_tokens:]
-        prefill_q = q[:num_prefill_tokens]
-        prefill_k_pe = k_pe[:num_prefill_tokens]
-        prefill_k_c_normed = k_c_normed[:num_prefill_tokens]
-        # write the latent and rope to kv cache
-        if kv_cache.numel() > 0:
-            ops.concat_and_cache_mla(
-                k_c_normed,
-                k_pe.squeeze(1),
-                kv_cache,
-                attn_metadata.slot_mapping.flatten(),
-                kv_cache_dtype=self.kv_cache_dtype,
-                scale=layer._k_scale,
-            )
-        output = torch.empty(attn_metadata.num_prefill_tokens +
-                             attn_metadata.num_decode_tokens,
-                             self.v_head_dim * self.num_heads,
-                             device=q.device,
-                             dtype=q.dtype)
-        if has_prefill:
-            output[:num_prefill_tokens] = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata, layer._k_scale)
-        if has_decode:
-            decode_q_nope, decode_q_pe = decode_q.split(
-                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-            # Convert from (B, N, P) to (N, B, P)
-            decode_q_nope = decode_q_nope.transpose(0, 1)
-            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
-            decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T)
-            # Convert from (N, B, L) to (B, N, L)
-            decode_ql_nope = decode_ql_nope.transpose(0, 1)
-            output[num_prefill_tokens:] = self._forward_decode(
-                decode_ql_nope, decode_q_pe, kv_cache, attn_metadata)
-        return output
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-import torch
-import torch_xla.experimental.custom_kernel  # Required to register custom ops.
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.logger import init_logger
-logger = init_logger(__name__)
-class PallasAttentionBackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "PALLAS"
-    @staticmethod
-    def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
-        return PallasAttentionBackendImpl
-    @staticmethod
-    def get_metadata_cls() -> Type["PallasMetadata"]:
-        return PallasMetadata
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (num_kv_heads, num_blocks, block_size, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        raise RuntimeError("swap_blocks is not used for the TPU backend.")
-    @torch.compile(backend="openxla")
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
-        src_to_dists: Tuple[torch.Tensor, torch.Tensor],
-    ) -> None:
-        src_indices, dst_indices = src_to_dists
-        for k_cache, v_cache in kv_caches:
-            torch.ops.xla.dynamo_set_buffer_donor_(k_cache, True)
-            k_cache[:, dst_indices] = k_cache[:, src_indices]
-            torch.ops.xla.dynamo_set_buffer_donor_(v_cache, True)
-            v_cache[:, dst_indices] = v_cache[:, src_indices]
-@dataclass
-class PallasMetadata(AttentionMetadata):
-    # Currently, input sequences can only contain all prefills
-    # or all decoding.
-    block_tables: Optional[torch.Tensor] = None
-    context_lens: Optional[torch.Tensor] = None
-    effective_query_lens: Optional[torch.Tensor] = None
-    @property
-    def prefill_metadata(self) -> Optional["PallasMetadata"]:
-        if self.num_prefills == 0:
-            return None
-        assert self.num_decode_tokens == 0
-        return self
-    @property
-    def decode_metadata(self) -> Optional["PallasMetadata"]:
-        if self.num_decode_tokens == 0:
-            return None
-        assert self.num_prefills == 0
-        assert self.num_prefill_tokens == 0
-        assert self.block_tables is not None
-        assert self.context_lens is not None
-        return self
-class PallasAttentionBackendImpl(AttentionImpl):
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Pallas is not supported yet, it will fall back "
-                "to global attention for long context.")
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.logits_soft_cap = logits_soft_cap
-        if head_size % 128 != 0:
-            raise NotImplementedError(
-                f"Head size must be a multiple of 128, found {head_size}.")
-        if alibi_slopes is not None:
-            raise NotImplementedError("Alibi slopes is not supported.")
-        if sliding_window is not None:
-            raise NotImplementedError("Sliding window is not supported.")
-        if is_quantized_kv_cache(kv_cache_dtype):
-            raise NotImplementedError("FP8 KV cache dtype is not supported.")
-        if blocksparse_params is not None:
-            raise NotImplementedError("Blocksparse is not supported.")
-        if torch_xla.tpu.version() < 4:
-            raise NotImplementedError("TPU version must be 4 or higher.")
-        self.megacore_mode = None
-        tpu_env = torch_xla.tpu.get_tpu_env()
-        tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None)
-                    or tpu_env.get("TYPE", None)
-                    or tpu_env.get("TPU_ACCELERATOR_TYPE", None))
-        assert tpu_type is not None
-        tpu_type = tpu_type.lower()
-        if (("lite" not in tpu_type) and ("v6" not in tpu_type)):
-            if self.num_kv_heads % 2 == 0:
-                self.megacore_mode = "kv_head"
-            else:
-                # NOTE(woosuk): If the batch size is not a multiple of 2, the
-                # megacore mode will be None.
-                self.megacore_mode = "batch"
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "PallasAttentionBackendImpl")
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        attn_metadata: PallasMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with Pallas attention.
-        Args:
-            query: shape = [batch_size, seq_len, num_heads * head_size]
-            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            kv_cache[0] = [num_kv_heads, num_blocks, block_size, head_size]
-            kv_cache[1] = [num_kv_heads, num_blocks, block_size, head_size]
-                NOTE: kv_cache[0] and kv_cache[1] will be an empty tensor 
-                with shape [0] for profiling run.
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [batch_size, seq_len, num_heads * head_size]
-        """
-        if output_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for PallasAttentionImpl")
-        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
-        batch_size, seq_len, hidden_size = query.shape
-        query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
-        key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
-        value = value.view(batch_size, seq_len, self.num_kv_heads,
-                           self.head_size)
-        if kv_cache[0].numel() > 0:
-            slot_mapping = attn_metadata.slot_mapping
-            key_cache, value_cache = kv_cache
-            write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping)
-        query = query * self.scale
-        if attn_metadata.num_prefills > 0:
-            if attn_metadata.block_tables is None:
-                # Prefill without paged KV cache.
-                assert seq_len % 16 == 0, (
-                    "Pallas FlashAttention kernel requires seq_len to be a "
-                    f"multiple of 16 but got {seq_len}")
-                # Handle GQA/MQA.
-                if self.num_kv_heads != self.num_heads:
-                    key = key.repeat_interleave(self.num_queries_per_kv,
-                                                dim=-2)
-                    key = key.view(batch_size, seq_len, self.num_heads,
-                                   self.head_size)
-                    value = value.repeat_interleave(self.num_queries_per_kv,
-                                                    dim=-2)
-                    value = value.view(batch_size, seq_len, self.num_heads,
-                                       self.head_size)
-                # FlashAttention kernel requires the input shape to be
-                # [batch_size, num_heads, seq_len, d_model]
-                # while the input is [batch_size, seq_len, num_heads, d_model].
-                # Permute the input to match the required format.
-                output = torch.ops.xla.flash_attention(
-                    query.permute(0, 2, 1, 3),
-                    key.permute(0, 2, 1, 3),
-                    value.permute(0, 2, 1, 3),
-                    True,
-                )
-                output = output.permute(0, 2, 1, 3)
-            else:
-                # Prefill with paged KV cache.
-                # TODO(woosuk): Tune the below knobs.
-                num_kv_pages_per_compute_block = 16
-                num_queries_per_compute_block = 16
-                assert seq_len % num_queries_per_compute_block == 0
-                output = torch.ops.xla.multi_queries_paged_attention(
-                    query,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.context_lens,
-                    attn_metadata.block_tables,
-                    attn_metadata.effective_query_lens,
-                    num_kv_pages_per_compute_block,
-                    num_queries_per_compute_block,
-                    use_kernel=True,
-                    attn_logits_soft_cap=self.logits_soft_cap,
-                )
-        else:
-            # Decoding run.
-            assert kv_cache[0].numel() > 0
-            query = query.squeeze(dim=1)
-            pages_per_compute_block = 16  # TODO(woosuk): Tune this value.
-            assert attn_metadata.block_tables is not None
-            assert attn_metadata.context_lens is not None
-            # NOTE(woosuk): The PagedAttention Pallas kernel stores the entire
-            # block table in SMEM. Therefore, if the block table is too large,
-            # the kernel compilation will fail. To avoid this, we split the
-            # batch dimension into smaller chunks and run the kernel multiple
-            # times.
-            MAX_SMEM_USAGE = 512 * 1024
-            size_per_seq = 4 * attn_metadata.block_tables.shape[1]
-            max_num_seq = MAX_SMEM_USAGE // size_per_seq
-            if batch_size <= max_num_seq:
-                output = paged_attention(
-                    query,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.context_lens,
-                    attn_metadata.block_tables,
-                    pages_per_compute_block,
-                    self.megacore_mode,
-                    attn_logits_soft_cap=self.logits_soft_cap,
-                )
-            else:
-                chunk_size = max_num_seq
-                # Make sure the chunk size is a multiple of 2.
-                chunk_size = chunk_size // 2 * 2
-                num_chunks = (batch_size + chunk_size - 1) // chunk_size
-                output = torch.empty_like(query)
-                for chunk_idx in range(num_chunks):
-                    chunk_start = chunk_idx * chunk_size
-                    chunk_end = chunk_start + chunk_size
-                    # NOTE(woosuk): We skip this line because it causes Dynamo
-                    # compilation error. Instead, we rely on the slice operation
-                    # to handle the out-of-bound case.
-                    # chunk_end = min(chunk_end, batch_size)
-                    chunk_output = paged_attention(
-                        query[chunk_start:chunk_end],
-                        key_cache,
-                        value_cache,
-                        attn_metadata.context_lens[chunk_start:chunk_end],
-                        attn_metadata.block_tables[chunk_start:chunk_end],
-                        pages_per_compute_block,
-                        self.megacore_mode,
-                        attn_logits_soft_cap=self.logits_soft_cap,
-                    )
-                    output[chunk_start:chunk_end] = chunk_output
-        # Reshape the output tensor.
-        return output.reshape(batch_size, seq_len, hidden_size)
-def write_to_kv_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,
-) -> None:
-    torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True)
-    torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True)
-    key = key.flatten(0, 2)
-    value = value.flatten(0, 2)
-    key_cache = key_cache.flatten(0, 2)
-    value_cache = value_cache.flatten(0, 2)
-    key_cache.index_copy_(0, slot_mapping, key)
-    value_cache.index_copy_(0, slot_mapping, value)
-def paged_attention(
-    query: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    context_lens: torch.Tensor,
-    block_tables: torch.Tensor,
-    pages_per_compute_block: int,
-    megacore_mode: Optional[str],
-    *,
-    attn_logits_soft_cap: Optional[float],
-) -> torch.Tensor:
-    batch_size = query.shape[0]
-    if megacore_mode == "batch" and batch_size % 2 != 0:
-        megacore_mode = None
-    else:
-        megacore_mode = megacore_mode
-    return torch.ops.xla.paged_attention(
-        query,
-        key_cache,
-        value_cache,
-        context_lens,
-        block_tables,
-        pages_per_compute_block,
-        megacore_mode=megacore_mode,
-        attn_logits_soft_cap=attn_logits_soft_cap,
-    )
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer ROCm GPUs."""
-import itertools
-from dataclasses import dataclass
-from functools import cache
-from typing import List, Optional, Tuple, Type
-import torch
-import vllm.envs as envs
-from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType)
-from vllm.attention.backends.utils import (CommonAttentionState,
-                                           CommonMetadataBuilder)
-from vllm.attention.ops.paged_attn import (PagedAttention,
-                                           PagedAttentionMetadata)
-from vllm.config import get_current_vllm_config
-from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey, kFp8StaticTensorSym)
-from vllm.platforms import current_platform
-logger = init_logger(__name__)
-_PARTITION_SIZE_ROCM = 256
-@cache
-def is_rocm_aiter_paged_attn_enabled() -> bool:
-    return envs.VLLM_ROCM_USE_AITER_PAGED_ATTN \
-        and envs.VLLM_ROCM_USE_AITER \
-@cache
-def _get_paged_attn_module() -> PagedAttention:
-    """
-    Initializes the appropriate PagedAttention module from `attention/ops`,
-    which is used as helper function
-    by `ROCmFlashAttentionImpl` and `ROCmFlashAttentionBackend`.
-    The choice of attention module depends on whether
-    AITER paged attention is enabled:
-    - If enabled, `ROCmFlashAttentionImpl` uses `AITERPagedAttention`.
-    - Otherwise, it defaults to using the original `PagedAttention`.
-    """
-    if is_rocm_aiter_paged_attn_enabled():
-        # Import AITERPagedAttention only when the flag is enabled
-        from vllm.attention.ops.rocm_aiter_paged_attn import (
-            AITERPagedAttention)
-        return AITERPagedAttention()
-    return PagedAttention()
-class ROCmFlashAttentionBackend(AttentionBackend):
-    accept_output_buffer: bool = True
-    @staticmethod
-    def get_name() -> str:
-        return "ROCM_FLASH"
-    @staticmethod
-    def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]:
-        return ROCmFlashAttentionImpl
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return ROCmFlashAttentionMetadata
-    @staticmethod
-    def get_builder_cls() -> Type["ROCmFlashAttentionMetadataBuilder"]:
-        return ROCmFlashAttentionMetadataBuilder
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        paged_attn = _get_paged_attn_module()
-        return paged_attn.get_kv_cache_shape(num_blocks, block_size,
-                                             num_kv_heads, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        paged_attn = _get_paged_attn_module()
-        paged_attn.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        paged_attn = _get_paged_attn_module()
-        paged_attn.copy_blocks(kv_caches, src_to_dists)
-@dataclass
-class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
-    """Metadata for FlashAttentionBackend.
-    NOTE: Any python object stored here is not updated when it is
-    cuda-graph replayed. If you have values that need to be changed
-    dynamically, it should be stored in tensor. The tensor has to be
-    updated from `CUDAGraphRunner.forward` API.
-    """
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens None if it is a decoding.
-    seq_lens: Optional[List[int]]
-    # seq_lens stored as a tensor.
-    seq_lens_tensor: Optional[torch.Tensor]
-    # Maximum sequence length among prefill batch. 0 if there are decoding
-    # requests only.
-    max_prefill_seq_len: int
-    # Maximum sequence length among decode batch. 0 if there are prefill
-    # requests only.
-    max_decode_seq_len: int
-    # Whether or not if cuda graph is enabled.
-    # Cuda-graph is currently enabled for decoding only.
-    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
-    use_cuda_graph: bool
-    # NOTE(sang): Definition of context_len, query_len, and seq_len.
-    # |---------- N-1 iteration --------|
-    # |---------------- N iteration ---------------------|
-    # |- tokenA -|......................|-- newTokens ---|
-    # |---------- context_len ----------|
-    # |-------------------- seq_len ----------------------|
-    #                                   |-- query_len ---|
-    # Maximum query length in the batch. None for decoding.
-    max_query_len: Optional[int] = None
-    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
-    # the batch, used to index into subquery. E.g., if the subquery length
-    # is [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor] = None
-    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
-    # the batch, used to index into sequence. E.g., if the sequence length is
-    # [4, 6], it is [0, 4, 10].
-    seq_start_loc: Optional[torch.Tensor] = None
-    # (batch_size,) A tensor of context lengths (tokens that are computed
-    # so far).
-    context_lens_tensor: Optional[torch.Tensor] = None
-    # Max number of query tokens among request in the batch.
-    max_decode_query_len: Optional[int] = None
-    _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None
-    _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None
-    # Begin encoder attn & enc/dec cross-attn fields...
-    # Encoder sequence lengths representation
-    encoder_seq_lens: Optional[List[int]] = None
-    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
-    # Maximum sequence length among encoder sequences
-    max_encoder_seq_len: Optional[int] = None
-    # Number of tokens input to encoder
-    num_encoder_tokens: Optional[int] = None
-    # Cross-attention memory-mapping data structures: slot mapping
-    # and block tables
-    cross_slot_mapping: Optional[torch.Tensor] = None
-    cross_block_tables: Optional[torch.Tensor] = None
-    @property
-    def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
-        if self.num_prefills == 0:
-            return None
-        if self._cached_prefill_metadata is not None:
-            return self._cached_prefill_metadata
-        assert self.seq_lens is not None
-        assert self.seq_lens_tensor is not None
-        assert self.block_tables is not None
-        self._cached_prefill_metadata = ROCmFlashAttentionMetadata(
-            num_prefills=self.num_prefills,
-            num_prefill_tokens=self.num_prefill_tokens,
-            num_decode_tokens=0,
-            slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
-            multi_modal_placeholder_index_maps=self.
-            multi_modal_placeholder_index_maps,
-            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
-            seq_lens=self.seq_lens[:self.num_prefills],
-            seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
-            max_query_len=self.max_query_len,
-            max_prefill_seq_len=self.max_prefill_seq_len,
-            max_decode_seq_len=0,
-            query_start_loc=None if self.query_start_loc is None else
-            self.query_start_loc[:self.num_prefills + 1],
-            seq_start_loc=None if self.seq_start_loc is None else
-            self.seq_start_loc[:self.num_prefills + 1],
-            context_lens_tensor=None if self.context_lens_tensor is None else
-            self.context_lens_tensor[:self.num_prefills],
-            block_tables=self.block_tables[:self.num_prefills],
-            use_cuda_graph=False,
-            # Begin encoder & cross attn fields below...
-            encoder_seq_lens=self.encoder_seq_lens,
-            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
-            max_encoder_seq_len=self.max_encoder_seq_len,
-            cross_slot_mapping=self.cross_slot_mapping,
-            cross_block_tables=self.cross_block_tables)
-        return self._cached_prefill_metadata
-    @property
-    def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
-        if self.num_decode_tokens == 0:
-            return None
-        if self._cached_decode_metadata is not None:
-            return self._cached_decode_metadata
-        assert self.block_tables is not None
-        assert self.seq_lens_tensor is not None
-        self._cached_decode_metadata = ROCmFlashAttentionMetadata(
-            num_prefills=0,
-            num_prefill_tokens=0,
-            num_decode_tokens=self.num_decode_tokens,
-            slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=True,
-            seq_lens=None,
-            seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
-            max_query_len=None,
-            max_prefill_seq_len=0,
-            max_decode_seq_len=self.max_decode_seq_len,
-            query_start_loc=None,
-            seq_start_loc=None,
-            context_lens_tensor=None,
-            block_tables=self.block_tables[self.num_prefills:],
-            use_cuda_graph=self.use_cuda_graph,
-            # Begin encoder & cross attn fields below...
-            encoder_seq_lens=self.encoder_seq_lens,
-            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
-            max_encoder_seq_len=self.max_encoder_seq_len,
-            cross_slot_mapping=self.cross_slot_mapping,
-            cross_block_tables=self.cross_block_tables)
-        # Batch may be composed of prefill|decodes, adjust query start indices
-        # to refer to the start of decodes when the two are split apart.
-        # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
-        if self._cached_decode_metadata.query_start_loc is not None:
-            qs = self._cached_decode_metadata.query_start_loc
-            self._cached_decode_metadata.query_start_loc = qs - qs[0]
-        return self._cached_decode_metadata
-class ROCmFlashAttentionMetadataBuilder(
-        CommonMetadataBuilder[ROCmFlashAttentionMetadata]):
-    _metadata_cls = ROCmFlashAttentionMetadata
-def _make_alibi_bias(alibi_slopes: torch.Tensor,
-                     dtype: torch.dtype,
-                     seq_lens: Optional[List[int]],
-                     make_attn_mask: bool = True) -> List[torch.Tensor]:
-    attn_biases = []
-    if seq_lens:
-        for seq_len in seq_lens:
-            bias = torch.arange(seq_len, dtype=dtype)
-            # NOTE(zhuohan): HF uses
-            #     `bias = bias[None, :].repeat(seq_len, 1)`
-            # here. We find that both biases give the same results, but
-            # the bias below more accurately follows the original ALiBi
-            # paper.
-            bias = bias[None, :] - bias[:, None]
-            num_heads = alibi_slopes.shape[0]
-            bias = bias[None, :].repeat(
-                (num_heads, 1, 1)).to(alibi_slopes.device)
-            bias.mul_(alibi_slopes[:, None, None])
-            if make_attn_mask:
-                inf_mask = torch.empty(
-                    (1, seq_len, seq_len),
-                    dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1).to(
-                        alibi_slopes.device)
-                attn_biases.append((bias + inf_mask).to(dtype))
-            else:
-                attn_biases.append(bias.to(dtype))
-    return attn_biases
-def _get_seq_len_block_table_args(
-    attn_metadata: ROCmFlashAttentionMetadata,
-    attn_type: str,
-) -> tuple:
-    '''
-    The particular choice of sequence-length
-    attributes which should be extracted from attn_metadata is dependent
-    on the type of attention operation.
-    Decoder attn -> select entirely decoder self-attention-related fields
-    Encoder/decoder cross-attn -> select encoder sequence lengths
-    Encoder attn -> select encoder sequence lengths fields
-    Encoder-only attn -> select prefill sequence lengths with 
-        bidirectional attention
-    Arguments:
-    * attn_metadata: Attention metadata structure associated with attention op
-    * attn_type: encoder attention, decoder self-attention,
-                encoder/decoder cross-attention, encoder-only
-    Returns:
-    * Appropriate sequence-lengths tensors for query and key
-    * Appropriate max sequence-length scalar
-    * Causal masking flag
-    '''
-    if attn_type == AttentionType.ENCODER:
-        assert attn_metadata.encoder_seq_lens is not None
-        assert attn_metadata.encoder_seq_lens_tensor is not None
-        query_seq_start_loc = torch.tensor(
-            list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)),
-            device=attn_metadata.encoder_seq_lens_tensor.device,
-            dtype=attn_metadata.encoder_seq_lens_tensor.dtype)
-        causal_mask = False
-        # No block tables associated with encoder attention
-        return (query_seq_start_loc, attn_metadata.max_encoder_seq_len,
-                query_seq_start_loc, attn_metadata.max_encoder_seq_len,
-                attn_metadata.encoder_seq_lens, causal_mask)
-    elif attn_type == AttentionType.ENCODER_ONLY:
-        # For encoder-only models, we use the prefill sequence lengths
-        assert attn_metadata.seq_lens is not None
-        assert attn_metadata.seq_lens_tensor is not None
-        query_seq_start_loc = torch.tensor(
-            list(itertools.accumulate([0] + attn_metadata.seq_lens)),
-            device=attn_metadata.seq_lens_tensor.device,
-            dtype=attn_metadata.seq_lens_tensor.dtype)
-        max_seq_len = attn_metadata.max_prefill_seq_len
-        # Encoder-only models typically use bidirectional attention
-        causal_mask = False
-        return (query_seq_start_loc, max_seq_len, query_seq_start_loc,
-                max_seq_len, attn_metadata.seq_lens, causal_mask)
-    elif attn_type == AttentionType.DECODER:
-        # Decoder self-attention
-        # Choose max_seq_len based on whether we are in prompt_run
-        assert attn_metadata.seq_lens is not None
-        assert attn_metadata.seq_lens_tensor is not None
-        query_seq_start_loc = torch.tensor(
-            list(itertools.accumulate([0] + attn_metadata.seq_lens)),
-            device=attn_metadata.seq_lens_tensor.device,
-            dtype=attn_metadata.seq_lens_tensor.dtype)
-        max_seq_len = attn_metadata.max_prefill_seq_len
-        causal_mask = True
-        return (query_seq_start_loc, max_seq_len, query_seq_start_loc,
-                max_seq_len, attn_metadata.seq_lens, causal_mask)
-    elif attn_type == AttentionType.ENCODER_DECODER:
-        assert attn_metadata.seq_lens is not None
-        assert attn_metadata.encoder_seq_lens_tensor is not None
-        query_start_loc = torch.tensor(
-            list(itertools.accumulate([0] + attn_metadata.seq_lens)),
-            device=attn_metadata.encoder_seq_lens_tensor.device,
-            dtype=attn_metadata.encoder_seq_lens_tensor.dtype)
-        assert attn_metadata.encoder_seq_lens is not None
-        assert attn_metadata.seq_lens_tensor is not None
-        key_seq_start_loc = torch.tensor(
-            list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)),
-            device=attn_metadata.seq_lens_tensor.device,
-            dtype=attn_metadata.seq_lens_tensor.dtype)
-        causal_mask = False
-        # Enc/dec cross-attention KVs match encoder sequence length;
-        # cross-attention utilizes special "cross" block tables
-        return (query_start_loc, attn_metadata.max_prefill_seq_len,
-                key_seq_start_loc, attn_metadata.max_encoder_seq_len,
-                attn_metadata.seq_lens, causal_mask)
-    else:
-        raise AttributeError(f"Invalid attention type {str(attn_type)}")
-class ROCmFlashAttentionImpl(AttentionImpl):
-    """
-    If the input tensors contain prompt tokens, the layout is as follows:
-    |<--------------- num_prompt_tokens -------------->|
-    |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|
-    Otherwise, the layout is as follows:
-    |<------------------ num_generation_tokens (M) ----------------->|
-    |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->|
-    Generation tokens can contain padding when cuda-graph is used.
-    Currently, prompt tokens don't contain any padding.
-    The prompts might have different lengths, while the generation tokens
-    always have length 1.
-    If chunked prefill is enabled, prefill tokens and decode tokens can be
-    batched together in a flattened 1D query.
-    |<----- num_prefill_tokens ---->|<------- num_decode_tokens ----------->|	
-    |<-prompt_0->|...|<-prompt_N-1->|<-generation_0->|...|<-generation_M-1->|
-    Currently, cuda graph is disabled for chunked prefill, meaning there's no
-    padding between prefill and decode tokens.
-    """
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0 "
-                                      "ROCM_FLASH backend.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in ROCm Flash Attention is not supported yet, it "
-                "will fail back to global attention for long context.")
-        if use_irope:
-            logger.warning(
-                "Using irope in V0 is not supported yet, it will fall back "
-                "to global attention for long context.")
-        if logits_soft_cap is None:
-            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
-            self.logits_soft_cap = 0.0
-        else:
-            self.logits_soft_cap = logits_soft_cap
-        self.attn_type = attn_type
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.alibi_slopes = alibi_slopes
-        self.sliding_window = ((sliding_window, sliding_window)
-                               if sliding_window is not None else (-1, -1))
-        self.kv_cache_dtype = kv_cache_dtype
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.paged_attn_module = _get_paged_attn_module()
-        supported_head_sizes = self.paged_attn_module.get_supported_head_sizes(
-        )
-        if head_size not in supported_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {supported_head_sizes}.")
-        self.use_naive_attn = False
-        # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
-        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
-        if self.use_triton_flash_attn:
-            if logits_soft_cap is not None:
-                raise ValueError(
-                    "ROCm Triton FlashAttention does not support attention"
-                    " logits soft capping."
-                    " please try using the ROCm CK "
-                    "FA backend instead by setting the env var "
-                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
-            from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
-                triton_attention)
-            self.triton_attn_func = triton_attention
-            logger.debug("Using Triton FA in ROCmBackend")
-            if self.sliding_window != (-1, -1):
-                logger.warning("ROCm Triton FA does not currently support "
-                               "sliding window attention. If using half "
-                               "precision, please try using the ROCm CK "
-                               "FA backend instead by setting the env var "
-                               "`VLLM_USE_TRITON_FLASH_ATTN=0`")
-        else:
-            # if not using triton, navi3x/navi21/navi10 do not use flash-attn
-            # either
-            if not current_platform.has_device_capability(90):
-                self.use_naive_attn = True
-            else:
-                try:
-                    from flash_attn import flash_attn_varlen_func  # noqa: F401
-                    self.fa_attn_func = flash_attn_varlen_func
-                    logger.debug("Using CK FA in ROCmBackend")
-                except ModuleNotFoundError:
-                    self.use_naive_attn = True
-            if self.use_naive_attn:
-                if logits_soft_cap is not None:
-                    raise ValueError(
-                        "ROCm Naive FlashAttention does not support "
-                        "attention logits soft capping.")
-                self.sdpa_attn_func = _sdpa_attention
-                logger.debug("Using naive (SDPA) attention in ROCmBackend")
-        self.aiter_kv_scales_initialized = False
-        self.force_fp8_attention = (
-            get_current_vllm_config() is not None
-            and get_current_vllm_config().model_config.override_attention_dtype
-            == "fp8")
-    def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
-        """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
-        tokens, n_kv_heads, head_dim = x.shape
-        return (x[:, :,
-                  None, :].expand(tokens, n_kv_heads, n_rep,
-                                  head_dim).reshape(tokens, n_kv_heads * n_rep,
-                                                    head_dim))
-    def fused_output_quant_supported(self, quant_key: QuantKey):
-        if self.use_triton_flash_attn:
-            return quant_key == kFp8StaticTensorSym
-        # Only supported in the Triton backend
-        return False
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: ROCmFlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with FlashAttention and PagedAttention.
-        For decoder-only models: query, key and value must be non-None.
-        For encoder/decoder models:
-        * ROCmFlashAttentionImpl.forward() may be invoked for both self- and 
-            cross-attention layers.
-        * For self-attention: query, key and value must be non-None.
-        * For cross-attention:
-            * Query must be non-None
-            * During prefill, key and value must be non-None; key and value
-              get cached for use during decode.
-            * During decode, key and value may be None, since:
-              (1) key and value tensors were cached during prefill, and
-              (2) cross-attention key and value tensors do not grow during
-                  decode
-        A note on how the attn_type (attention type enum) argument impacts
-        attention forward() behavior:
-            * DECODER: normal decoder-only behavior;
-                use decoder self-attention block table
-            * ENCODER: no KV caching; pass encoder sequence
-                attributes (encoder_seq_lens/encoder_seq_lens_tensor/
-                max_encoder_seq_len) to kernel, in lieu of decoder
-                sequence attributes (seq_lens/seq_lens_tensor/max_seq_len)
-            * ENCODER_DECODER: cross-attention behavior;
-                use cross-attention block table for caching KVs derived
-                from encoder hidden states; since KV sequence lengths
-                will match encoder sequence lengths, pass encoder sequence
-                attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/
-                max_encoder_seq_len)
-            * ENCODER_ONLY: bidirectional attention with no KV caching;
-                use prefill sequence attributes
-        Args:
-            layer: Attention layer instance.
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache: KV cache tensor with shape 
-                [2, num_blocks, block_size * num_kv_heads * head_size].
-                NOTE: kv_cache will be an empty tensor with shape [0]
-                for profiling run.
-            attn_metadata: Metadata for attention.
-            output: Optional output tensor.
-            output_scale: Optional output scale tensor.
-            output_block_scale: Optional output block scale tensor.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        assert output is not None, "Output tensor must be provided."
-        if output_scale is not None and not self.use_triton_flash_attn:
-            raise NotImplementedError(
-                "fused output quantization only supported for Triton"
-                " implementation in ROCMFlashAttentionImpl for now")
-        if output_block_scale is not None:
-            raise NotImplementedError(
-                "fused nvfp4 output quantization is not supported"
-                " for ROCMFlashAttentionImpl")
-        query = query.view(-1, self.num_heads, self.head_size)
-        if key is not None:
-            assert value is not None
-            key = key.view(-1, self.num_kv_heads, self.head_size)
-            value = value.view(-1, self.num_kv_heads, self.head_size)
-        else:
-            assert value is None
-        paged_attn = self.paged_attn_module
-        # Reshaping kv tensors is required for AITER paged attention kernel
-        # because it works on a different tensor shape,
-        # when the size of one element is one byte (int8/fp8 dtypes).
-        # This reshaping is only required on the first forward call
-        # and the kv cache must not be empty.
-        if (is_rocm_aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1
-                and not self.aiter_kv_scales_initialized
-                and kv_cache.shape != torch.Size([0])):
-            num_blocks = kv_cache.shape[1]
-            block_size = kv_cache.shape[2] // (self.num_kv_heads *
-                                               self.head_size)
-            k_scale = torch.empty((self.num_kv_heads, num_blocks * block_size),
-                                  dtype=torch.float32,
-                                  device=kv_cache.device)
-            v_scale = torch.empty((self.num_kv_heads, num_blocks * block_size),
-                                  dtype=torch.float32,
-                                  device=kv_cache.device)
-            self.aiter_kv_scales_initialized = True
-            k_scale.fill_(layer._k_scale.item())
-            v_scale.fill_(layer._v_scale.item())
-            layer._k_scale = k_scale
-            layer._v_scale = v_scale
-        # Only update KV cache for decoder self-attention
-        # and encoder-decoder cross-attention
-        if self.attn_type not in [
-                AttentionType.ENCODER, AttentionType.ENCODER_ONLY
-        ] and kv_cache.numel() > 0:
-            key_cache, value_cache = paged_attn.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-            if key is not None and value is not None:
-                # Reshape the input keys and values and store them in the
-                # cache. If kv_cache is not provided, the new key and value
-                # tensors are not cached. This happens during the initial
-                # memory profiling run.
-                paged_attn.write_to_paged_cache(
-                    key,
-                    value,
-                    key_cache,
-                    value_cache,
-                    attn_metadata.slot_mapping
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    attn_metadata.cross_slot_mapping,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
-        if self.attn_type != AttentionType.ENCODER:
-            num_prefill_tokens = attn_metadata.num_prefill_tokens
-        elif self.attn_type == AttentionType.ENCODER_ONLY:
-            # For encoder-only models, all tokens are processed in one go
-            num_prefill_tokens = query.shape[0]
-        else:
-            assert attn_metadata.num_encoder_tokens is not None
-            num_prefill_tokens = attn_metadata.num_encoder_tokens
-        # Query for decode. KV is not needed because it is already cached.
-        decode_query = query[num_prefill_tokens:]
-        # QKV for prefill.
-        query = query[:num_prefill_tokens]
-        # For encoder-only and encoder models,
-        # we process all tokens at once
-        # For decoder and encoder-decoder,
-        # we may need to limit key/value to prefill tokens
-        if key is not None and value is not None \
-            and self.attn_type not in [AttentionType.ENCODER_DECODER,
-                                       AttentionType.ENCODER_ONLY]:
-            key = key[:num_prefill_tokens]
-            value = value[:num_prefill_tokens]
-        if prefill_meta := attn_metadata.prefill_metadata:
-            # Prompt run.
-            # normal attention and DECODER
-            if self.attn_type == AttentionType.DECODER and (
-                    kv_cache.numel() == 0 or prefill_meta.block_tables is None
-                    or prefill_meta.block_tables.numel() == 0):
-                (query_seq_start_loc, query_max_seq_len, key_seq_start_loc,
-                 key_max_seq_len, seq_lens,
-                 causal_mask) = (prefill_meta.seq_start_loc,
-                                 prefill_meta.max_prefill_seq_len,
-                                 prefill_meta.seq_start_loc,
-                                 prefill_meta.max_prefill_seq_len,
-                                 attn_metadata.seq_lens, True)
-            # prefix-enabled attention and ENCODER/ENCODER_DECODER
-            else:
-                (query_seq_start_loc, query_max_seq_len, key_seq_start_loc,
-                 key_max_seq_len, seq_lens,
-                 causal_mask) = _get_seq_len_block_table_args(
-                     prefill_meta, self.attn_type)
-            # Prompt run.
-            if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0:
-                # triton attention
-                # When block_tables are not filled, it means q and k are the
-                # prompt, and they have the same length.
-                attn_masks = None
-                if self.use_triton_flash_attn:
-                    if self.alibi_slopes is not None:
-                        attn_masks = _make_alibi_bias(
-                            self.alibi_slopes,
-                            query.dtype,
-                            seq_lens,
-                            make_attn_mask=causal_mask)  # type: ignore
-                    use_fp8_scales = (layer._q_scale and layer._k_scale
-                                      and layer._v_scale and layer._prob_scale
-                                      and (self.kv_cache_dtype == "fp8"
-                                           or self.force_fp8_attention))
-                    full_scales = (
-                        layer._q_scale.item(), layer._k_scale.item(),
-                        layer._v_scale.item(),
-                        layer._prob_scale.item()) if use_fp8_scales else None
-                    self.triton_attn_func(
-                        query,
-                        key,
-                        value,
-                        output[:num_prefill_tokens],
-                        query_seq_start_loc,
-                        key_seq_start_loc,
-                        query_max_seq_len,
-                        key_max_seq_len,
-                        causal_mask,
-                        self.scale,
-                        attn_masks[0][None]
-                        if attn_masks is not None else None,
-                        full_scales,
-                        output_scale,
-                    )
-                elif self.use_naive_attn:
-                    if self.num_kv_heads != self.num_heads:
-                        # Interleave for MQA workaround.
-                        key = self.repeat_kv(key, self.num_queries_per_kv)
-                        value = self.repeat_kv(value, self.num_queries_per_kv)
-                    if self.alibi_slopes is not None:
-                        attn_masks = _make_alibi_bias(
-                            self.alibi_slopes,
-                            query.dtype,
-                            attn_metadata.seq_lens,
-                            make_attn_mask=causal_mask)  # type: ignore
-                    query = query.movedim(0, query.dim() - 2)
-                    key = key.movedim(0, key.dim() - 2)
-                    value = value.movedim(0, value.dim() - 2)
-                    # sdpa math backend attention
-                    self.sdpa_attn_func(
-                        query,
-                        key,
-                        value,
-                        output[:num_prefill_tokens],
-                        query_seq_start_loc,
-                        num_prefill_tokens,
-                        self.num_heads,
-                        self.head_size,
-                        self.scale,
-                        attn_masks,
-                    )
-                else:
-                    # upstream FA does not support an output arg, copy
-                    output[:num_prefill_tokens] = self.fa_attn_func(
-                        q=query,
-                        k=key,
-                        v=value,
-                        cu_seqlens_q=query_seq_start_loc,
-                        cu_seqlens_k=key_seq_start_loc,
-                        max_seqlen_q=prefill_meta.max_prefill_seq_len,
-                        max_seqlen_k=key_max_seq_len,
-                        softmax_scale=self.scale,
-                        causal=causal_mask,
-                        window_size=self.sliding_window,
-                        alibi_slopes=self.alibi_slopes,
-                        softcap=self.logits_soft_cap,
-                    )
-            else:
-                # prefix-enabled attention -
-                # not applicable for encoder-only models
-                if self.attn_type != AttentionType.ENCODER_ONLY:
-                    output[:num_prefill_tokens] = paged_attn.forward_prefix(
-                        query,
-                        key,
-                        value,
-                        self.kv_cache_dtype,
-                        key_cache,
-                        value_cache,
-                        prefill_meta.block_tables,
-                        prefill_meta.query_start_loc,
-                        prefill_meta.seq_lens_tensor,
-                        prefill_meta.max_query_len,
-                        self.alibi_slopes,
-                        self.sliding_window[0],
-                        layer._k_scale,
-                        layer._v_scale,
-                    )
-        # Skip decode phase for encoder-only models
-        if (decode_meta := attn_metadata.decode_metadata) and (
-                self.attn_type != AttentionType.ENCODER_ONLY):
-            # Decoding run.
-            # Whether to use rocm custom paged attention or not
-            num_seqs, num_heads, head_size = decode_query.shape
-            block_size = value_cache.shape[3]
-            gqa_ratio = num_heads // self.num_kv_heads
-            from vllm.platforms.rocm import use_rocm_custom_paged_attention
-            use_custom = use_rocm_custom_paged_attention(
-                decode_query.dtype, head_size, block_size, gqa_ratio,
-                decode_meta.max_decode_seq_len, self.sliding_window,
-                self.kv_cache_dtype, self.alibi_slopes)
-            use_custom = False
-            if use_custom:
-                max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type
-                               != AttentionType.ENCODER_DECODER else
-                               decode_meta.max_encoder_seq_len)
-                assert max_seq_len is not None
-                max_num_partitions = (
-                    (max_seq_len + _PARTITION_SIZE_ROCM - 1) //
-                    _PARTITION_SIZE_ROCM)
-                assert _PARTITION_SIZE_ROCM % block_size == 0
-                tmp_output = torch.empty(
-                    size=(num_seqs, num_heads, max_num_partitions, head_size),
-                    dtype=query.dtype,
-                    device=output.device,
-                )
-                exp_sums = torch.empty(
-                    size=(num_seqs, num_heads, max_num_partitions),
-                    dtype=torch.float32,
-                    device=output.device,
-                )
-                max_logits = torch.empty_like(exp_sums)
-                query_start_loc = None
-                ops.paged_attention_rocm(
-                    output[num_prefill_tokens:],
-                    exp_sums,
-                    max_logits,
-                    tmp_output,
-                    decode_query,
-                    key_cache,
-                    value_cache,
-                    self.num_kv_heads,
-                    self.scale,
-                    decode_meta.block_tables
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    decode_meta.cross_block_tables,
-                    decode_meta.seq_lens_tensor
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    decode_meta.encoder_seq_lens_tensor,
-                    query_start_loc,
-                    block_size,
-                    max_seq_len,
-                    self.alibi_slopes,
-                    self.kv_cache_dtype,
-                    layer._k_scale,
-                    layer._v_scale,
-                    output_scale,
-                )
-            else:
-                # PagedAttention does not support fused quant, manually quantize
-                if output_scale is None:
-                    out_pa = output[num_prefill_tokens:]
-                else:
-                    out_pa = torch.empty_like(output[num_prefill_tokens:],
-                                              dtype=query.dtype)
-                out_pa[:] = paged_attn.forward_decode(
-                    decode_query,
-                    key_cache,
-                    value_cache,
-                    decode_meta.block_tables
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    decode_meta.cross_block_tables,
-                    decode_meta.seq_lens_tensor
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    decode_meta.encoder_seq_lens_tensor,
-                    decode_meta.max_decode_seq_len
-                    if self.attn_type != AttentionType.ENCODER_DECODER else
-                    decode_meta.max_encoder_seq_len,
-                    self.kv_cache_dtype,
-                    self.num_kv_heads,
-                    self.scale,
-                    self.alibi_slopes,
-                    layer._k_scale,
-                    layer._v_scale,
-                )
-                # Manually perform quantization
-                if output_scale is not None:
-                    out_uq = out_pa.view(-1, self.num_heads * self.head_size)
-                    out_q = output.view(-1, self.num_heads * self.head_size)
-                    ops.scaled_fp8_quant(out_uq,
-                                         output_scale,
-                                         output=out_q[num_prefill_tokens:])
-        # Reshape the output tensor.
-        return output.view(-1, self.num_heads * self.head_size)
-def _sdpa_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    output: torch.Tensor,
-    seq_lens: torch.Tensor,
-    num_tokens: int,
-    num_heads: int,
-    head_size: int,
-    scale: float,
-    attn_masks: Optional[List[torch.Tensor]] = None,
-) -> torch.Tensor:
-    start = 0
-    assert output.shape == (num_tokens, num_heads, head_size)
-    assert output.dtype == query.dtype
-    assert output.device == query.device
-    for i, seq_len in enumerate(seq_lens):
-        end = start + seq_len
-        with torch.nn.attention.sdpa_kernel(
-                torch.nn.attention.SDPBackend.MATH):
-            sub_out = torch.nn.functional.scaled_dot_product_attention(
-                query[:, start:end, :],
-                key[:, start:end, :],
-                value[:, start:end, :],
-                dropout_p=0.0,
-                is_causal=attn_masks is None,
-                attn_mask=attn_masks[i] if attn_masks else None,
-                scale=scale).movedim(query.dim() - 2, 0)
-            output[start:end, :, :] = sub_out
-            start = end
-    return output
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-""" Attention layer with torch scaled_dot_product_attention
-    and PagedAttention."""
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-import torch
-from torch.nn.functional import scaled_dot_product_attention
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata,
-                                              AttentionMetadataBuilder,
-                                              AttentionType,
-                                              is_quantized_kv_cache)
-# yapf: enable
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex
-from vllm.attention.ops.paged_attn import PagedAttentionMetadata
-from vllm.logger import init_logger
-from vllm.utils import make_tensor_with_pad
-from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
-logger = init_logger(__name__)
-class TorchSDPABackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "TORCH_SDPA"
-    @staticmethod
-    def get_impl_cls() -> Type["TorchSDPABackendImpl"]:
-        return TorchSDPABackendImpl
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return TorchSDPAMetadata
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-    @staticmethod
-    def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]:
-        return TorchSDPAMetadataBuilder
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
-                                                 num_kv_heads, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        raise NotImplementedError("Swap is not supported in TorchSDPABackend.")
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
-@dataclass
-class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
-    """Metadata for TorchSDPABackend.
-    """
-    # Currently, input sequences can only contain all prompts
-    # or all decoding. True if all sequences are prompts.
-    chunked_prefill: bool
-    seq_lens: Optional[List[int]] = None  # For non-chunked prefill
-    # For chunked prefill only
-    max_query_len: Optional[int] = None
-    max_kv_len: Optional[int] = None
-    prefill_query_start_loc: Optional[torch.Tensor] = None
-    kv_start_loc: Optional[torch.Tensor] = None
-    prefill_block_tables: Optional[torch.Tensor] = None
-    # For V1 logits index only
-    query_start_loc: Optional[torch.Tensor] = None
-    # Begin encoder attn & enc/dec cross-attn fields...
-    # Encoder sequence lengths representation
-    encoder_seq_lens: Optional[List[int]] = None
-    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
-    # Maximum sequence length among encoder sequences
-    max_encoder_seq_len: Optional[int] = None
-    # Number of tokens input to encoder
-    num_encoder_tokens: Optional[int] = None
-    # Cross-attention memory-mapping data structures: slot mapping
-    # and block tables
-    cross_slot_mapping: Optional[torch.Tensor] = None
-    cross_block_tables: Optional[torch.Tensor] = None
-    def __post_init__(self):
-        # Set during the execution of the first attention op.
-        # It is a list because it is needed to set per prompt
-        # when alibi slopes is used. It is because of the limitation
-        # from xformer API.
-        # will not appear in the __repr__ and __init__
-        self.attn_bias: Optional[List[torch.Tensor]] = None
-        self.encoder_attn_bias: Optional[List[torch.Tensor]] = None
-        self.cross_attn_bias: Optional[List[torch.Tensor]] = None
-    @property
-    def is_all_encoder_attn_metadata_set(self):
-        '''
-        All attention metadata required for encoder attention is set.
-        '''
-        return ((self.encoder_seq_lens is not None)
-                and (self.encoder_seq_lens_tensor is not None)
-                and (self.max_encoder_seq_len is not None))
-    @property
-    def is_all_cross_attn_metadata_set(self):
-        '''
-        All attention metadata required for enc/dec cross-attention is set.
-        Superset of encoder attention required metadata.
-        '''
-        return (self.is_all_encoder_attn_metadata_set
-                and (self.cross_slot_mapping is not None)
-                and (self.cross_block_tables is not None))
-    @property
-    def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]:
-        if self.num_prefill_tokens == 0:
-            return None
-        return self
-    @property
-    def decode_metadata(self) -> Optional["TorchSDPAMetadata"]:
-        if self.num_decode_tokens == 0:
-            return None
-        return self
-    def get_seq_lens(
-        self,
-        attn_type: str,
-    ):
-        '''
-        Extract appropriate sequence lengths from attention metadata
-        according to attention type.
-        Arguments:
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-        Returns:
-        * Appropriate sequence lengths tensor for query
-        * Appropriate sequence lengths tensor for key & value
-        '''
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            seq_lens_q = self.seq_lens
-            seq_lens_kv = self.seq_lens
-        elif attn_type == AttentionType.ENCODER:
-            seq_lens_q = self.encoder_seq_lens
-            seq_lens_kv = self.encoder_seq_lens
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            seq_lens_q = self.seq_lens
-            seq_lens_kv = self.encoder_seq_lens
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-        return seq_lens_q, seq_lens_kv
-    def get_attn_bias(
-        self,
-        attn_type: str,
-    ) -> Optional[List[torch.Tensor]]:
-        '''
-        Extract appropriate attention bias from attention metadata
-        according to attention type.
-        Arguments:
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-        Returns:
-        * Appropriate attention bias value given the attention type
-        '''
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            return self.attn_bias
-        elif attn_type == AttentionType.ENCODER:
-            return self.encoder_attn_bias
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            return self.cross_attn_bias
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-    def set_attn_bias(
-        self,
-        attn_bias: List[torch.Tensor],
-        attn_type: str,
-    ) -> None:
-        '''
-        Update appropriate attention bias field of attention metadata,
-        according to attention type.
-        Arguments:
-        * attn_metadata: Attention metadata structure associated with attention
-        * attn_bias: The desired attention bias value
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-        '''
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            self.attn_bias = attn_bias
-        elif attn_type == AttentionType.ENCODER:
-            self.encoder_attn_bias = attn_bias
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            self.cross_attn_bias = attn_bias
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-    def get_seq_len_block_table_args(
-        self,
-        attn_type: str,
-    ) -> tuple:
-        '''
-        The particular choice of sequence-length- and block-table-related
-        attributes which should be extracted from attn_metadata is dependent
-        on the type of attention operation.
-        Decoder attn -> select entirely decoder self-attention-related fields
-        Encoder/decoder cross-attn -> select encoder sequence lengths &
-                                    cross-attn block-tables fields
-        Encoder attn -> select encoder sequence lengths fields & no block tables
-        Arguments:
-        * attn_metadata: Attention metadata structure associated with attention
-        * is_prompt: True if prefill, False otherwise
-        * attn_type: encoder attention, decoder self-attention,
-                    encoder/decoder cross-attention
-        Returns:
-        * Appropriate sequence-lengths tensor
-        * Appropriate max sequence-length scalar
-        * Appropriate block tables (or None)
-        '''
-        if (attn_type == AttentionType.DECODER
-                or attn_type == AttentionType.ENCODER_ONLY):
-            # Decoder self-attention
-            # Choose max_seq_len based on whether we are in prompt_run
-            return (self.seq_lens_tensor, self.max_decode_seq_len,
-                    self.block_tables)
-        elif attn_type == AttentionType.ENCODER_DECODER:
-            # Enc/dec cross-attention KVs match encoder sequence length;
-            # cross-attention utilizes special "cross" block tables
-            return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len,
-                    self.cross_block_tables)
-        elif attn_type == AttentionType.ENCODER:
-            # No block tables associated with encoder attention
-            return (self.encoder_seq_lens_tensor, self.max_encoder_seq_len,
-                    None)
-        else:
-            raise AttributeError(f"Invalid attention type {str(attn_type)}")
-class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
-    def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
-        self.chunked_prefill = input_builder.chunked_prefill
-        self.input_builder = input_builder
-    def prepare(self):
-        self.input_data = self.input_builder.input_data
-    def build(self, seq_lens: List[int], query_lens: List[int],
-              cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata:
-        input_data = self.input_data
-        prefill_seq_lens = seq_lens[0:input_data.num_prefills]
-        prefill_query_lens = query_lens[0:input_data.num_prefills]
-        slot_mapping = torch.tensor(input_data.slot_mapping,
-                                    dtype=torch.long,
-                                    device="cpu")
-        # For chunked-prefill
-        if self.chunked_prefill and input_data.num_prefill_tokens != 0:
-            prefill_block_tables = make_tensor_with_pad(
-                self.input_data.prefill_block_tables,
-                pad=0,
-                dtype=torch.int32,
-                device="cpu",
-            )
-            query_lens_tensor = torch.tensor(prefill_query_lens,
-                                             dtype=torch.int32,
-                                             device="cpu")
-            kv_lens_tensor = torch.tensor(prefill_seq_lens,
-                                          dtype=torch.int32,
-                                          device="cpu")
-            query_start_loc = torch.zeros(input_data.num_prefills + 1,
-                                          dtype=torch.int32,
-                                          device="cpu")
-            kv_start_loc = torch.zeros(input_data.num_prefills + 1,
-                                       dtype=torch.int32,
-                                       device="cpu")
-            torch.cumsum(query_lens_tensor,
-                         dim=0,
-                         dtype=torch.int32,
-                         out=query_start_loc[1:])
-            torch.cumsum(kv_lens_tensor,
-                         dim=0,
-                         dtype=torch.int32,
-                         out=kv_start_loc[1:])
-            max_query_len = max(prefill_query_lens)
-            max_kv_len = max(prefill_seq_lens)
-        else:
-            prefill_block_tables = None
-            query_start_loc = None
-            kv_start_loc = None
-            max_query_len = None
-            max_kv_len = None
-        # For paged attention
-        if input_data.num_decode_tokens != 0:
-            seq_lens_tensor = torch.tensor(
-                input_data.seq_lens[input_data.num_prefills:],
-                dtype=torch.int32,
-                device="cpu",
-            )
-            block_tables = make_tensor_with_pad(
-                self.input_data.decode_block_tables,
-                pad=0,
-                dtype=torch.int32,
-                device="cpu",
-            )
-        else:
-            block_tables = torch.tensor([])
-            seq_lens_tensor = torch.tensor(
-                input_data.seq_lens[:input_data.num_prefills],
-                dtype=torch.int32,
-                device="cpu",
-            )
-        # For multi-modal models
-        placeholder_index_maps = None
-        if len(input_data.multi_modal_inputs_list) != 0:
-            placeholder_index_maps = {
-                modality: placeholder_map.index_map()
-                for modality, placeholder_map in
-                input_data.multi_modal_placeholder_maps.items()
-            }
-        attn_metadata = TorchSDPAMetadata(
-            chunked_prefill=self.chunked_prefill,
-            seq_lens=prefill_seq_lens,
-            seq_lens_tensor=seq_lens_tensor,
-            max_query_len=max_query_len,
-            max_kv_len=max_kv_len,
-            prefill_query_start_loc=query_start_loc,
-            kv_start_loc=kv_start_loc,
-            max_decode_seq_len=input_data.max_decode_seq_len,
-            num_prefills=input_data.num_prefills,
-            num_prefill_tokens=input_data.num_prefill_tokens,
-            num_decode_tokens=input_data.num_decode_tokens,
-            block_tables=block_tables,
-            prefill_block_tables=prefill_block_tables,
-            slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=placeholder_index_maps,
-            enable_kv_scales_calculation=False,
-        )
-        return attn_metadata
-class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
-        if blocksparse_params is not None:
-            raise ValueError(
-                "Torch SPDA does not support block-sparse attention.")
-        if logits_soft_cap is not None:
-            logger.warning_once("Torch SPDA does not support logits soft cap. "
-                                "Outputs may be slightly off.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Torch SPDA is not supported yet, it will fall"
-                " back to global attention for long context.")
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.alibi_slopes = alibi_slopes
-        self.sliding_window = sliding_window
-        self.kv_cache_dtype = kv_cache_dtype
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.need_mask = (self.alibi_slopes is not None
-                          or self.sliding_window is not None)
-        supported_head_sizes = PagedAttention.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {supported_head_sizes}.")
-        if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex:
-            raise NotImplementedError(
-                "Torch SDPA backend FP8 KV cache requires "
-                "intel_extension_for_pytorch support.")
-        self.attn_type = attn_type
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: TorchSDPAMetadata,  # type: ignore
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with torch SDPA and PagedAttention.
-        Args:
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
-                NOTE: kv_cache will be an empty tensor with shape [0]
-                for profiling run.
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        if output_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for TorchSDPABackendImpl")
-        # For warming-up
-        if attn_metadata is None:
-            return query
-        attn_type = self.attn_type
-        if (attn_type == AttentionType.ENCODER
-                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
-            raise AttributeError("Encoder attention requires setting "
-                                 "encoder metadata attributes.")
-        elif (attn_type == AttentionType.ENCODER_DECODER
-              and (not attn_metadata.is_all_cross_attn_metadata_set)):
-            raise AttributeError("Encoder/decoder cross-attention "
-                                 "requires setting cross-attention "
-                                 "metadata attributes.")
-        # Reshape the query, key, and value tensors.
-        query = query.view(-1, self.num_heads, self.head_size)
-        if key is not None:
-            assert value is not None
-            key = key.view(-1, self.num_kv_heads, self.head_size)
-            value = value.view(-1, self.num_kv_heads, self.head_size)
-        else:
-            assert value is None
-        if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0):
-            # KV-cache during decoder-self- or
-            # encoder-decoder-cross-attention, but not
-            # during encoder attention.
-            #
-            # Even if there are no new key/value pairs to cache,
-            # we still need to break out key_cache and value_cache
-            # i.e. for later use by paged attention
-            key_cache, value_cache = PagedAttention.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-            if (key is not None) and (value is not None):
-                if attn_type == AttentionType.ENCODER_DECODER:
-                    # Update cross-attention KV cache (prefill-only)
-                    # During cross-attention decode, key & value will be None,
-                    # preventing this IF-statement branch from running
-                    updated_slot_mapping = attn_metadata.cross_slot_mapping
-                else:
-                    # Update self-attention KV cache (prefill/decode)
-                    updated_slot_mapping = attn_metadata.slot_mapping
-                PagedAttention.write_to_paged_cache(
-                    key, value, key_cache, value_cache, updated_slot_mapping,
-                    self.kv_cache_dtype, layer._k_scale, layer._v_scale)
-        if attn_type != AttentionType.ENCODER:
-            # Decoder self-attention supports chunked prefill.
-            # Encoder/decoder cross-attention requires no chunked
-            # prefill (100% prefill or 100% decode tokens, no mix)
-            num_prefill_tokens = attn_metadata.num_prefill_tokens
-            num_decode_tokens = attn_metadata.num_decode_tokens
-        else:
-            # Encoder attention - chunked prefill is not applicable;
-            # derive token-count from query shape & and treat them
-            # as 100% prefill tokens
-            assert attn_metadata.num_encoder_tokens is not None
-            num_prefill_tokens = attn_metadata.num_encoder_tokens
-            num_decode_tokens = 0
-        if attn_type == AttentionType.DECODER:
-            # Only enforce this shape-constraint for decoder
-            # self-attention
-            assert key.shape[0] == num_prefill_tokens + num_decode_tokens
-            assert value.shape[0] == num_prefill_tokens + num_decode_tokens
-        output = torch.empty_like(query)
-        if prefill_meta := attn_metadata.prefill_metadata:
-            if not prefill_meta.prefill_metadata.chunked_prefill:  # type: ignore
-                assert attn_metadata.seq_lens is not None
-                self._run_sdpa_forward(output,
-                                       query,
-                                       key,
-                                       value,
-                                       prefill_meta,
-                                       attn_type=attn_type)
-            else:
-                # prefix-enabled attention
-                assert not self.need_mask
-                import intel_extension_for_pytorch.llm.modules as ipex_modules
-                output = torch.empty_like(query)
-                ipex_modules.PagedAttention.flash_attn_varlen_func(
-                    output[:prefill_meta.num_prefill_tokens, :, :],
-                    query[:prefill_meta.num_prefill_tokens, :, :],
-                    key_cache,
-                    value_cache,
-                    prefill_meta.prefill_query_start_loc,
-                    prefill_meta.kv_start_loc,
-                    prefill_meta.max_query_len,
-                    prefill_meta.max_kv_len,
-                    self.scale,
-                    True,
-                    prefill_meta.prefill_block_tables,
-                    self.alibi_slopes,
-                )
-        if decode_meta := attn_metadata.decode_metadata:
-            assert attn_type != AttentionType.ENCODER_ONLY, (
-                "Encoder-only models should not have decode metadata.")
-            # Decoding run.
-            (
-                seq_lens_arg,
-                max_seq_len_arg,
-                block_tables_arg,
-            ) = decode_meta.get_seq_len_block_table_args(attn_type)
-            PagedAttention.forward_decode(
-                output[attn_metadata.num_prefill_tokens:, :, :],
-                query[attn_metadata.num_prefill_tokens:, :, :],
-                key_cache,
-                value_cache,
-                block_tables_arg,
-                seq_lens_arg,
-                max_seq_len_arg,
-                self.kv_cache_dtype,
-                self.num_kv_heads,
-                self.scale,
-                self.alibi_slopes,
-                layer._k_scale,
-                layer._v_scale,
-            )
-        # Reshape the output tensor.
-        return output.view(-1, self.num_heads * self.head_size)
-    def _run_sdpa_forward(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        attn_metadata: TorchSDPAMetadata,
-        attn_type: str = AttentionType.DECODER,
-    ) -> None:
-        if self.num_kv_heads != self.num_heads:
-            key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
-            value = value.repeat_interleave(self.num_queries_per_kv, dim=1)
-        attn_masks = attn_metadata.get_attn_bias(attn_type)
-        if attn_masks is None:
-            if self.alibi_slopes is not None:
-                attn_masks = _make_alibi_bias(
-                    self.alibi_slopes, query.dtype,
-                    attn_metadata.seq_lens)  # type: ignore
-            elif self.sliding_window is not None:
-                assert attn_metadata.seq_lens is not None
-                attn_masks = _make_sliding_window_bias(
-                    attn_metadata.seq_lens, self.sliding_window,
-                    query.dtype)  # type: ignore
-            else:
-                seq_lens, _ = attn_metadata.get_seq_lens(attn_type)
-                attn_masks = [None] * len(seq_lens)
-            attn_metadata.set_attn_bias(attn_masks, attn_type)
-        query = query.movedim(0, query.dim() - 2)
-        key = key.movedim(0, key.dim() - 2)
-        value = value.movedim(0, value.dim() - 2)
-        causal_attn = (attn_type == AttentionType.DECODER)
-        seq_lens_q, seq_lens_kv = attn_metadata.get_seq_lens(attn_type)
-        start_q, start_kv = 0, 0
-        for seq_len_q, seq_len_kv, mask in zip(seq_lens_q, seq_lens_kv,
-                                               attn_masks):
-            end_q = start_q + seq_len_q
-            end_kv = start_kv + seq_len_kv
-            sub_out = scaled_dot_product_attention(
-                query[None, :, start_q:end_q, :],
-                key[None, :, start_kv:end_kv, :],
-                value[None, :, start_kv:end_kv, :],
-                attn_mask=mask,
-                dropout_p=0.0,
-                is_causal=causal_attn and mask is None,
-                scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0)
-            output[start_q:end_q, :, :] = sub_out
-            start_q, start_kv = end_q, end_kv
-def _make_alibi_bias(
-    alibi_slopes: torch.Tensor,
-    dtype: torch.dtype,
-    seq_lens: List[int],
-) -> List[torch.Tensor]:
-    attn_biases: List[torch.Tensor] = []
-    for seq_len in seq_lens:
-        bias = torch.arange(seq_len, dtype=dtype)
-        # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(seq_len, 1)`
-        # here. We find that both biases give the same results, but
-        # the bias below more accurately follows the original ALiBi
-        # paper.
-        bias = bias[None, :] - bias[:, None]
-        num_heads = alibi_slopes.shape[0]
-        bias = bias[None, :].repeat((num_heads, 1, 1))
-        bias.mul_(alibi_slopes[:, None, None]).unsqueeze_(0)
-        inf_mask = torch.empty(
-            (1, seq_len, seq_len),
-            dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)
-        attn_biases.append((bias + inf_mask).to(dtype))
-    return attn_biases
-def _make_sliding_window_bias(
-    seq_lens: List[int],
-    window_size: Optional[int],
-    dtype: torch.dtype,
-) -> List[torch.Tensor]:
-    attn_biases: List[torch.Tensor] = []
-    for seq_len in seq_lens:
-        tensor = torch.full(
-            (1, seq_len, seq_len),
-            dtype=dtype,
-            fill_value=1,
-        )
-        shift = 0
-        mask = torch.tril(tensor, diagonal=shift).to(dtype)  # type: ignore
-        if window_size is not None:
-            mask = torch.triu(mask, diagonal=shift - window_size + 1)
-        mask = torch.log(mask)
-        attn_biases.append(mask.to(dtype))
-    return attn_biases