"""Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type import numpy as np import torch import triton import triton.language as tl from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.envs import VLLM_FLASH_ATTN_VERSION from vllm.platforms import current_platform from vllm.utils import cdiv from vllm.vllm_flash_attn import (flash_attn_varlen_func, is_fa_version_supported) class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod def get_supported_head_sizes() -> List[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod def get_name() -> str: return "FLASH_ATTN_VLLM_V1" @staticmethod def get_impl_cls() -> Type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod def get_metadata_cls() -> Type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod def get_kv_cache_shape( num_blocks: int, block_size: int, num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: return use_cascade_attention(*args, **kwargs) @dataclass class FlashAttentionMetadata: # NOTE(sang): Definition of context_len, query_len, and seq_len. # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| # |-------------------- seq_len ---------------------| # |-- query_len ---| num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor max_seq_len: int seq_lens: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor # For cascade attention. use_cascade: bool common_prefix_len: int cu_prefix_query_lens: Optional[torch.Tensor] prefix_kv_lens: Optional[torch.Tensor] suffix_kv_lens: Optional[torch.Tensor] # For logging. num_input_tokens: int = 0 # Number of tokens including padding. class FlashAttentionImpl(AttentionImpl): def __init__( self, num_heads: int, head_size: int, scale: float, num_kv_heads: int, alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( "FlashAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_kv_heads if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes if sliding_window is None: self.sliding_window = (-1, -1) else: self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() if head_size not in support_head_sizes: raise ValueError( f"Head size {head_size} is not supported by FlashAttention. " f"Supported head sizes are: {support_head_sizes}.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") # if hopper default to FA3, otherwise stick to FA2 for now # TODO(lucas): profile FA3 on ampere to see if it makes sense to # use FA3 as default for both if current_platform.get_device_capability()[0] >= 9: self.fa_version = 3 if is_fa_version_supported(3) else 2 else: self.fa_version = 2 if VLLM_FLASH_ATTN_VERSION is not None: assert VLLM_FLASH_ATTN_VERSION in [2, 3] self.fa_version = VLLM_FLASH_ATTN_VERSION assert is_fa_version_supported(self.fa_version) def forward( self, layer: torch.nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] """ assert output is not None, "Output tensor must be provided." if attn_metadata is None: # Profiling run. return output # IMPORTANT! # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead # in this method. For example, `view` and `slice` (or `[:n]`) operations # are surprisingly slow even in the case they do not invoke any GPU ops. # Minimize the PyTorch ops in this method as much as possible. # Whenever making a change in this method, please benchmark the # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens # Reshape the input keys and values and store them in the cache. # NOTE(woosuk): Here, key and value are padded while slot_mapping is # not padded. However, we don't need to do key[:num_actual_tokens] and # value[:num_actual_tokens] because the reshape_and_cache_flash op uses # the slot_mapping's shape to determine the number of actual tokens. key_cache, value_cache = kv_cache.unbind(0) torch.ops._C_cache_ops.reshape_and_cache_flash( key, value, key_cache, value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, layer._k_scale, layer._v_scale, ) # Compute attention and update output up to `num_actual_tokens`. if not attn_metadata.use_cascade: # Regular attention (common case). flash_attn_varlen_func( q=query[:num_actual_tokens], k=key_cache, v=value_cache, out=output[:num_actual_tokens], cu_seqlens_q=attn_metadata.query_start_loc, max_seqlen_q=attn_metadata.max_query_len, seqused_k=attn_metadata.seq_lens, max_seqlen_k=attn_metadata.max_seq_len, softmax_scale=self.scale, causal=True, alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=attn_metadata.block_table, softcap=self.logits_soft_cap, fa_version=self.fa_version, ) return output # Cascade attention (rare case). cascade_attention( output[:num_actual_tokens], query[:num_actual_tokens], key_cache, value_cache, cu_query_lens=attn_metadata.query_start_loc, max_query_len=attn_metadata.max_query_len, cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, prefix_kv_lens=attn_metadata.prefix_kv_lens, suffix_kv_lens=attn_metadata.suffix_kv_lens, max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, alibi_slopes=self.alibi_slopes, sliding_window=self.sliding_window, logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, fa_version=self.fa_version, ) return output def use_cascade_attention( common_prefix_len: int, query_lens: np.ndarray, num_query_heads: int, num_kv_heads: int, use_alibi: bool, use_sliding_window: bool, num_sms: int, ) -> bool: """Decide whether to use cascade attention. This function 1) checks whether cascade attention is supported with the given configuration, and 2) heuristically decides whether using cascade attention can improve performance. """ # Too short common prefix. Probably not worth using cascade attention. # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. # NOTE(woosuk): This is the common case. We should return False as soon as # possible to avoid any unnecessary computation. if common_prefix_len < 256: return False # Cascade attention is currently not supported with these variants. if use_alibi or use_sliding_window: return False # Too few queries. Probably not worth using cascade attention. # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. num_reqs = len(query_lens) if num_reqs < 8: return False # Heuristics to decide whether using cascade attention is beneficial. # 1. When FlashDecoding is not used for normal attention, cascade attention # is likely to be faster since it saves memory bandwidth. num_queries_per_kv = num_query_heads // num_kv_heads # The criteria for using FlashDecoding can be found in the following link: # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535 use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window and not use_alibi and np.all(query_lens == 1)) if not use_flash_decoding: # Use cascade attention. return True # 2. When FlashDecoding is used for normal attention, it is not clear # whether cascade attention is beneficial, because FlashDecoding can # launch more CTAs than cascade attention. # We use a simple performance model to compare the two methods. # NOTE(woosuk): The performance model is very rough and may not be # accurate. num_tokens = num_reqs # NOTE(woosuk): These are default tile sizes. flash-attn might use # different tile sizes (e.g., 64 or 256) depending on the configuration. q_tile_size = 128 kv_tile_size = 128 num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size) cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size) cascade_waves = cdiv(cascade_ctas, num_sms) cascade_time = cascade_waves * num_prefix_tiles flash_decoding_ctas = (num_reqs * num_kv_heads * cdiv(num_queries_per_kv, q_tile_size)) flash_decoding_ctas *= num_prefix_tiles flash_decoding_time = cdiv(flash_decoding_ctas, num_sms) # Use cascade attention if it is faster than FlashDecoding. return cascade_time < flash_decoding_time def cascade_attention( output: torch.Tensor, query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, cu_query_lens: torch.Tensor, max_query_len: int, cu_prefix_query_lens: torch.Tensor, prefix_kv_lens: torch.Tensor, suffix_kv_lens: torch.Tensor, max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], sliding_window: Tuple[int, int], logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, fa_version: int, ) -> torch.Tensor: assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") # TODO: Support sliding window. assert sliding_window == (-1, -1), ( "Cascade attention does not support sliding window.") num_tokens = query.shape[0] block_size = key_cache.shape[-3] assert common_prefix_len % block_size == 0 num_common_kv_blocks = common_prefix_len // block_size assert num_common_kv_blocks > 0 # Process shared prefix. prefix_output, prefix_lse = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, cu_seqlens_q=cu_prefix_query_lens, seqused_k=prefix_kv_lens, max_seqlen_q=num_tokens, max_seqlen_k=common_prefix_len, softmax_scale=softmax_scale, causal=False, window_size=sliding_window, block_table=block_table[:1], softcap=logits_soft_cap, return_softmax_lse=True, fa_version=fa_version, ) # Process suffix per query. suffix_output, suffix_lse = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, seqused_k=suffix_kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len - common_prefix_len, softmax_scale=softmax_scale, causal=True, window_size=sliding_window, block_table=block_table[:, num_common_kv_blocks:], softcap=logits_soft_cap, return_softmax_lse=True, fa_version=fa_version, ) # Merge prefix and suffix outputs, and store the result in output. merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse) def merge_attn_states( output: torch.Tensor, prefix_output: torch.Tensor, prefix_lse: torch.Tensor, suffix_output: torch.Tensor, suffix_lse: torch.Tensor, ) -> None: num_tokens = output.shape[0] num_query_heads = output.shape[1] head_size = output.shape[2] padded_head_size = triton.next_power_of_2(head_size) # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. merge_attn_states_kernel[(num_tokens, num_query_heads)]( output, prefix_output, prefix_lse, suffix_output, suffix_lse, head_size, padded_head_size, ) @triton.jit def merge_attn_states_kernel( output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] prefix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] prefix_lse, # [NUM_HEADS, NUM_TOKENS] suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] suffix_lse, # [NUM_HEADS, NUM_TOKENS] HEAD_SIZE: tl.constexpr, PADDED_HEAD_SIZE: tl.constexpr, ): token_idx = tl.program_id(0) num_tokens = tl.num_programs(0) head_idx = tl.program_id(1) num_heads = tl.num_programs(1) p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) max_lse = tl.maximum(p_lse, s_lse) p_lse = p_lse - max_lse s_lse = s_lse - max_lse head_arange = tl.arange(0, PADDED_HEAD_SIZE) head_mask = head_arange < HEAD_SIZE p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange, mask=head_mask) s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange, mask=head_mask) # NOTE(woosuk): Be careful with the numerical stability. # We should compute the scale first, and then multiply it with the output. # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly. p_scale = tl.exp(p_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) s_scale = tl.exp(s_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) out = p_out * p_scale + s_out * s_scale tl.store(output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange, out, mask=head_mask)