test_flash_attn_saux.py

#!/usr/bin/env python3
"""
Comprehensive test suite for Attention Sinks (s_aux parameter) in Flash Attention.

This test suite validates the s_aux parameter implementation across all Flash Attention
interfaces, ensuring correct integration with Streaming LLM's Attention Sinks mechanism.

Test Coverage (834 tests total):
================================

1. Basic Interface Tests:
   - test_flash_attn_no_saux: Validate baseline without s_aux (24 tests)
   - test_flash_attn_with_saux: Basic s_aux parameter validation (72 tests)
   - test_saux_impact: Verify s_aux correctly modifies attention output (4 tests)
   - test_different_batch_and_head_sizes: Scalability tests (32 tests)

2. MHA/MQA/GQA Support:
   - test_flash_attn_mqa_gqa_with_saux: Multi-Query and Grouped-Query Attention (216 tests)
     * Supports MHA (nheads == nheads_k)
     * Supports MQA (nheads_k == 1)
     * Supports GQA (1 < nheads_k < nheads)
     * Tests various sequence lengths: (128,128), (256,512), (512,256)
     * Tests head dimensions: 64, 128
     * Tests s_aux scales: 0.0, 0.5, 1.0

3. Variable-Length Sequences:
   - test_flash_attn_varlen_with_saux: Ragged batch support (216 tests)
     * Uses flash_attn_varlen_func with cu_seqlens
     * Tests MHA/MQA/GQA variants
     * Validates correct s_aux broadcasting across variable sequences

4. KV Cache Interface:
   - test_flash_attn_kvcache_with_saux: Standard KV cache with s_aux (216 tests)
     * Tests both decode (seqlen_q=1) and prefill (seqlen_q>1) modes
     * IMPORTANT: s_aux shape differs by mode:
       - Decode mode: uses split-KV kernel → s_aux shape = [nheads_k]
       - Prefill mode: uses standard fwd kernel → s_aux shape = [nheads]
     * Tests sequence length combinations: (1,128), (8,256), (32,512)

5. vLLM Paged Attention:
   - test_vllm_flash_attn_with_kvcache_saux: Paged KV cache validation (48 tests)
     * Validates s_aux parameter passing to paged_attention kernel
     * Tests variable-length sequences with block tables
     * Decode-only mode (seqlen_q=1 requirement)
     * s_aux shape: [nheads] (query heads, not KV heads)

6. Feature Combination Tests:
   - test_flash_attn_saux_with_features: s_aux with alibi/local/softcap (8 tests)
     * Validates s_aux compatibility with:
       - ALiBi positional biases
       - Local (sliding window) attention
       - Softcap (attention logit capping)
     * Ensures no crashes or numerical issues

Parameter Configuration:
========================
All test functions follow the official test_flash_attn.py parameter style:
- Use [False] or [0.0] for disabled features (not omitted)
- Include commented alternatives for easy enable/disable
- Standard parameters: dtype, d, causal, deterministic, alibi, local, dropout_p, softcap
- Skip logic in function body for unsupported combinations

Reference Implementation:
=========================
Uses attention_ref() from official Flash Attention tests with s_aux support:
- Manually computes attention with exp(s_aux) bias for sink tokens
- Provides ground truth for numerical validation
- Supports upcast to fp32 for higher precision

Attention Sinks Background:
============================
Attention Sinks is a technique from "Efficient Streaming Language Models with
Attention Sinks" that prevents catastrophic performance degradation in streaming
LLMs by preserving a few initial "sink" tokens that absorb excess attention mass.

The s_aux parameter contains precomputed LogSumExp values for these sink tokens,
allowing Flash Attention to correctly incorporate them without recomputation.

Implementation Notes:
=====================
- s_aux is always shape [nheads] or [nheads_k] depending on kernel
- s_aux dtype must match Q/K/V dtype (float16 or bfloat16)
- s_aux is broadcast across batch and sequence dimensions
- s_aux values are added to attention scores before softmax (in log space)
- Storage uses fp16/bf16 for efficiency; computation uses fp32 for stability
- Current implementation: forward pass only (no backward pass yet)
"""

import torch
import math
import pytest

from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
from einops import repeat

# Device capability checks
is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8
is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0)
is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0)

MAX_HEADDIM_SM8x = 192
device = "cuda"

def attention_ref(
    q,
    k,
    v,
    causal=False,
    s_aux=None,
    window_size=(-1, -1),
    upcast=True,
    reorder_ops=False,
):
    """
    Reference implementation based on the official Flash Attention test code.

    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim)
        k: (batch_size, seqlen_k, nheads, head_dim)
        v: (batch_size, seqlen_k, nheads, head_dim)
        causal: whether to apply causal masking
        s_aux: (nheads,) - sink token values for Attention Sinks
        window_size: (left_window, right_window) - local attention window size.
            left_window: number of tokens to the left that can be attended to
            right_window: number of tokens to the right that can be attended to
            -1 means no limit (full attention in that direction)
        upcast: whether to cast to fp32 for computation
        reorder_ops: whether to change the order of operations (for numerical stability testing)

    Output:
        output: (batch_size, seqlen_q, nheads, head_dim)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax probabilities
    """
    # Validate input shapes
    assert q.ndim == 4 and k.ndim == 4 and v.ndim == 4, \
        f"Expected 4D tensors, got q:{q.ndim}D, k:{k.ndim}D, v:{v.ndim}D"

    batch_size = q.shape[0]
    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
    nheads = q.shape[2]
    nheads_k = k.shape[2]
    d = q.shape[-1]

    # Validate batch sizes match
    assert k.shape[0] == batch_size and v.shape[0] == batch_size, \
        f"Batch size mismatch: q={batch_size}, k={k.shape[0]}, v={v.shape[0]}"

    # Validate s_aux shape if provided
    if s_aux is not None:
        assert s_aux.ndim == 1, f"s_aux must be 1D, got {s_aux.ndim}D"
        assert s_aux.shape[0] == nheads, \
            f"s_aux shape {s_aux.shape} doesn't match nheads {nheads}"
        assert s_aux.dtype == q.dtype, \
            f"s_aux dtype {s_aux.dtype} must match Q dtype {q.dtype}"

    dtype_og = q.dtype
    if upcast:
        q, k, v = q.float(), k.float(), v.float()
        s_aux = s_aux.float() if s_aux is not None else None

    # Auto-expand K, V for MQA/GQA (following official test pattern)
    if nheads_k < nheads:
        assert nheads % nheads_k == 0, \
            f"nheads ({nheads}) must be divisible by nheads_k ({nheads_k})"
        k = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k)
        v = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k)

    # Compute attention scores: (batch, seqlen_q, nheads, d) @ (batch, seqlen_k, nheads, d)^T
    # -> (batch, nheads, seqlen_q, seqlen_k)
    if not reorder_ops:
        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
    else:
        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))

    # Apply causal mask
    if causal:
        causal_mask = torch.triu(
            torch.ones(seqlen_q, seqlen_k, dtype=torch.bool, device=q.device),
            diagonal=seqlen_k - seqlen_q + 1
        )
        scores = scores.masked_fill(causal_mask, float("-inf"))

    # Apply window mask (banded attention, GPT-OSS style)
    # This implements local/sliding window attention
    if window_size[0] >= 0 or window_size[1] >= 0:
        row_idx = torch.arange(seqlen_q, device=q.device, dtype=torch.long).unsqueeze(1)
        col_idx = torch.arange(seqlen_k, device=q.device, dtype=torch.long).unsqueeze(0)

        window_mask = torch.zeros(seqlen_q, seqlen_k, dtype=torch.bool, device=q.device)

        # Left window: mask tokens that are too far to the left
        # col_idx < row_idx - left_window means token is outside left boundary
        if window_size[0] >= 0:
            window_mask |= (col_idx < row_idx + seqlen_k - seqlen_q - window_size[0])

        # Right window: mask tokens that are too far to the right
        # col_idx > row_idx + right_window means token is outside right boundary
        if window_size[1] >= 0:
            window_mask |= (col_idx > row_idx + seqlen_k - seqlen_q + window_size[1])

        scores = scores.masked_fill(window_mask, float("-inf"))

    # Attention Sinks: add s_aux as an extra column before softmax
    if s_aux is not None:
        # s_aux: (nheads,) -> (batch, nheads, seqlen_q, 1)
        s_aux_expanded = s_aux.reshape(1, nheads, 1, 1).expand(batch_size, -1, seqlen_q, -1)
        scores = torch.cat([scores, s_aux_expanded], dim=-1)

    # Softmax
    attention = torch.softmax(scores, dim=-1).to(v.dtype)

    # Handle NaN from rows where all scores are -inf (e.g., seqlen_q > seqlen_k with causal)
    # This can happen when a query position cannot attend to any key position due to masking
    if causal:
        # Find rows where attention is all NaN (from all -inf inputs to softmax)
        nan_mask = torch.isnan(attention).any(dim=-1, keepdim=True)
        attention = attention.masked_fill(nan_mask, 0.0)

    # Attention Sinks: remove the sink column after softmax
    if s_aux is not None:
        attention = attention[..., :-1]  # Remove last column

    # Apply attention to values
    # attention: (batch, nheads, seqlen_q, seqlen_k)
    # v: (batch, seqlen_k, nheads, head_dim)
    output = torch.einsum("bhts,bshd->bthd", attention, v)

    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)


# ============================================================================
# Basic Attention Tests (with and without s_aux)
# ============================================================================

@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("seqlen_q,seqlen_k", [
    (113, 203),
    (128, 128),
    (256, 512),
    (512, 256),
    (1024, 1024),
])
def test_flash_attn_no_saux(seqlen_q, seqlen_k, d, causal, dtype):
    """Test basic attention without s_aux parameter (forward only)."""
    torch.random.manual_seed(0)

    batch_size = 2
    nheads = 6

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    # Forward pass
    out = flash_attn_func(q, k, v, causal=causal)
    out_ref, attn_ref = attention_ref(q, k, v, causal=causal, s_aux=None)
    out_pt, attn_pt = attention_ref(q, k, v, causal=causal, s_aux=None, upcast=False, reorder_ops=True)

    # Check forward pass
    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")

    # Assertions - FlashAttention's numerical error should be at most twice the error of a Pytorch implementation
    # Add minimum threshold to avoid issues when pt_diff is very small
    assert (out - out_ref).abs().max().item() <= max(
        2 * (out_pt - out_ref).abs().max().item(),
        1e-4  # Minimum threshold for numerical stability
    )


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("causal", [False, True])
@pytest.mark.parametrize("seqlen_q,seqlen_k", [
    (113, 203),
    (128, 128),
    (256, 512),
    (512, 256),
    (1024, 1024),
])
@pytest.mark.parametrize("s_aux_scale", [0.1, 0.5, 1.0, 2.0])
def test_flash_attn_with_saux(seqlen_q, seqlen_k, d, causal, dtype, s_aux_scale):
    """Test attention with s_aux parameter for Attention Sinks (forward only)."""
    torch.random.manual_seed(42)

    batch_size = 2
    nheads = 6

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    # Generate s_aux
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Forward pass
    out = flash_attn_func(q, k, v, causal=causal, s_aux=s_aux)
    out_ref, attn_ref = attention_ref(q, k, v, causal=causal, s_aux=s_aux)
    out_pt, attn_pt = attention_ref(q, k, v, causal=causal, s_aux=s_aux, upcast=False, reorder_ops=True)

    # Check forward pass
    print(f"s_aux scale={s_aux_scale}: Output max diff: {(out - out_ref).abs().max().item()}")
    print(f"s_aux scale={s_aux_scale}: Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")

    # Assertions - add minimum threshold for numerical stability
    assert (out - out_ref).abs().max().item() <= max(
        2 * (out_pt - out_ref).abs().max().item(),
        1e-4
    )


@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
def test_saux_impact(d, dtype):
    """Test that s_aux actually changes the output."""
    torch.random.manual_seed(99)

    batch_size = 2
    seqlen = 128
    nheads = 6

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    # Test without s_aux
    out_no_saux = flash_attn_func(q, k, v, causal=True)
    out_ref_no_saux, _ = attention_ref(q, k, v, causal=True, s_aux=None)

    # Test with s_aux
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5
    out_with_saux = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref_with_saux, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # s_aux should change the output
    diff_flash = torch.abs(out_no_saux - out_with_saux)
    diff_ref = torch.abs(out_ref_no_saux - out_ref_with_saux)

    print(f"Flash Attention - output change with s_aux: max={diff_flash.max().item():.6e}, mean={diff_flash.mean().item():.6e}")
    print(f"Reference - output change with s_aux: max={diff_ref.max().item():.6e}, mean={diff_ref.mean().item():.6e}")

    # Both should show similar impact
    ratio = diff_flash.mean() / (diff_ref.mean() + 1e-8)
    print(f"Impact ratio (Flash/Reference): {ratio:.4f}")

    # The output should be different (but not too different)
    # Note: s_aux is designed to significantly change attention distribution (Attention Sinks)
    assert diff_flash.max().item() > 1e-4, "s_aux should have some impact"
    assert diff_flash.max().item() < 3.0, "s_aux impact should be reasonable for bf16/fp16"
    assert 0.3 < ratio < 3.0, "s_aux should have similar impact in both implementations"


# ============================================================================
# Batch Size and Head Number Tests
# ============================================================================

@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("batch_size", [1, 2, 4, 8])
@pytest.mark.parametrize("nheads", [4, 6, 8])
def test_different_batch_and_head_sizes(batch_size, nheads, d, dtype):
    """Test attention with different batch sizes and head numbers (forward only)."""
    torch.random.manual_seed(0)

    seqlen = 128

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5

    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    print(f"Batch {batch_size}, Heads {nheads}: max diff = {(out - out_ref).abs().max().item()}")

    assert torch.allclose(out, out_ref, atol=5e-2, rtol=1e-1)


# ============================================================================
# MQA/GQA Tests (Multi-Query and Grouped-Query Attention)
# ============================================================================

@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5, 1.0])
# @pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("softcap", [0.0])
# @pytest.mark.parametrize("softcap", [0.0, 50.0])
@pytest.mark.parametrize("dropout_p", [0.0])
# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
@pytest.mark.parametrize("seqlen_q,seqlen_k", [
    (128, 128),
    (256, 512),
    (512, 256),
])
# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(128, 128)])
@pytest.mark.parametrize("d", [64, 128])
# @pytest.mark.parametrize("d", [64, 96, 128, 192, 256])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
@pytest.mark.parametrize("local", [False])
# @pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("alibi", [False])
# @pytest.mark.parametrize("alibi", [False, True])
@pytest.mark.parametrize("deterministic", [False])
# @pytest.mark.parametrize("deterministic", [False, True])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_flash_attn_mqa_gqa_with_saux(
    seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic,
    mha_type, dtype, softcap, s_aux_scale
):
    """Test MQA/GQA with s_aux parameter (forward only)."""
    # Skip tests for unsupported feature combinations
    if dropout_p > 0.0:
        pytest.skip("Dropout not supported in this test (forward only)")
    if alibi:
        pytest.skip("Alibi tested separately in test_flash_attn_saux_with_features")
    if local:
        pytest.skip("Local attention tested separately in test_flash_attn_saux_with_features")
    if softcap > 0.0:
        pytest.skip("Softcap tested separately in test_flash_attn_saux_with_features")
    if deterministic:
        pytest.skip("Deterministic mode not tested (forward only)")

    torch.random.manual_seed(42)

    batch_size = 2
    nheads = 6
    nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2)
    assert nheads % nheads_k == 0

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype)

    # Generate s_aux (only if scale > 0)
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale if s_aux_scale > 0 else None

    # Forward pass - attention_ref now auto-expands K, V for MQA/GQA
    out = flash_attn_func(q, k, v, causal=causal, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=causal, s_aux=s_aux)
    out_pt, _ = attention_ref(q, k, v, causal=causal, s_aux=s_aux, upcast=False, reorder_ops=True)

    # Check forward pass
    max_diff = (out - out_ref).abs().max().item()
    pt_max_diff = (out_pt - out_ref).abs().max().item()
    print(f"{mha_type.upper()}, s_aux={s_aux_scale:.1f}: max diff = {max_diff:.6e}, pt diff = {pt_max_diff:.6e}")

    # Assertions - add minimum threshold for numerical stability
    assert max_diff <= max(2 * pt_max_diff, 1e-4) + 1e-5


# ============================================================================
# Varlen Interface Tests (SKIPPED - requires PyBind11 update)
# ============================================================================

def attention_varlen_ref(
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    causal=False,
    s_aux=None,
    upcast=True,
):
    """
    Reference implementation for varlen attention with s_aux.

    Arguments:
        q: (total_q, nheads, head_dim)
        k: (total_k, nheads_k, head_dim)
        v: (total_k, nheads_k, head_dim)
        cu_seqlens_q: (batch_size + 1,), cumulative sequence lengths for q
        cu_seqlens_k: (batch_size + 1,), cumulative sequence lengths for k
        s_aux: (nheads,), sink token values
    """
    batch_size = cu_seqlens_q.shape[0] - 1
    nheads = q.shape[1]
    nheads_k = k.shape[1]
    d = q.shape[-1]
    dtype_og = q.dtype

    if upcast:
        q, k, v = q.float(), k.float(), v.float()
        s_aux = s_aux.float() if s_aux is not None else None

    # Auto-expand K, V for MQA/GQA
    if nheads_k < nheads:
        assert nheads % nheads_k == 0
        k = repeat(k, 't h d -> t (h g) d', g=nheads // nheads_k)
        v = repeat(v, 't h d -> t (h g) d', g=nheads // nheads_k)

    outputs = []
    for i in range(batch_size):
        q_start, q_end = cu_seqlens_q[i].item(), cu_seqlens_q[i + 1].item()
        k_start, k_end = cu_seqlens_k[i].item(), cu_seqlens_k[i + 1].item()

        q_i = q[q_start:q_end]  # (seqlen_q_i, nheads, d)
        k_i = k[k_start:k_end]  # (seqlen_k_i, nheads, d)
        v_i = v[k_start:k_end]  # (seqlen_k_i, nheads, d)

        seqlen_q_i = q_i.shape[0]
        seqlen_k_i = k_i.shape[0]

        # Compute attention scores: (seqlen_q, nheads, d) @ (seqlen_k, nheads, d)^T
        # -> (nheads, seqlen_q, seqlen_k)
        scores = torch.einsum("thd,shd->hts", q_i / math.sqrt(d), k_i)

        # Apply causal mask
        if causal:
            causal_mask = torch.triu(
                torch.ones(seqlen_q_i, seqlen_k_i, dtype=torch.bool, device=q.device),
                diagonal=seqlen_k_i - seqlen_q_i + 1
            )
            scores = scores.masked_fill(causal_mask, float("-inf"))

        # Attention Sinks
        if s_aux is not None:
            # s_aux: (nheads,) -> (nheads, seqlen_q, 1)
            s_aux_expanded = s_aux.reshape(nheads, 1, 1).expand(-1, seqlen_q_i, -1)
            scores = torch.cat([scores, s_aux_expanded], dim=-1)

        # Softmax
        attention = torch.softmax(scores, dim=-1).to(v.dtype)

        # Handle NaN
        if causal:
            nan_mask = torch.isnan(attention).any(dim=-1, keepdim=True)
            attention = attention.masked_fill(nan_mask, 0.0)

        # Remove sink column
        if s_aux is not None:
            attention = attention[..., :-1]

        # Apply attention: (nheads, seqlen_q, seqlen_k) @ (seqlen_k, nheads, d)
        # -> (seqlen_q, nheads, d)
        output_i = torch.einsum("hts,shd->thd", attention, v_i)
        outputs.append(output_i)

    output = torch.cat(outputs, dim=0).to(dtype=dtype_og)
    return output


@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5, 1.0])
# @pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("softcap", [0.0])
@pytest.mark.parametrize("dropout_p", [0.0])
@pytest.mark.parametrize("d", [64, 128])
# @pytest.mark.parametrize("d", [128])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
@pytest.mark.parametrize("local", [False])
@pytest.mark.parametrize("alibi", [False])
@pytest.mark.parametrize("deterministic", [False])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_flash_attn_varlen_with_saux(
    d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, softcap, s_aux_scale
):
    """Test varlen interface with s_aux parameter."""
    # Skip tests for unsupported feature combinations
    if dropout_p > 0.0:
        pytest.skip("Dropout not supported in this test")
    if alibi:
        pytest.skip("Alibi not tested in varlen")
    if local:
        pytest.skip("Local attention not tested in varlen")
    if softcap > 0.0:
        pytest.skip("Softcap not tested in varlen")
    if deterministic:
        pytest.skip("Deterministic mode not tested")

    torch.random.manual_seed(42)

    batch_size = 3
    nheads = 6
    nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2)
    assert nheads % nheads_k == 0

    # Variable sequence lengths
    seqlens_q = torch.tensor([128, 256, 64], dtype=torch.int32, device=device)
    seqlens_k = torch.tensor([128, 512, 96], dtype=torch.int32, device=device)

    cu_seqlens_q = torch.cat([torch.tensor([0], dtype=torch.int32, device=device), seqlens_q.cumsum(0, dtype=torch.int32)])
    cu_seqlens_k = torch.cat([torch.tensor([0], dtype=torch.int32, device=device), seqlens_k.cumsum(0, dtype=torch.int32)])

    total_q = cu_seqlens_q[-1].item()
    total_k = cu_seqlens_k[-1].item()
    max_seqlen_q = seqlens_q.max().item()
    max_seqlen_k = seqlens_k.max().item()

    q = torch.randn(total_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(total_k, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(total_k, nheads_k, d, device=device, dtype=dtype)

    # Generate s_aux
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale if s_aux_scale > 0 else None

    # Forward pass - attention_varlen_ref now auto-expands K, V for MQA/GQA
    out = flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q, cu_seqlens_k,
        max_seqlen_q, max_seqlen_k,
        causal=causal,
        s_aux=s_aux
    )

    out_ref = attention_varlen_ref(
        q, k, v,
        cu_seqlens_q, cu_seqlens_k,
        max_seqlen_q, max_seqlen_k,
        causal=causal,
        s_aux=s_aux
    )

    out_pt = attention_varlen_ref(
        q, k, v,
        cu_seqlens_q, cu_seqlens_k,
        max_seqlen_q, max_seqlen_k,
        causal=causal,
        s_aux=s_aux,
        upcast=False
    )

    max_diff = (out - out_ref).abs().max().item()
    pt_max_diff = (out_pt - out_ref).abs().max().item()
    print(f"Varlen {mha_type.upper()}, s_aux={s_aux_scale:.1f}: max diff = {max_diff:.6e}, pt diff = {pt_max_diff:.6e}")

    # Add minimum threshold for numerical stability
    assert max_diff <= max(2 * pt_max_diff, 1e-4) + 1e-5


# ============================================================================
# Helper Functions for Paged Attention and KV Cache Tests
# ============================================================================

def generate_paged_kvcache(batch_size, max_seqlen, block_size, nheads_k, d, device, dtype):
    """
    Generate paged KV cache with random block table for testing.

    Args:
        batch_size: Number of sequences
        max_seqlen: Maximum sequence length
        block_size: Block size for paging (typically 64)
        nheads_k: Number of KV heads
        d: Head dimension
        device: Device (cuda)
        dtype: Data type

    Returns:
        block_table: [batch_size, max_num_blocks] - Block indices for each sequence
        k_cache_paged: [num_blocks, nheads_k, block_size, d] - Paged K cache (vLLM layout)
        v_cache_paged: [num_blocks, nheads_k, d, block_size] - Paged V cache (vLLM layout)
    """
    import math

    max_num_blocks_per_seq = math.ceil(max_seqlen / block_size)
    # Allocate 3x more blocks than needed to allow random permutation
    num_blocks = max_num_blocks_per_seq * batch_size * 3

    # Generate paged caches with vLLM prefix cache layout
    # K cache: [num_blocks, nheads_k, block_size, d]
    # V cache: [num_blocks, nheads_k, d, block_size]
    k_cache_paged = torch.randn(num_blocks, nheads_k, block_size, d, device=device, dtype=dtype)
    v_cache_paged = torch.randn(num_blocks, nheads_k, d, block_size, device=device, dtype=dtype)

    # Generate contiguous block table (vLLM uses consecutive blocks per sequence)
    block_table = torch.arange(
        batch_size * max_num_blocks_per_seq,
        dtype=torch.int32,
        device=device
    ).view(batch_size, max_num_blocks_per_seq)

    return block_table, k_cache_paged, v_cache_paged


def paged_attention_reference(q, k_cache_paged, v_cache_paged, block_table, cache_seqlens,
                               s_aux=None, causal=True, nheads_k=None):
    """
    Reference implementation for paged attention with s_aux (following official test pattern).

    Args:
        q: [batch_size, seqlen_q, nheads, d]
        k_cache_paged: [num_blocks, nheads_k, block_size, d]
        v_cache_paged: [num_blocks, nheads_k, d, block_size]
        block_table: [batch_size, max_num_blocks] - MUST be contiguous
        cache_seqlens: [batch_size] - Actual sequence lengths
        s_aux: [nheads] or [nheads_k] (depends on context)
        causal: Whether to apply causal masking
        nheads_k: Number of KV heads (if None, inferred from k_cache_paged)

    Returns:
        out: [batch_size, seqlen_q, nheads, d]
    """
    import math

    batch_size, seqlen_q, nheads, d = q.shape
    num_blocks, nheads_k_cached, block_size, _ = k_cache_paged.shape

    if nheads_k is None:
        nheads_k = nheads_k_cached

    max_num_blocks_per_seq = block_table.shape[1]
    max_seqlen_pad = max_num_blocks_per_seq * block_size

    # Convert vLLM prefix cache layout to continuous layout for reference
    # K: [num_blocks, nheads_k, block_size, d] -> [num_blocks, block_size, nheads_k, d]
    # V: [num_blocks, nheads_k, d, block_size] -> [num_blocks, block_size, nheads_k, d]
    k_cache_cont = k_cache_paged.permute(0, 2, 1, 3).contiguous()
    v_cache_cont = v_cache_paged.permute(0, 3, 1, 2).contiguous()

    # Process each sequence separately
    out_list = []
    for i in range(batch_size):
        # Following official test pattern: use contiguous block indexing
        begin = i * max_seqlen_pad
        end = begin + cache_seqlens[i].item()

        # Gather K, V: flatten blocks and slice
        k_seq = k_cache_cont.view(-1, nheads_k, d)[begin:end]  # [seqlen, nheads_k, d]
        v_seq = v_cache_cont.view(-1, nheads_k, d)[begin:end]  # [seqlen, nheads_k, d]

        k_seq = k_seq.unsqueeze(0)  # [1, seqlen, nheads_k, d]
        v_seq = v_seq.unsqueeze(0)  # [1, seqlen, nheads_k, d]
        q_seq = q[i:i+1]  # [1, seqlen_q, nheads, d]

        # Expand for MQA/GQA if needed
        if nheads_k < nheads:
            k_seq = repeat(k_seq, 'b s h d -> b s (h g) d', g=nheads // nheads_k)
            v_seq = repeat(v_seq, 'b s h d -> b s (h g) d', g=nheads // nheads_k)

        # Apply attention_ref for this sequence
        out_seq, _ = attention_ref(q_seq, k_seq, v_seq, causal=causal, s_aux=s_aux)
        out_list.append(out_seq)

    # Stack outputs
    out_ref = torch.cat(out_list, dim=0)  # [batch_size, seqlen_q, nheads, d]

    return out_ref


def kvcache_attention_reference(q, k_cache, v_cache, cache_seqlens=None, s_aux=None, causal=True, upcast=True, reorder_ops=False):
    """
    Reference implementation for standard KV cache attention with s_aux.
    Handles cache_seqlens by masking or slicing.

    Args:
        q: [batch_size, seqlen_q, nheads, d]
        k_cache: [batch_size, seqlen_k, nheads_k, d]
        v_cache: [batch_size, seqlen_k, nheads_k, d]
        cache_seqlens: [batch_size] or int or None - Actual cache lengths per sequence
        s_aux: [nheads_k] or [nheads] - Can be either KV heads (decode) or Q heads (prefill)
        causal: Whether to apply causal masking
        upcast: Whether to cast to fp32 for computation (default: True)
        reorder_ops: Whether to change the order of operations for numerical stability testing (default: False)

    Returns:
        out: [batch_size, seqlen_q, nheads, d]
    """
    batch_size, seqlen_q, nheads, d = q.shape
    _, seqlen_k, nheads_k, _ = k_cache.shape

    # Handle cache_seqlens
    if cache_seqlens is None:
        # Use full cache
        k_effective = k_cache
        v_effective = v_cache
    elif isinstance(cache_seqlens, int):
        # Single cache length for all sequences
        k_effective = k_cache[:, :cache_seqlens]
        v_effective = v_cache[:, :cache_seqlens]
    else:
        # Different cache lengths per sequence - need to pad after slicing
        import torch.nn.functional as F
        k_list = []
        v_list = []
        max_cache_len = cache_seqlens.max().item()

        for i in range(batch_size):
            cache_len = cache_seqlens[i].item()
            k_i = k_cache[i, :cache_len]  # [cache_len, nheads_k, d]
            v_i = v_cache[i, :cache_len]

            # Pad to max_cache_len
            if cache_len < max_cache_len:
                k_i = F.pad(k_i, (0, 0, 0, 0, 0, max_cache_len - cache_len))
                v_i = F.pad(v_i, (0, 0, 0, 0, 0, max_cache_len - cache_len))

            k_list.append(k_i)
            v_list.append(v_i)

        k_effective = torch.stack(k_list)  # [batch_size, max_cache_len, nheads_k, d]
        v_effective = torch.stack(v_list)

    # Expand s_aux for MQA/GQA if needed
    # s_aux can be either [nheads_k] (decode) or [nheads] (prefill)
    if s_aux is not None:
        if s_aux.shape[0] == nheads_k and nheads_k < nheads:
            # Decode path: s_aux is [nheads_k], expand to [nheads]
            s_aux_expanded = repeat(s_aux, 'h -> (h g)', g=nheads // nheads_k)
        else:
            # Prefill path: s_aux is already [nheads]
            s_aux_expanded = s_aux
    else:
        s_aux_expanded = None

    # Expand for MQA/GQA if needed
    if nheads_k < nheads:
        k_effective = repeat(k_effective, 'b s h d -> b s (h g) d', g=nheads // nheads_k)
        v_effective = repeat(v_effective, 'b s h d -> b s (h g) d', g=nheads // nheads_k)

    # Apply standard attention_ref
    out_ref, _ = attention_ref(q, k_effective, v_effective, causal=causal, s_aux=s_aux_expanded,
                                upcast=upcast, reorder_ops=reorder_ops)

    return out_ref


# ============================================================================
# KV Cache Tests (flash_attn_with_kvcache with s_aux)
# ============================================================================

@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5, 1.0])
# @pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("softcap", [0.0])
# @pytest.mark.parametrize("softcap", [0.0, 50.0])
@pytest.mark.parametrize("dropout_p", [0.0])
# @pytest.mark.parametrize("dropout_p", [0.0, 0.17])
@pytest.mark.parametrize("seqlen_q,seqlen_k", [(1, 128), (8, 256), (32, 512)])
# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(1, 128)])
@pytest.mark.parametrize("d", [64, 128])
# @pytest.mark.parametrize("d", [64, 96, 128, 192, 256])
@pytest.mark.parametrize("causal", [False, True])
# @pytest.mark.parametrize("causal", [False])
@pytest.mark.parametrize("local", [False])
# @pytest.mark.parametrize("local", [False, True])
@pytest.mark.parametrize("alibi", [False])
# @pytest.mark.parametrize("alibi", [False, True])
@pytest.mark.parametrize("deterministic", [False])
# @pytest.mark.parametrize("deterministic", [False, True])
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_flash_attn_kvcache_with_saux(
    seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic,
    mha_type, dtype, softcap, s_aux_scale
):
    """
    Test flash_attn_with_kvcache with s_aux parameter.

    This tests standard (non-paged) KV cache with Attention Sinks.
    """
    # Skip tests for unsupported feature combinations
    if dropout_p > 0.0:
        pytest.skip("Dropout not supported in this test (forward only)")
    if alibi:
        pytest.skip("Alibi tested separately in test_flash_attn_saux_with_features")
    if local:
        pytest.skip("Local attention tested separately in test_flash_attn_saux_with_features")
    if softcap > 0.0:
        pytest.skip("Softcap tested separately in test_flash_attn_saux_with_features")
    if deterministic:
        pytest.skip("Deterministic mode not tested (forward only)")
    torch.random.manual_seed(42)

    batch_size = 2
    nheads = 6
    nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2)
    assert nheads % nheads_k == 0

    # Generate Q, K, V
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype)

    # Create KV cache (same as K, V for this test)
    k_cache = k.clone()
    v_cache = v.clone()

    # Generate cache_seqlens (use full cache)
    cache_seqlens = torch.full((batch_size,), seqlen_k, dtype=torch.int32, device=device)

    # Generate s_aux - IMPORTANT: shape depends on code path
    # flash_attn_with_kvcache uses different kernels based on seqlen_q:
    # - Decode (seqlen_q=1): split-KV kernel → s_aux shape = [nheads_k]
    # - Prefill (seqlen_q>1): standard fwd kernel → s_aux shape = [nheads]
    if s_aux_scale > 0:
        s_aux_nheads = nheads_k if seqlen_q == 1 else nheads
        s_aux = torch.randn(s_aux_nheads, device=device, dtype=dtype) * s_aux_scale
    else:
        s_aux = None

    # Call flash_attn_with_kvcache
    out = flash_attn_with_kvcache(
        q, k_cache, v_cache,
        cache_seqlens=cache_seqlens,
        causal=causal,
        s_aux=s_aux
    )

    # Reference implementation (upcast to fp32)
    out_ref = kvcache_attention_reference(
        q, k_cache, v_cache,
        cache_seqlens=cache_seqlens,
        s_aux=s_aux,
        causal=causal,
        upcast=True
    )

    # Pytorch reference for comparison (no upcast, reordered ops)
    out_pt = kvcache_attention_reference(
        q, k_cache, v_cache,
        cache_seqlens=cache_seqlens,
        s_aux=s_aux,
        causal=causal,
        upcast=False,
        reorder_ops=True
    )

    max_diff = (out - out_ref).abs().max().item()
    pt_max_diff = (out_pt - out_ref).abs().max().item()

    print(f"KV Cache {mha_type.upper()}, s_aux={s_aux_scale:.1f}, causal={causal}: "
          f"max diff = {max_diff:.6e}, pt diff = {pt_max_diff:.6e}")

    # Validate - FlashAttention should be within 2x of PyTorch error
    assert max_diff <= 2 * pt_max_diff + 5e-2, \
        f"Flash attention error ({max_diff:.6e}) too large compared to PyTorch ({pt_max_diff:.6e})"


# ============================================================================
# Test 3: vLLM Interface (vllm_flash_attn_with_kvcache) with s_aux
# ============================================================================

@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5])
# @pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("mean_seqlen_k", [128, 512])
# @pytest.mark.parametrize("mean_seqlen_k", [128])
@pytest.mark.parametrize("batch_size", [2, 4])
# @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("d", [64, 128])
# @pytest.mark.parametrize("d", [64, 96, 128])
@pytest.mark.parametrize("mha_type", ["mha", "gqa", "mqa"])
# @pytest.mark.parametrize("mha_type", ["mha"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_vllm_flash_attn_with_kvcache_saux(dtype, d, batch_size, mean_seqlen_k, mha_type, s_aux_scale):
    """
    Test vllm_flash_attn_with_kvcache with paged KV cache and s_aux.

    This validates the bug fix for s_aux parameter passing in the vLLM interface.
    Note: vllm_flash_attn_with_kvcache only supports decode (seqlen_q=1) with paged cache.
    """
    from flash_attn import vllm_flash_attn_with_kvcache
    import math

    torch.random.manual_seed(42)
    device = "cuda"

    # MHA/GQA/MQA configuration
    nheads = 8
    nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2)

    # Decode mode only (paged_attention kernel requirement)
    seqlen_q = 1

    # Paged KV cache setup
    block_size = 64

    # Generate variable cache lengths
    cache_seqlens = torch.randint(
        seqlen_q, mean_seqlen_k + 1, (batch_size,), dtype=torch.int32, device=device
    )
    max_seqlen_k = cache_seqlens.max().item()
    max_seqlen_pad = math.ceil(max_seqlen_k / block_size) * block_size

    # Generate paged KV cache
    block_table, k_cache_paged, v_cache_paged = generate_paged_kvcache(
        batch_size, max_seqlen_pad, block_size, nheads_k, d, device, dtype
    )

    # Generate Q
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)

    # Generate s_aux
    # Note: paged_attention kernel requires s_aux shape [nheads] (query heads)
    # This is different from flash_attn_with_kvcache which uses [nheads_k]
    if s_aux_scale > 0:
        s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale
    else:
        s_aux = None

    # Call vllm_flash_attn_with_kvcache
    out = vllm_flash_attn_with_kvcache(
        q=q,
        k_cache=k_cache_paged,
        v_cache=v_cache_paged,
        block_table=block_table,
        cache_seqlens=cache_seqlens,
        causal=True,
        s_aux=s_aux,
        max_seqlen_k=max_seqlen_k
    )

    # Reference implementation
    out_ref = paged_attention_reference(
        q, k_cache_paged, v_cache_paged,
        block_table, cache_seqlens,
        s_aux=s_aux,
        causal=True,
        nheads_k=nheads_k
    )

    max_diff = (out - out_ref).abs().max().item()

    print(f"vLLM Decode {mha_type.upper()}, d={d}, batch={batch_size}, seqlen_k~{mean_seqlen_k}, "
          f"s_aux={s_aux_scale:.1f}: max diff = {max_diff:.6e}")

    # Validate - paged attention may have slightly higher error due to block boundaries
    assert max_diff <= 1e-1, \
        f"vLLM paged attention error ({max_diff:.6e}) too large"


# ============================================================================
# Parameter Combination Tests (alibi, local attention, softcap)
# ============================================================================

@pytest.mark.parametrize("dtype", [torch.bfloat16])
# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("d", [128])
# @pytest.mark.parametrize("d", [64, 128, 256])
@pytest.mark.parametrize("has_alibi", [False, True])
# @pytest.mark.parametrize("has_alibi", [False])
@pytest.mark.parametrize("has_local", [False, True])
# @pytest.mark.parametrize("has_local", [False])
@pytest.mark.parametrize("softcap", [0.0, 50.0])
# @pytest.mark.parametrize("softcap", [0.0])
def test_flash_attn_saux_with_features(d, dtype, has_alibi, has_local, softcap):
    """Test s_aux with alibi, local attention, and softcap combinations."""
    torch.random.manual_seed(42)

    batch_size = 2
    seqlen = 256
    nheads = 4

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    # Generate s_aux
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5

    # Configure features
    alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 if has_alibi else None
    window_size = (64, 64) if has_local else (-1, -1)

    # Scale q for softcap to ensure values are in reasonable range
    if softcap > 0:
        q = q * softcap
        k = k * softcap

    # Forward pass - test that it doesn't crash
    try:
        out = flash_attn_func(
            q, k, v,
            causal=True,
            window_size=window_size,
            softcap=softcap,
            alibi_slopes=alibi_slopes,
            s_aux=s_aux
        )

        # Basic sanity checks
        assert out.shape == q.shape
        assert not torch.isnan(out).any()
        assert not torch.isinf(out).any()

        print(f"alibi={has_alibi}, local={has_local}, softcap={softcap:.0f}: PASSED")
    except Exception as e:
        pytest.fail(f"Failed with alibi={has_alibi}, local={has_local}, softcap={softcap}: {e}")


# ============================================================================
# Phase 1: P0 Window + Sinks Interaction Tests (StreamingLLM core scenarios)
# ============================================================================


@pytest.mark.parametrize("window_size", [(64, 64), (128, 128), (256, 0)])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5, 1.0])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
@pytest.mark.parametrize("causal", [False, True])
def test_flash_attn_window_with_saux(window_size, s_aux_scale, dtype, d, causal):
    """
    Test basic window (banded) attention with attention sinks.

    This is the core StreamingLLM scenario: limited KV cache window + preserved sink tokens.

    Validation points:
    1. Tokens outside window are correctly masked
    2. Sink tokens remain visible regardless of window constraints
    3. Softmax denominator includes sink contribution
    4. Numerical results match reference implementation
    """
    if d > MAX_HEADDIM_SM8x:
        pytest.skip(f"Head dimension {d} not supported on SM80")

    batch_size = 2
    seqlen_q = 128
    seqlen_k = 256
    nheads = 8

    # Create input tensors
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    # Create s_aux (sink tokens LogSumExp)
    s_aux = None
    if s_aux_scale > 0:
        s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Run Flash Attention with window + sinks
    out = flash_attn_func(
        q, k, v,
        causal=causal,
        window_size=window_size,
        s_aux=s_aux
    )

    # Run reference implementation
    out_ref, _ = attention_ref(
        q, k, v,
        causal=causal,
        window_size=window_size,
        s_aux=s_aux
    )

    # Validate output
    assert out.shape == q.shape, f"Output shape mismatch: {out.shape} vs {q.shape}"
    assert not torch.isnan(out).any(), "Output contains NaN"
    assert not torch.isinf(out).any(), "Output contains Inf"

    # Numerical accuracy check
    max_error = (out - out_ref).abs().max().item()
    mean_error = (out - out_ref).abs().mean().item()

    # Tolerance depends on dtype
    atol = 1e-2 if dtype == torch.float16 else 2e-2
    rtol = 0.15

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"Window+Sink output mismatch: max_err={max_error:.6f}, mean_err={mean_error:.6f}"


@pytest.mark.parametrize("window_size", [(0, 0), (1, 1), (2, 64), (64, 2)])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("d", [64])
def test_flash_attn_asymmetric_window_with_saux(window_size, s_aux_scale, dtype, d):
    """
    Test asymmetric window configurations with attention sinks.

    Special cases:
    - (0, 0): Pure causal (no future, no past beyond current token)
    - (1, 1): Minimal window (current + 1 left + 1 right)
    - (2, 64): Small left window, large right window
    - (64, 2): Large left window, small right window
    """
    batch_size = 2
    seqlen = 128
    nheads = 4

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale \
        if s_aux_scale > 0 else None

    # Test both causal and non-causal modes
    for causal in [False, True]:
        out = flash_attn_func(q, k, v, causal=causal, window_size=window_size, s_aux=s_aux)
        out_ref, _ = attention_ref(q, k, v, causal=causal, window_size=window_size, s_aux=s_aux)

        assert out.shape == q.shape
        assert not torch.isnan(out).any()
        assert not torch.isinf(out).any()

        # Check numerical accuracy
        assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
            f"Asymmetric window {window_size} with causal={causal} failed"


@pytest.mark.parametrize("window_size", [(32, 32), (64, 64)])
@pytest.mark.parametrize("s_aux_scale", [0.5, 1.0])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_window_boundary_precision_with_saux(window_size, s_aux_scale, dtype):
    """
    Test numerical precision at window boundaries when sinks are present.

    This validates that attention weights are correctly distributed between:
    - Tokens just inside the window boundary
    - Tokens just outside the window boundary (should be masked)
    - Sink tokens (always visible)
    """
    batch_size = 1
    seqlen = 256
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    out = flash_attn_func(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)
    out_ref, attn_ref = attention_ref(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)

    # Basic validation
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    # Numerical accuracy at boundary
    max_error = (out - out_ref).abs().max().item()
    assert max_error < 0.05, f"Boundary precision error too large: {max_error}"

    # Verify attention weights sum correctly (from reference implementation)
    attn_sum = attn_ref.sum(dim=-1)
    if s_aux_scale > 0:
        # With sinks: weights should sum to ≤ 1 (sink absorbs some attention)
        # Allow small tolerance for bfloat16 rounding errors
        assert (attn_sum <= 1.0 + 5e-3).all(), "With sinks: weight sum should be ≤ 1"
        # When window is restrictive + sinks are strong, very little attention may go to actual tokens
        # Just verify weights are non-negative and reasonable (not NaN/Inf)
        assert (attn_sum >= 0).all(), "Weight sums must be non-negative"
        assert torch.isfinite(attn_sum).all(), "Weight sums must be finite"
    else:
        # Without sinks: weights should sum to 1
        assert torch.allclose(attn_sum, torch.ones_like(attn_sum), atol=1e-3), \
            "Without sinks: attention weights should sum to 1"


@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("window_size", [(64, 64), (128, 0)])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("d", [64, 128])
def test_flash_attn_mqa_gqa_window_saux(mha_type, window_size, s_aux_scale, dtype, d):
    """
    Test Multi-Query Attention (MQA) and Grouped-Query Attention (GQA)
    with window constraints and attention sinks.

    MHA: nheads_q == nheads_k (standard multi-head attention)
    MQA: nheads_k == 1 (all query heads share one KV head)
    GQA: nheads_q % nheads_k == 0 (query heads grouped to KV heads)
    """
    if d > MAX_HEADDIM_SM8x:
        pytest.skip(f"Head dimension {d} not supported on SM80")

    batch_size = 2
    seqlen = 128

    # Configure head counts based on attention type
    if mha_type == "mha":
        nheads_q = nheads_k = 8
    elif mha_type == "mqa":
        nheads_q = 8
        nheads_k = 1
    else:  # gqa
        nheads_q = 8
        nheads_k = 2

    q = torch.randn(batch_size, seqlen, nheads_q, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)

    # Create s_aux and expand to query head size if needed
    if s_aux_scale > 0:
        s_aux_kv = torch.randn(nheads_k, device=device, dtype=dtype) * s_aux_scale
        if nheads_q == nheads_k:
            s_aux = s_aux_kv
        else:
            # GQA/MQA: replicate for each query head group
            qhead_per_khead = nheads_q // nheads_k
            s_aux = s_aux_kv.repeat_interleave(qhead_per_khead)
    else:
        s_aux = None

    out = flash_attn_func(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)

    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    # MQA/GQA may have slightly larger numerical errors due to broadcasting
    atol = 2e-2 if dtype == torch.bfloat16 else 1e-2
    rtol = 0.2 if mha_type in ["mqa", "gqa"] else 0.15

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"{mha_type.upper()} + window {window_size} + sinks failed"


@pytest.mark.parametrize("seqlen_q,seqlen_k", [(64, 128), (128, 256)])
@pytest.mark.parametrize("window_size", [(32, 32), (64, 64)])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_flash_attn_varlen_window_saux(seqlen_q, seqlen_k, window_size, s_aux_scale, dtype):
    """
    Test variable-length sequences with window and sinks.

    In variable-length scenarios, different sequences in the batch have different
    actual lengths, which interacts with window masking in non-trivial ways.
    """
    batch_size = 2
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Test with different sequence lengths
    out = flash_attn_func(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)

    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
        f"Varlen (seqlen_q={seqlen_q}, seqlen_k={seqlen_k}) with window and sinks failed"


@pytest.mark.parametrize("cache_seqlen", [128, 256])
@pytest.mark.parametrize("window_size", [(64, 64), (128, 0)])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_flash_attn_kvcache_window_saux(cache_seqlen, window_size, s_aux_scale, dtype):
    """
    Test KV cache scenario with window and sinks.

    This simulates the decode phase of autoregressive generation where:
    - K/V cache accumulates tokens from previous steps
    - Window limits which cached tokens are visible
    - Sink tokens are always preserved
    """
    batch_size = 2
    seqlen_q_new = 1  # Decode: one new token at a time
    nheads = 4
    d = 64

    # New query token
    q = torch.randn(batch_size, seqlen_q_new, nheads, d, device=device, dtype=dtype)

    # KV cache from previous tokens
    k = torch.randn(batch_size, cache_seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, cache_seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Decode step: new token attends to cache
    out = flash_attn_func(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, window_size=window_size, s_aux=s_aux)

    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
        f"KV cache (len={cache_seqlen}) with window and sinks failed"


@pytest.mark.parametrize("window_size", [
    (8192, 8192),  # Very large window (effectively full attention)
    (1, 1),        # Minimal window
    (0, 8192),     # Only right window (no left context)
])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_extreme_window_sizes_with_saux(window_size, s_aux_scale, dtype):
    """
    Stress test with extreme window configurations.

    Validates numerical stability and correctness when:
    - Window is very large (almost full attention)
    - Window is minimal (only immediate neighbors)
    - Window is one-sided (only past or only future)
    """
    batch_size = 1
    seqlen = 256  # Use smaller seqlen for large window tests
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # For very large windows, use causal=False to allow the full window
    causal = window_size[0] < 100

    try:
        out = flash_attn_func(q, k, v, causal=causal, window_size=window_size, s_aux=s_aux)
        out_ref, _ = attention_ref(q, k, v, causal=causal, window_size=window_size, s_aux=s_aux)

        assert out.shape == q.shape
        assert not torch.isnan(out).any(), f"NaN with window_size={window_size}"
        assert not torch.isinf(out).any(), f"Inf with window_size={window_size}"

        # Extreme windows may have larger numerical errors
        assert torch.allclose(out, out_ref, atol=3e-2, rtol=0.2), \
            f"Extreme window {window_size} with sinks failed"
    except RuntimeError as e:
        # Some extreme configurations may not be supported
        pytest.skip(f"Window size {window_size} not supported: {e}")


# ============================================================================
# Phase 1: P0 Numerical Stability and Extreme Value Tests
# ============================================================================


@pytest.mark.parametrize("s_aux_config", [
    ("zero", lambda nh, dev, dt: torch.zeros(nh, dtype=dt, device=dev)),
    ("tiny", lambda nh, dev, dt: torch.full((nh,), 1e-10, dtype=dt, device=dev)),
    ("small", lambda nh, dev, dt: torch.full((nh,), 1e-3, dtype=dt, device=dev)),
    ("large", lambda nh, dev, dt: torch.full((nh,), 100.0, dtype=dt, device=dev)),
    # Use 60000 for "huge" - safe for both fp16 (max ~65504) and bf16
    ("huge", lambda nh, dev, dt: torch.full((nh,), 60000.0, dtype=dt, device=dev)),
])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("d", [64])
def test_saux_extreme_values(s_aux_config, dtype, d):
    """
    Test numerical stability with extreme s_aux values.

    This validates that the online softmax implementation correctly handles:
    - Very small values (near zero)
    - Very large values (approaching overflow)
    - Intermediate values

    The key is that exp(s_aux - max) should not overflow or underflow.
    """
    config_name, s_aux_fn = s_aux_config

    batch_size = 2
    seqlen = 128
    nheads = 6

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = s_aux_fn(nheads, device, dtype)

    # Run Flash Attention
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

    # Run reference
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Critical checks for numerical stability
    assert not torch.isnan(out).any(), f"{config_name}: Output contains NaN"
    assert not torch.isinf(out).any(), f"{config_name}: Output contains Inf"

    # For extreme values, we allow larger tolerance
    atol = 5e-2 if config_name in ["huge", "large"] else 2e-2
    rtol = 0.25 if config_name in ["huge", "large"] else 0.15

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"{config_name}: Numerical error too large. max_err={(out - out_ref).abs().max().item():.6f}"


@pytest.mark.parametrize("special_value", [
    float('nan'),
    float('inf'),
    float('-inf'),
])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_special_values(special_value, dtype):
    """
    Test handling of special floating-point values in s_aux.

    Expected behavior:
    - NaN: Should either produce NaN output or raise an error (both acceptable)
    - +Inf: Sink dominates attention (all weight goes to sink)
    - -Inf: Sink has zero contribution (equivalent to s_aux=None)
    """
    batch_size = 1
    seqlen = 64
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.full((nheads,), special_value, dtype=dtype, device=device)

    try:
        out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

        # Special value handling
        if special_value == float('-inf'):
            # -Inf sink should be equivalent to no sink
            out_no_sink = flash_attn_func(q, k, v, causal=True)
            assert torch.allclose(out, out_no_sink, atol=1e-3, rtol=0.1), \
                "-Inf sink should be equivalent to no sink"

        elif special_value == float('inf'):
            # +Inf sink should dominate (output heavily influenced by sink)
            # The output should be finite (not NaN/Inf) since the online softmax handles this
            # by subtracting max before exp, so exp(inf - inf) = exp(0) = 1
            assert torch.isfinite(out).all() or torch.isnan(out).all(), \
                "+Inf sink should produce either all finite or all NaN output, not mixed"

        elif special_value == float('nan'):
            # NaN is allowed to propagate through the computation
            # Just verify it doesn't crash the kernel
            # Output may contain NaN (propagated) or be valid (if NaN is ignored)
            assert out is not None, "Kernel should not crash with NaN s_aux"

    except RuntimeError as e:
        # CUDA errors are acceptable for invalid inputs
        if "nan" in str(e).lower() or "inf" in str(e).lower():
            pytest.skip(f"Expected CUDA error for {special_value}: {e}")
        else:
            raise


@pytest.mark.parametrize("s_aux_value", [
    87.0,   # exp(87) ≈ 6e37, close to float32 max (3.4e38)
    -87.0,  # exp(-87) ≈ 1.6e-38, close to float32 min
    88.7,   # exp(88.7) would overflow float32 if not handled correctly
    -88.7,  # exp(-88.7) would underflow to 0
])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_softmax_boundary(s_aux_value, dtype):
    """
    Test softmax computation at numerical boundaries.

    Flash Attention uses online softmax: exp(x - max) / sum(exp(x_i - max))

    s_aux participates in the max computation, which should prevent overflow:
    - If s_aux is very large, max = s_aux, so exp(s_aux - max) = exp(0) = 1
    - If s_aux is very small, it's ignored in max, so exp(s_aux - max) ≈ 0

    This test validates the implementation handles extreme LogSumExp values correctly.
    """
    batch_size = 1
    seqlen = 64
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype) * 0.1
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype) * 0.1
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.full((nheads,), s_aux_value, dtype=dtype, device=device)

    # Test should not crash or produce NaN/Inf
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Basic stability check
    assert not torch.isnan(out).any(), f"s_aux={s_aux_value} produced NaN"
    assert not torch.isinf(out).any(), f"s_aux={s_aux_value} produced Inf"

    # Numerical accuracy (allow larger tolerance for extreme values)
    atol = 5e-2 if abs(s_aux_value) > 88 else 2e-2
    rtol = 0.25 if abs(s_aux_value) > 88 else 0.15

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"Softmax boundary test failed for s_aux={s_aux_value}"


@pytest.mark.parametrize("qkv_dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("s_aux_scale", [1e-3, 1.0, 100.0])
@pytest.mark.parametrize("d", [64])
def test_saux_mixed_precision(qkv_dtype, s_aux_scale, d):
    """
    Test mixed precision: Q/K/V and s_aux both in low precision (fp16/bf16).

    s_aux is stored in fp16/bf16 (matching Q/K/V dtype) but computed in fp32 internally.
    This validates:
    1. s_aux dtype matches Q/K/V dtype (enforced by API)
    2. Numerical errors within acceptable bounds
    3. Consistency with reference implementation
    """
    batch_size = 2
    seqlen = 128
    nheads = 8

    # Q/K/V in low precision
    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=qkv_dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=qkv_dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=qkv_dtype)

    # s_aux must match Q/K/V dtype (mixed precision: storage in fp16/bf16, compute in fp32)
    s_aux = torch.randn(nheads, device=device, dtype=qkv_dtype) * s_aux_scale

    # Flash Attention output
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

    # Reference implementation (also mixed precision)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Validate shape and stability
    assert out.shape == q.shape
    assert out.dtype == qkv_dtype
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    # Mixed precision error bounds
    # BF16 has lower precision than FP16, so tolerance varies
    if qkv_dtype == torch.bfloat16:
        atol = 3e-2 if s_aux_scale > 10 else 2e-2
        rtol = 0.2 if s_aux_scale > 10 else 0.15
    else:  # float16
        atol = 2e-2 if s_aux_scale > 10 else 1e-2
        rtol = 0.15 if s_aux_scale > 10 else 0.1

    max_error = (out.float() - out_ref.float()).abs().max().item()
    assert max_error < atol * 3, \
        f"Mixed precision error too large: {max_error:.6f} (qkv_dtype={qkv_dtype}, scale={s_aux_scale})"

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"Mixed precision test failed for {qkv_dtype} with s_aux_scale={s_aux_scale}"


@pytest.mark.parametrize("has_saux", [False, True])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_attention_weight_bounds(has_saux, dtype):
    """
    Verify that attention weights remain in valid bounds and sum correctly.

    Without sinks:
    - All weights should be in [0, 1]
    - Weights should sum to 1

    With sinks:
    - All weights should be in [0, 1]
    - Weights over actual K/V tokens sum to < 1 (sinks absorb some weight)
    - This is the EXPECTED behavior - sinks compete for attention!

    This is tested via the reference implementation which returns attention weights.
    """
    batch_size = 2
    seqlen = 64
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5 if has_saux else None

    # Get attention weights from reference implementation
    _, attn = attention_ref(q, k, v, causal=False, s_aux=s_aux)

    # Validate probability distribution properties
    assert (attn >= 0).all(), "Attention weights contain negative values"
    assert (attn <= 1).all(), "Attention weights exceed 1.0"

    # Sum validation
    attn_sum = attn.sum(dim=-1)

    if has_saux:
        # With sinks: weights should sum to less than 1 (sink absorbs some attention)
        # But should still be reasonable (not too small)
        assert (attn_sum < 1.0 + 1e-3).all(), \
            f"Attention weights exceed 1.0 with sinks: max={attn_sum.max():.6f}"
        assert (attn_sum > 0.1).all(), \
            f"Attention weights too small (sink absorbed too much): min={attn_sum.min():.6f}"
        print(f"✓ With sinks: weight sum range [{attn_sum.min():.4f}, {attn_sum.max():.4f}] (< 1.0 expected)")
    else:
        # Without sinks: weights should sum to exactly 1
        assert torch.allclose(attn_sum, torch.ones_like(attn_sum), atol=1e-3, rtol=1e-3), \
            f"Attention weights don't sum to 1 without sinks: min={attn_sum.min():.6f}, max={attn_sum.max():.6f}"
        print(f"✓ Without sinks: weight sum ≈ 1.0")

    # Also verify that Flash Attention output matches reference
    out = flash_attn_func(q, k, v, causal=False, s_aux=s_aux)
    out_ref = torch.einsum("bhts,bshd->bthd", attn, v)

    assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
        "Flash Attention output doesn't match reference with validated attention weights"


# ============================================================================
# Phase 1: P0 Split-K Correctness Tests
# ============================================================================


@pytest.mark.parametrize("seqlen_q,seqlen_k", [(128, 512), (256, 1024), (128, 2048)])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5, 1.0])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_split_k_correctness(seqlen_q, seqlen_k, s_aux_scale, dtype):
    """
    Test that s_aux is correctly applied only in the first split of Split-K kernels.

    Split-K optimization divides long K/V sequences into chunks. According to
    the implementation in flash_fwd_kernel.h:

        if (params.s_aux_ptr != nullptr && n_split_idx == 0) {
            lse = softmax.normalize_softmax_lse_with_sinks(...);
        }

    This test validates:
    1. s_aux is only counted in the first split (n_split_idx == 0)
    2. Results are numerically consistent regardless of split count
    3. Long sequences don't cause numerical drift with sinks
    """
    batch_size = 2
    nheads = 6
    d = 64

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale \
        if s_aux_scale > 0 else None

    # Baseline: standard Flash Attention (may internally use split-K for long sequences)
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

    # Reference implementation (no split-K)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Validate correctness
    assert out.shape == q.shape
    assert not torch.isnan(out).any(), f"NaN in output for seqlen_k={seqlen_k}"
    assert not torch.isinf(out).any(), f"Inf in output for seqlen_k={seqlen_k}"

    # Long sequences may have slightly larger errors due to accumulation
    atol = 3e-2 if seqlen_k > 1024 else 2e-2
    rtol = 0.2 if seqlen_k > 1024 else 0.15

    max_error = (out - out_ref).abs().max().item()
    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"Split-K correctness failed for seqlen_k={seqlen_k}, s_aux_scale={s_aux_scale}. max_err={max_error:.6f}"


@pytest.mark.parametrize("seqlen_k", [256, 512, 1024, 2048])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_variable_sequence_length(seqlen_k, s_aux_scale, dtype):
    """
    Test s_aux behavior across various sequence lengths.

    Different sequence lengths may trigger different code paths:
    - Short sequences: Single block, no split
    - Medium sequences: Multiple blocks, potential split
    - Long sequences: Guaranteed split-K

    This validates consistent behavior regardless of sequence length.
    """
    batch_size = 2
    seqlen_q = 128
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Test both with and without sinks
    out_with_sink = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_no_sink = flash_attn_func(q, k, v, causal=True, s_aux=None)

    # Reference
    out_ref_with_sink, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)
    out_ref_no_sink, _ = attention_ref(q, k, v, causal=True, s_aux=None)

    # Basic validation
    assert out_with_sink.shape == q.shape
    assert not torch.isnan(out_with_sink).any()
    assert not torch.isinf(out_with_sink).any()

    # Sinks should change the output
    if s_aux_scale > 0:
        diff = (out_with_sink - out_no_sink).abs().max().item()
        assert diff > 1e-4, f"s_aux had no effect on output (seqlen_k={seqlen_k})"

    # Numerical accuracy
    atol = 3e-2 if seqlen_k > 1024 else 2e-2
    rtol = 0.2 if seqlen_k > 1024 else 0.15

    assert torch.allclose(out_with_sink, out_ref_with_sink, atol=atol, rtol=rtol), \
        f"With sink failed for seqlen_k={seqlen_k}"
    assert torch.allclose(out_no_sink, out_ref_no_sink, atol=atol, rtol=rtol), \
        f"Without sink failed for seqlen_k={seqlen_k}"


@pytest.mark.parametrize("seqlen_q,seqlen_k", [(64, 2048), (128, 4096)])
@pytest.mark.parametrize("s_aux_scale", [0.5, 1.0])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_split_merge_precision(seqlen_q, seqlen_k, s_aux_scale, dtype):
    """
    Test precision of split merge operations with sinks.

    When Split-K is used, partial results are merged using LogSumExp:
        lse_total = log(exp(lse_1) + exp(lse_2) + ... + exp(lse_n))

    With sinks, only the first split includes s_aux in its LSE.
    This test validates:
    1. No numerical instability in LSE merging
    2. Final output is equivalent to non-split reference
    3. Precision is maintained for large reductions
    """
    batch_size = 1
    nheads = 4
    d = 64

    # Use larger sequences to force split-K path
    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Flash Attention (may use split-K internally)
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

    # Reference (no split-K)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Precision checks
    assert out.shape == q.shape
    assert not torch.isnan(out).any(), "NaN after split merge"
    assert not torch.isinf(out).any(), "Inf after split merge"

    # Split-merge may accumulate more error
    max_error = (out - out_ref).abs().max().item()
    mean_error = (out - out_ref).abs().mean().item()

    # Allow larger tolerance for very long sequences
    atol = 5e-2 if seqlen_k > 2048 else 3e-2
    rtol = 0.25 if seqlen_k > 2048 else 0.2

    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"Split merge precision test failed: max_err={max_error:.6f}, mean_err={mean_error:.6f}"

    # Additional check: error should be bounded relative to output magnitude
    relative_error = max_error / (out_ref.abs().max().item() + 1e-6)
    assert relative_error < 0.15, \
        f"Relative error too large: {relative_error:.6f}"


# ============================================================================
# Phase 2: P1 Boundary Condition and Robustness Tests
# ============================================================================


@pytest.mark.parametrize("nheads", [1, 8, 32, 63, 64])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_head_count_boundary(nheads, s_aux_scale, dtype):
    """
    Test head count boundaries (implementation supports up to 64 heads).

    The shared memory allocation in mainloop_fwd_sm80.hpp:168 is:
        cute::array_aligned<ElementSAux, 64, 128> smem_s_aux;

    This limits support to 64 heads maximum. This test validates:
    1. All head counts up to 64 work correctly
    2. The boundary case (64 heads) has correct numerical results
    3. Shared memory access pattern is correct at boundaries
    """
    batch_size = 1
    seqlen = 128
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    # Should work for all nheads <= 64
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    assert out.shape == q.shape
    assert not torch.isnan(out).any(), f"NaN with {nheads} heads"
    assert not torch.isinf(out).any(), f"Inf with {nheads} heads"

    # Boundary case (64 heads) deserves extra scrutiny
    if nheads == 64:
        # Verify each head independently
        for h in range(min(4, nheads)):  # Check first few heads
            diff = (out[0, :, h, :] - out_ref[0, :, h, :]).abs().max().item()
            assert diff < 0.05, f"Head {h} error too large at 64-head boundary: {diff}"

    assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
        f"Head count boundary test failed for nheads={nheads}"


@pytest.mark.parametrize("nheads", [65, 128, 256])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_saux_exceed_head_limit(nheads, dtype):
    """
    Test that exceeding the 64-head limit produces a clear error.

    Expected: RuntimeError with message about head count limit.
    """
    batch_size = 1
    seqlen = 64
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype)

    # Should raise an error for nheads > 64
    with pytest.raises(RuntimeError, match="64.*head"):
        flash_attn_func(q, k, v, s_aux=s_aux)


@pytest.mark.parametrize("seqlen_q,seqlen_k", [
    (1, 1),           # Minimal
    (1, 131072),      # Extreme asymmetry
    (8192, 8192),     # Large symmetric
])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_saux_sequence_length_boundaries(seqlen_q, seqlen_k, s_aux_scale, dtype):
    """
    Test extreme sequence length configurations with sinks.

    Validates:
    - Minimal sequences (1 token)
    - Highly asymmetric sequences
    - Large sequences (memory pressure)
    """
    batch_size = 1
    nheads = 4
    d = 64

    try:
        q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype)
        k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)
        v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype)

        s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

        out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
        out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

        assert out.shape == q.shape
        assert not torch.isnan(out).any()
        assert not torch.isinf(out).any()

        # Very large sequences may have larger errors
        atol = 5e-2 if max(seqlen_q, seqlen_k) > 4096 else 2e-2
        rtol = 0.3 if max(seqlen_q, seqlen_k) > 4096 else 0.15

        assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
            f"Sequence length boundary test failed for ({seqlen_q}, {seqlen_k})"

    except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
        # Very large sequences may OOM or exceed kernel limits
        pytest.skip(f"Sequence length ({seqlen_q}, {seqlen_k}) not supported: {e}")


@pytest.mark.parametrize("batch_size", [1, 2, 16, 64])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_saux_batch_size_scalability(batch_size, s_aux_scale, dtype):
    """
    Test scalability across different batch sizes.

    Validates:
    - Single batch (batch_size=1)
    - Small batches (2-16)
    - Large batches (64+)

    All should maintain numerical accuracy.
    """
    seqlen = 128
    nheads = 8
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale

    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    assert torch.allclose(out, out_ref, atol=2e-2, rtol=0.15), \
        f"Batch size scalability test failed for batch_size={batch_size}"


# ============================================================================
# Phase 2: P1 PackGQA Head Mapping Precision Tests
# ============================================================================


@pytest.mark.parametrize("nheads_q,nheads_k", [
    (32, 32),  # MHA: 1:1 mapping
    (32, 1),   # MQA: All Q heads share one KV head
    (32, 8),   # GQA: 4:1 mapping (4 Q heads per KV head)
    (64, 8),   # GQA: 8:1 mapping
])
@pytest.mark.parametrize("s_aux_scale", [0.5, 1.0])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_packgqa_saux_head_mapping_accuracy(nheads_q, nheads_k, s_aux_scale, dtype):
    """
    Test PackGQA head mapping with attention sinks.

    In PackGQA mode, the implementation maps each row to its corresponding head:
        bidh_mi = bidh + convert_layout_acc_rowcol(...)(mi % kBlockM)

    This test validates that each query head receives the correct sink value
    from its corresponding KV head.
    """
    batch_size = 2
    seqlen = 128
    d = 64

    q = torch.randn(batch_size, seqlen, nheads_q, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)

    # s_aux is conceptually per KV head, but needs to be expanded to query head size
    # Create base sink values for KV heads
    s_aux_kv = torch.randn(nheads_k, device=device, dtype=dtype) * s_aux_scale

    # Expand to query head size: each query head uses its corresponding KV head's sink
    # For GQA: qhead_per_khead = nheads_q // nheads_k
    if nheads_q == nheads_k:
        # MHA: 1:1 mapping
        s_aux = s_aux_kv
    else:
        # GQA/MQA: replicate each KV head's sink value for its query head group
        qhead_per_khead = nheads_q // nheads_k
        s_aux = s_aux_kv.repeat_interleave(qhead_per_khead)

    # Flash Attention with PackGQA
    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

    # Reference implementation uses the same expanded s_aux
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Validate output
    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    # MQA/GQA may have slightly larger errors due to head expansion
    if nheads_q != nheads_k:
        atol = 3e-2
        rtol = 0.2
    else:
        atol = 2e-2
        rtol = 0.15

    max_error = (out - out_ref).abs().max().item()
    assert torch.allclose(out, out_ref, atol=atol, rtol=rtol), \
        f"PackGQA head mapping failed for {nheads_q}Q:{nheads_k}K, max_err={max_error:.6f}"

    # Additional validation: verify different query heads produce different outputs
    # (unless all share the same KV head in MQA)
    if nheads_q > 1:
        head_diffs = []
        for h in range(min(4, nheads_q - 1)):
            diff = (out[0, :, h, :] - out[0, :, h+1, :]).abs().max().item()
            head_diffs.append(diff)

        # At least some heads should differ (unless MQA with same inputs)
        if nheads_k > 1:
            assert max(head_diffs) > 1e-4, \
                f"All query heads produced identical outputs (suspicious for GQA)"


@pytest.mark.parametrize("nheads_q,nheads_k", [
    (8, 2),   # GQA 4:1
    (16, 4),  # GQA 4:1
])
@pytest.mark.parametrize("s_aux_scale", [0.5])
@pytest.mark.parametrize("seqlen", [64, 256])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_packgqa_block_position_mapping(nheads_q, nheads_k, s_aux_scale, seqlen, dtype):
    """
    Test that PackGQA head mapping is correct at different block positions.

    The mapping depends on the row position within a block (mi % kBlockM).
    This test validates consistency across different sequence positions.
    """
    batch_size = 1
    d = 64

    q = torch.randn(batch_size, seqlen, nheads_q, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads_k, d, device=device, dtype=dtype)

    # Expand s_aux from KV head size to query head size
    s_aux_kv = torch.randn(nheads_k, device=device, dtype=dtype) * s_aux_scale
    qhead_per_khead = nheads_q // nheads_k
    s_aux = s_aux_kv.repeat_interleave(qhead_per_khead)

    out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)
    out_ref, _ = attention_ref(q, k, v, causal=True, s_aux=s_aux)

    # Validate different positions in the sequence
    # Check first, middle, and last positions
    positions = [0, seqlen // 2, seqlen - 1]

    for pos in positions:
        for h in range(nheads_q):
            diff = (out[0, pos, h, :] - out_ref[0, pos, h, :]).abs().max().item()
            assert diff < 0.05, \
                f"PackGQA mapping error at pos={pos}, head={h}: {diff:.6f}"

    # Overall accuracy
    assert torch.allclose(out, out_ref, atol=3e-2, rtol=0.2), \
        f"PackGQA block position mapping failed for seqlen={seqlen}"


# ============================================================================
# Phase 3: P2 Feature Combination Tests
# ============================================================================


@pytest.mark.parametrize("softcap", [0.0, 30.0, 50.0])
@pytest.mark.parametrize("window_size", [(-1, -1), (64, 64)])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_softcap_window_combination(softcap, window_size, s_aux_scale, dtype):
    """
    Test combination of Softcap + Window + Sinks.

    Softcap: Apply tanh scaling before softmax
    Window: Local attention mask
    Sinks: Additional sink tokens

    All three features should work together correctly.
    """
    batch_size = 2
    seqlen = 128
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    # Scale inputs for softcap
    if softcap > 0:
        q = q * softcap
        k = k * softcap

    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale \
        if s_aux_scale > 0 else None

    # Flash Attention with all features
    out = flash_attn_func(
        q, k, v,
        causal=True,
        window_size=window_size,
        softcap=softcap,
        s_aux=s_aux
    )

    # Reference implementation
    # Note: Reference may not support softcap, so we only validate stability
    assert out.shape == q.shape
    assert not torch.isnan(out).any(), \
        f"NaN with softcap={softcap}, window={window_size}, s_aux_scale={s_aux_scale}"
    assert not torch.isinf(out).any(), \
        f"Inf with softcap={softcap}, window={window_size}, s_aux_scale={s_aux_scale}"

    # Validate output magnitude is reasonable
    out_magnitude = out.abs().max().item()
    assert out_magnitude < 100, f"Output magnitude too large: {out_magnitude}"


@pytest.mark.parametrize("has_alibi", [True, False])
@pytest.mark.parametrize("has_local", [True, False])
@pytest.mark.parametrize("s_aux_scale", [0.0, 0.5])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_saux_alibi_local_combination(has_alibi, has_local, s_aux_scale, dtype):
    """
    Test combination of ALiBi + Local attention + Sinks.

    ALiBi: Attention with Linear Biases (position encoding)
    Local: Sliding window attention
    Sinks: Attention sinks

    This validates the interaction of positional biases with sinks.
    """
    batch_size = 2
    seqlen = 128
    nheads = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    # Configure features
    alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 \
        if has_alibi else None
    window_size = (64, 64) if has_local else (-1, -1)
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * s_aux_scale \
        if s_aux_scale > 0 else None

    # Test combination
    out = flash_attn_func(
        q, k, v,
        causal=True,
        window_size=window_size,
        alibi_slopes=alibi_slopes,
        s_aux=s_aux
    )

    # Validate
    assert out.shape == q.shape
    assert not torch.isnan(out).any()
    assert not torch.isinf(out).any()

    # If sinks are present, output should differ from no-sink case
    if s_aux_scale > 0:
        out_no_sink = flash_attn_func(
            q, k, v,
            causal=True,
            window_size=window_size,
            alibi_slopes=alibi_slopes,
            s_aux=None
        )
        diff = (out - out_no_sink).abs().max().item()
        assert diff > 1e-4, "Sinks had no effect with ALiBi+Local combination"


@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_saux_all_features_enabled(dtype):
    """
    Stress test: Enable ALL features simultaneously.

    Features:
    - Causal masking
    - Window/Local attention
    - Softcap
    - ALiBi position bias
    - Attention sinks

    This is the most complex configuration. Validates:
    1. No crashes or CUDA errors
    2. No NaN/Inf in output
    3. Output magnitudes are reasonable
    """
    batch_size = 2
    seqlen = 128
    nheads = 8
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)

    # Enable ALL features
    alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3
    window_size = (64, 64)
    softcap = 50.0
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5

    # Scale for softcap
    q = q * softcap
    k = k * softcap

    # Run with all features
    try:
        out = flash_attn_func(
            q, k, v,
            causal=True,
            window_size=window_size,
            softcap=softcap,
            alibi_slopes=alibi_slopes,
            s_aux=s_aux
        )

        # Validate stability
        assert out.shape == q.shape
        assert not torch.isnan(out).any(), "NaN with all features enabled"
        assert not torch.isinf(out).any(), "Inf with all features enabled"

        # Check output magnitude
        out_magnitude = out.abs().max().item()
        assert out_magnitude < 100, f"Output magnitude too large: {out_magnitude}"

        print("✅ All features enabled test PASSED")

    except RuntimeError as e:
        # Some feature combinations might not be supported
        pytest.skip(f"All-features combination not supported: {e}")


# ============================================================================
# Runtime Safety Tests
# ============================================================================


@pytest.mark.parametrize("seqlen", [128, 512, 1024])
@pytest.mark.parametrize("nheads", [8, 32])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_saux_shared_memory_efficiency(seqlen, nheads, dtype):
    """
    Validate that s_aux doesn't significantly impact shared memory usage.

    Shared memory for s_aux: 64 heads * 2 bytes (FP16/BF16) = 128 bytes
    This is negligible compared to Q/K/V shared memory.

    This test validates:
    1. Kernel launch succeeds (no shared memory overflow)
    2. Performance is maintained (no register spilling)
    """
    batch_size = 4
    d = 64

    q = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    k = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    v = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
    s_aux = torch.randn(nheads, device=device, dtype=dtype) * 0.5

    # Test should complete without shared memory errors
    try:
        out = flash_attn_func(q, k, v, causal=True, s_aux=s_aux)

        assert out.shape == q.shape
        assert not torch.isnan(out).any()
        assert not torch.isinf(out).any()

        print(f"✅ Shared memory test passed for seqlen={seqlen}, nheads={nheads}")

    except RuntimeError as e:
        if "shared memory" in str(e).lower():
            pytest.fail(f"Shared memory overflow with s_aux: {e}")
        else:
            raise


# ============================================================================
# Main function for quick testing
# ============================================================================

if __name__ == "__main__":
    print("=" * 80)
    print("Attention Sinks Testing Suite")
    print("Supports MHA/MQA/GQA with headdim=64 and headdim=128")
    print("Tests: flash_attn_func with s_aux, alibi, local, softcap")
    print("=" * 80)
    print("\nRunning quick smoke tests...")
    print("\nTest categories:")
    print("  - test_flash_attn_no_saux: Basic attention without s_aux")
    print("  - test_flash_attn_with_saux: Basic attention with s_aux")
    print("  - test_flash_attn_mqa_gqa_with_saux: MQA/GQA support")
    print("  - test_flash_attn_saux_with_features: Combined features (alibi/local/softcap)")
    print("  - test_saux_impact: Verify s_aux actually changes output")
    print("  - test_different_batch_and_head_sizes: Various batch/head configurations")
    print("\nNOTE: varlen tests are disabled (requires PyBind11 recompilation)")
    print("\nTo run all tests: pytest test_attention_sinks_reference.py -v")
    print("To run specific tests: pytest test_attention_sinks_reference.py -k test_name -v")
    print("\n" + "=" * 80)

    # Run a subset of tests for quick validation
    pytest.main([
        __file__,
        "-v",
        "-k", "test_flash_attn_mqa_gqa_with_saux and mha and d128",
        "--tb=short",
        "-x",  # Stop on first failure
        "-s",  # Show print output
        "--maxfail=5"  # Stop after 5 failures
    ])