[5/N][Attention] Finish eliminating `vllm/attention` folder (#32064)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>

[5/N][Attention] Finish eliminating `vllm/attention` folder (#32064)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
a608b4c6 · Matthew Bonanni · GitHub · 1f3a2c29 · a608b4c6 · a608b4c6
Unverified Commit a608b4c6 authored Jan 27, 2026 by Matthew Bonanni Committed by GitHub Jan 27, 2026
20 changed files
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -640,8 +640,9 @@ steps:
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -568,8 +568,9 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT

--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -15,8 +15,9 @@ steps:
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT

--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,8 +2,8 @@
 # for more info about CODEOWNERS file
 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention @LucasWilkinson
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
+/vllm/model_executor/layers/attention @LucasWilkinson
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep

--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -29,7 +29,7 @@ The initialization code should look like this:
    ```python
    from torch import nn
    from vllm.config import VllmConfig
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention import Attention
    class MyAttention(nn.Module):
        def __init__(self, vllm_config: VllmConfig, prefix: str):

--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example:
 ??? code
    ```python
-    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+    from vllm.model_executor.layers.attention import MMEncoderAttention
    from vllm.model_executor.custom_op import CustomOp

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import (
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.matcher_utils import QUANT_OPS
@@ -40,6 +39,7 @@ from vllm.config import (
    set_current_vllm_config,
 )
 from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
    kFp8StaticTensorSym,

--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@ import pytest
 import torch
 from tests.compile.backend import TestBackend
-from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
@@ -21,6 +20,7 @@ from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform

--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -9,8 +9,7 @@ import torch
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.attention.layer import Attention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed

--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -12,7 +12,7 @@ from unittest.mock import patch
 import pytest
 import torch
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform

--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -5,7 +5,6 @@ import numpy as np
 import pytest
 import torch
-from vllm.attention.layer import Attention
 from vllm.config import (
    AttentionConfig,
    CacheConfig,
@@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import (
    init_distributed_environment,
    initialize_model_parallel,
 )
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams

--- a/tests/v1/worker/test_utils.py
+++ b/tests/v1/worker/test_utils.py
@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
 def test_bind_kv_cache(default_vllm_config):
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention import Attention
    ctx = {
        "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):
 def test_bind_kv_cache_non_attention(default_vllm_config):
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention import Attention
    # example from Jamba PP=2
    ctx = {
@@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config):
 def test_bind_kv_cache_draft_model(default_vllm_config):
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention import Attention
    layer_names = [
        "model.layers.0.attn",

--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -58,7 +58,6 @@ FILES = [
 SEPARATE_GROUPS = [
    "tests",
    # v0 related
-    "vllm/attention",
    "vllm/compilation",
    "vllm/lora",
    "vllm/model_executor",

--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
--- a/vllm/attention/utils/__init__.py
+++ b/vllm/attention/utils/__init__.py
--- a/vllm/attention/utils/kv_sharing_utils.py
+++ b/vllm/attention/utils/kv_sharing_utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-def validate_kv_sharing_target(
-    current_layer_name, target_layer_name, static_forward_context
-):
-    error_msg = (
-        f"Specified KV sharing target layer for {current_layer_name} "
-        f"is not valid: target layer {target_layer_name} "
-    )
-    if current_layer_name == target_layer_name:
-        raise ValueError(error_msg + "cannot be the same as the current layer.")
-    if target_layer_name not in static_forward_context:
-        from vllm.model_executor.models.utils import extract_layer_index
-        # If target layer name is not in the static fwd context, it means either
-        # a) the target layer does not come BEFORE the current layer, or
-        # b) the target layer is not an Attention layer that exists in the model
-        current_layer_idx = extract_layer_index(current_layer_name)
-        target_layer_idx = extract_layer_index(target_layer_name)
-        if current_layer_idx <= target_layer_idx:
-            raise ValueError(error_msg + "must come before the current layer.")
-        else:
-            raise ValueError(error_msg + "is not a valid Attention layer in the model.")
-    # Currently KV sharing is only supported between layers of the same type
-    target_layer_attn_type = static_forward_context[target_layer_name].attn_type
-    expected = static_forward_context[current_layer_name].attn_type
-    if target_layer_attn_type != expected:
-        raise ValueError(
-            error_msg + f"must be the same type as the current layer ({expected})."
-        )
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -11,9 +11,9 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
    kNvfp4Dynamic,

--- a/vllm/compilation/qk_norm_rope_fusion.py
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -10,9 +10,9 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from .fusion import empty_bf16, empty_fp32, empty_i64

--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -8,7 +8,6 @@ from typing import Any
 import torch
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
 from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data
@@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
 )
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
 from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash

--- a/vllm/model_executor/layers/attention/__init__.py
+++ b/vllm/model_executor/layers/attention/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention.chunked_local_attention import (
+    ChunkedLocalAttention,
+)
+from vllm.model_executor.layers.attention.cross_attention import CrossAttention
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
+from vllm.model_executor.layers.attention.mla_attention import MLAAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention.static_sink_attention import (
+    StaticSinkAttention,
+)
+__all__ = [
+    "Attention",
+    "ChunkedLocalAttention",
+    "CrossAttention",
+    "EncoderOnlyAttention",
+    "MLAAttention",
+    "MMEncoderAttention",
+    "StaticSinkAttention",
+]