[1/N][Attention] Restructure attention: move files (#31916)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>

[1/N][Attention] Restructure attention: move files (#31916)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2612ba92 · Matthew Bonanni · GitHub · 1f8b7c53 · 2612ba92 · 2612ba92
Unverified Commit 2612ba92 authored Jan 09, 2026 by Matthew Bonanni Committed by GitHub Jan 09, 2026
20 changed files
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1105,8 +1105,8 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -954,8 +954,8 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py

--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -90,8 +90,8 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py

--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,7 +3,6 @@

 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
@@ -27,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -117,15 +117,15 @@ mkdocs.yaml @hmellor
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

 # Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep

 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /vllm/**/*rocm* @tjtanaa
 /docker/Dockerfile.rocm* @gshtras @tjtanaa
 /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
 /csrc/rocm @gshtras @tjtanaa
 /requirements/*rocm* @tjtanaa

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -222,10 +222,10 @@ pull_request_rules:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
      - files~=^requirements/rocm.*\.txt
-      - files~=^vllm/attention/backends/rocm.*\.py
-      - files~=^vllm/attention/ops/rocm.*\.py
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/rocm.*\.py
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^vllm/v1/attention/ops/rocm.*\.py
      - files~=^tests/kernels/.*_rocm.*\.py
      - files=vllm/platforms/rocm.py
      - title~=(?i)AMD

--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,9 +7,6 @@ import torch
 from tabulate import tabulate

 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
@@ -17,6 +14,9 @@ from vllm.utils.torch_utils import (
    create_kv_caches_with_random_flash,
    set_random_seed,
 )
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)

 logger = init_logger(__name__)


--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
 Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
-It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend.
+It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
 Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
 Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
 The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -60,7 +60,7 @@ For example:
 **1. Attention:**

 ```python
--8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"

 --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
 ```

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -124,7 +124,7 @@ Every plugin has three parts:

    Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.

-5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
+5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.

 6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports:

@@ -153,5 +153,5 @@ The interface for the model/module may change during vLLM's development. If you

 !!! warning "Deprecations"
    - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
-    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
    - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -4,10 +4,10 @@
 from argparse import Namespace

 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum


 def parse_args():

--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -4,10 +4,10 @@
 from argparse import Namespace

 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum


 def parse_args():

--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -9,10 +9,10 @@ import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum


 @contextlib.contextmanager

--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -10,10 +10,10 @@ import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from ...utils import create_new_process_for_each_test


--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -9,8 +9,6 @@ from tests.compile.backend import LazyInitPass, TestBackend
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
@@ -37,6 +35,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import AttentionSpec

 FP8_DTYPE = current_platform.fp8_dtype()

--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@ import pytest
 import torch

 from tests.compile.backend import TestBackend
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
@@ -25,6 +24,7 @@ from vllm.config import (
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionType

 RSQRT_OP = torch.ops.aten.rsqrt.default
 INDEX_SELECT_OP = torch.ops.aten.index.Tensor

--- a/tests/config/test_multimodal_config.py
+++ b/tests/config/test_multimodal_config.py
@@ -3,8 +3,8 @@

 import pytest

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.multimodal import MultiModalConfig
+from vllm.v1.attention.backends.registry import AttentionBackendEnum


 def test_mm_encoder_attn_backend_str_conversion():

--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -299,7 +299,7 @@ def test_compilation_config():


 def test_attention_config():
-    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())


--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -6,9 +6,9 @@ import pytest
 import torch

 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available

 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]

--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -10,7 +10,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed
@@ -30,7 +30,7 @@ NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing

 # This should be sync with get_supported_head_sizes() in
-# vllm.attention.ops.paged_attn.PagedAttention
+# vllm.v1.attention.ops.paged_attn.PagedAttention
 HEAD_SIZES = [32, 80, 128, 256]

 BLOCK_SIZES = [16, 32]

--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -6,13 +6,13 @@ from unittest.mock import patch
 import pytest
 import torch

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend


 @pytest.fixture(autouse=True)
@@ -182,7 +182,7 @@ def test_backend_selection(
                        expected = name
                        assert backend.get_name() == expected
                    elif name == "FLASH_ATTN_MLA":
-                        from vllm.attention.utils.fa_utils import (
+                        from vllm.v1.attention.backends.fa_utils import (
                            flash_attn_supports_mla,
                        )