Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a608b4c6
Unverified
Commit
a608b4c6
authored
Jan 27, 2026
by
Matthew Bonanni
Committed by
GitHub
Jan 27, 2026
Browse files
[5/N][Attention] Finish eliminating `vllm/attention` folder (#32064)
Signed-off-by:
Matthew Bonanni
<
mbonanni@redhat.com
>
parent
1f3a2c29
Changes
151
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
46 additions
and
52 deletions
+46
-52
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+2
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-1
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/kernels.yaml
+2
-1
.github/CODEOWNERS
.github/CODEOWNERS
+1
-1
docs/contributing/model/basic.md
docs/contributing/model/basic.md
+1
-1
docs/design/custom_op.md
docs/design/custom_op.md
+1
-1
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+1
-1
tests/compile/test_qk_norm_rope_fusion.py
tests/compile/test_qk_norm_rope_fusion.py
+1
-1
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention.py
+1
-2
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mha_attn.py
+1
-1
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+1
-1
tests/v1/worker/test_utils.py
tests/v1/worker/test_utils.py
+3
-3
tools/pre_commit/mypy.py
tools/pre_commit/mypy.py
+0
-1
vllm/attention/__init__.py
vllm/attention/__init__.py
+0
-0
vllm/attention/utils/__init__.py
vllm/attention/utils/__init__.py
+0
-0
vllm/attention/utils/kv_sharing_utils.py
vllm/attention/utils/kv_sharing_utils.py
+0
-33
vllm/compilation/fusion_attn.py
vllm/compilation/fusion_attn.py
+1
-1
vllm/compilation/qk_norm_rope_fusion.py
vllm/compilation/qk_norm_rope_fusion.py
+1
-1
vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
...buted/kv_transfer/kv_connector/v1/offloading_connector.py
+1
-1
vllm/model_executor/layers/attention/__init__.py
vllm/model_executor/layers/attention/__init__.py
+26
-0
No files found.
.buildkite/test-amd.yaml
View file @
a608b4c6
...
@@ -640,8 +640,9 @@ steps:
...
@@ -640,8 +640,9 @@ steps:
# grade: Blocking
# grade: Blocking
source_file_dependencies
:
source_file_dependencies
:
-
csrc/attention/
-
csrc/attention/
-
vllm/attention
-
vllm/v1/attention
-
vllm/v1/attention
# TODO
:
remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
-
vllm/model_executor/layers/attention
-
tests/kernels/attention
-
tests/kernels/attention
commands
:
commands
:
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
...
...
.buildkite/test-pipeline.yaml
View file @
a608b4c6
...
@@ -568,8 +568,9 @@ steps:
...
@@ -568,8 +568,9 @@ steps:
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
source_file_dependencies
:
source_file_dependencies
:
-
csrc/attention/
-
csrc/attention/
-
vllm/attention
-
vllm/v1/attention
-
vllm/v1/attention
# TODO
:
remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
-
vllm/model_executor/layers/attention
-
tests/kernels/attention
-
tests/kernels/attention
commands
:
commands
:
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
...
...
.buildkite/test_areas/kernels.yaml
View file @
a608b4c6
...
@@ -15,8 +15,9 @@ steps:
...
@@ -15,8 +15,9 @@ steps:
timeout_in_minutes
:
35
timeout_in_minutes
:
35
source_file_dependencies
:
source_file_dependencies
:
-
csrc/attention/
-
csrc/attention/
-
vllm/attention
-
vllm/v1/attention
-
vllm/v1/attention
# TODO
:
remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
-
vllm/model_executor/layers/attention
-
tests/kernels/attention
-
tests/kernels/attention
commands
:
commands
:
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
...
...
.github/CODEOWNERS
View file @
a608b4c6
...
@@ -2,8 +2,8 @@
...
@@ -2,8 +2,8 @@
# for more info about CODEOWNERS file
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/layers/mamba @tdoublep
...
...
docs/contributing/model/basic.md
View file @
a608b4c6
...
@@ -29,7 +29,7 @@ The initialization code should look like this:
...
@@ -29,7 +29,7 @@ The initialization code should look like this:
```python
```python
from torch import nn
from torch import nn
from vllm.config import VllmConfig
from vllm.config import VllmConfig
from vllm.attention
.layer
import Attention
from vllm.
model_executor.layers.
attention import Attention
class MyAttention(nn.Module):
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
def __init__(self, vllm_config: VllmConfig, prefix: str):
...
...
docs/design/custom_op.md
View file @
a608b4c6
...
@@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example:
...
@@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example:
??? code
??? code
```python
```python
from vllm.
attention.layers.mm_encoder_
attention import MMEncoderAttention
from vllm.
model_executor.layers.
attention import MMEncoderAttention
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.custom_op import CustomOp
...
...
tests/compile/test_fusion_attn.py
View file @
a608b4c6
...
@@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import (
...
@@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import (
from
tests.utils
import
flat_product
from
tests.utils
import
flat_product
from
tests.v1.attention.utils
import
BatchSpec
,
create_common_attn_metadata
from
tests.v1.attention.utils
import
BatchSpec
,
create_common_attn_metadata
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
from
vllm.attention.layer
import
Attention
from
vllm.compilation.fusion_attn
import
ATTN_OP
,
AttnFusionPass
from
vllm.compilation.fusion_attn
import
ATTN_OP
,
AttnFusionPass
from
vllm.compilation.fx_utils
import
find_op_nodes
from
vllm.compilation.fx_utils
import
find_op_nodes
from
vllm.compilation.matcher_utils
import
QUANT_OPS
from
vllm.compilation.matcher_utils
import
QUANT_OPS
...
@@ -40,6 +39,7 @@ from vllm.config import (
...
@@ -40,6 +39,7 @@ from vllm.config import (
set_current_vllm_config
,
set_current_vllm_config
,
)
)
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
QuantKey
,
QuantKey
,
kFp8StaticTensorSym
,
kFp8StaticTensorSym
,
...
...
tests/compile/test_qk_norm_rope_fusion.py
View file @
a608b4c6
...
@@ -5,7 +5,6 @@ import pytest
...
@@ -5,7 +5,6 @@ import pytest
import
torch
import
torch
from
tests.compile.backend
import
TestBackend
from
tests.compile.backend
import
TestBackend
from
vllm.attention.layer
import
Attention
from
vllm.compilation.matcher_utils
import
FLASHINFER_ROTARY_OP
,
RMS_OP
,
ROTARY_OP
from
vllm.compilation.matcher_utils
import
FLASHINFER_ROTARY_OP
,
RMS_OP
,
ROTARY_OP
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.post_cleanup
import
PostCleanupPass
from
vllm.compilation.post_cleanup
import
PostCleanupPass
...
@@ -21,6 +20,7 @@ from vllm.config import (
...
@@ -21,6 +20,7 @@ from vllm.config import (
VllmConfig
,
VllmConfig
,
set_current_vllm_config
,
set_current_vllm_config
,
)
)
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
...
tests/kernels/attention/test_attention.py
View file @
a608b4c6
...
@@ -9,8 +9,7 @@ import torch
...
@@ -9,8 +9,7 @@ import torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.layer
import
Attention
from
vllm.model_executor.layers.attention
import
Attention
,
MMEncoderAttention
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.mem_utils
import
get_max_shared_memory_bytes
from
vllm.utils.mem_utils
import
get_max_shared_memory_bytes
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.utils.torch_utils
import
set_random_seed
...
...
tests/kernels/attention/test_mha_attn.py
View file @
a608b4c6
...
@@ -12,7 +12,7 @@ from unittest.mock import patch
...
@@ -12,7 +12,7 @@ from unittest.mock import patch
import
pytest
import
pytest
import
torch
import
torch
from
vllm.model_executor.layers.attention
.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.attention
import
MMEncoderAttention
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cpu
import
CpuPlatform
from
vllm.platforms.cuda
import
CudaPlatform
from
vllm.platforms.cuda
import
CudaPlatform
...
...
tests/v1/worker/test_gpu_model_runner.py
View file @
a608b4c6
...
@@ -5,7 +5,6 @@ import numpy as np
...
@@ -5,7 +5,6 @@ import numpy as np
import
pytest
import
pytest
import
torch
import
torch
from
vllm.attention.layer
import
Attention
from
vllm.config
import
(
from
vllm.config
import
(
AttentionConfig
,
AttentionConfig
,
CacheConfig
,
CacheConfig
,
...
@@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import (
...
@@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import (
init_distributed_environment
,
init_distributed_environment
,
initialize_model_parallel
,
initialize_model_parallel
,
)
)
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.mamba.mamba_mixer2
import
MambaMixer2
from
vllm.model_executor.layers.mamba.mamba_mixer2
import
MambaMixer2
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
...
...
tests/v1/worker/test_utils.py
View file @
a608b4c6
...
@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
...
@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
def
test_bind_kv_cache
(
default_vllm_config
):
def
test_bind_kv_cache
(
default_vllm_config
):
from
vllm.attention
.layer
import
Attention
from
vllm.
model_executor.layers.
attention
import
Attention
ctx
=
{
ctx
=
{
"layers.0.self_attn"
:
Attention
(
32
,
128
,
0.1
,
prefix
=
"layers.0.self_attn"
),
"layers.0.self_attn"
:
Attention
(
32
,
128
,
0.1
,
prefix
=
"layers.0.self_attn"
),
...
@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):
...
@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):
def
test_bind_kv_cache_non_attention
(
default_vllm_config
):
def
test_bind_kv_cache_non_attention
(
default_vllm_config
):
from
vllm.attention
.layer
import
Attention
from
vllm.
model_executor.layers.
attention
import
Attention
# example from Jamba PP=2
# example from Jamba PP=2
ctx
=
{
ctx
=
{
...
@@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config):
...
@@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config):
def
test_bind_kv_cache_draft_model
(
default_vllm_config
):
def
test_bind_kv_cache_draft_model
(
default_vllm_config
):
from
vllm.attention
.layer
import
Attention
from
vllm.
model_executor.layers.
attention
import
Attention
layer_names
=
[
layer_names
=
[
"model.layers.0.attn"
,
"model.layers.0.attn"
,
...
...
tools/pre_commit/mypy.py
View file @
a608b4c6
...
@@ -58,7 +58,6 @@ FILES = [
...
@@ -58,7 +58,6 @@ FILES = [
SEPARATE_GROUPS
=
[
SEPARATE_GROUPS
=
[
"tests"
,
"tests"
,
# v0 related
# v0 related
"vllm/attention"
,
"vllm/compilation"
,
"vllm/compilation"
,
"vllm/lora"
,
"vllm/lora"
,
"vllm/model_executor"
,
"vllm/model_executor"
,
...
...
vllm/attention/__init__.py
deleted
100644 → 0
View file @
1f3a2c29
vllm/attention/utils/__init__.py
deleted
100644 → 0
View file @
1f3a2c29
vllm/attention/utils/kv_sharing_utils.py
deleted
100644 → 0
View file @
1f3a2c29
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def
validate_kv_sharing_target
(
current_layer_name
,
target_layer_name
,
static_forward_context
):
error_msg
=
(
f
"Specified KV sharing target layer for
{
current_layer_name
}
"
f
"is not valid: target layer
{
target_layer_name
}
"
)
if
current_layer_name
==
target_layer_name
:
raise
ValueError
(
error_msg
+
"cannot be the same as the current layer."
)
if
target_layer_name
not
in
static_forward_context
:
from
vllm.model_executor.models.utils
import
extract_layer_index
# If target layer name is not in the static fwd context, it means either
# a) the target layer does not come BEFORE the current layer, or
# b) the target layer is not an Attention layer that exists in the model
current_layer_idx
=
extract_layer_index
(
current_layer_name
)
target_layer_idx
=
extract_layer_index
(
target_layer_name
)
if
current_layer_idx
<=
target_layer_idx
:
raise
ValueError
(
error_msg
+
"must come before the current layer."
)
else
:
raise
ValueError
(
error_msg
+
"is not a valid Attention layer in the model."
)
# Currently KV sharing is only supported between layers of the same type
target_layer_attn_type
=
static_forward_context
[
target_layer_name
].
attn_type
expected
=
static_forward_context
[
current_layer_name
].
attn_type
if
target_layer_attn_type
!=
expected
:
raise
ValueError
(
error_msg
+
f
"must be the same type as the current layer (
{
expected
}
)."
)
vllm/compilation/fusion_attn.py
View file @
a608b4c6
...
@@ -11,9 +11,9 @@ from torch import fx
...
@@ -11,9 +11,9 @@ from torch import fx
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
vllm.attention.layer
import
Attention
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
QuantKey
,
QuantKey
,
kNvfp4Dynamic
,
kNvfp4Dynamic
,
...
...
vllm/compilation/qk_norm_rope_fusion.py
View file @
a608b4c6
...
@@ -10,9 +10,9 @@ from torch import fx
...
@@ -10,9 +10,9 @@ from torch import fx
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._higher_order_ops.auto_functionalize
import
auto_functionalized
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
vllm.attention.layer
import
Attention
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
.fusion
import
empty_bf16
,
empty_fp32
,
empty_i64
from
.fusion
import
empty_bf16
,
empty_fp32
,
empty_i64
...
...
vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
View file @
a608b4c6
...
@@ -8,7 +8,6 @@ from typing import Any
...
@@ -8,7 +8,6 @@ from typing import Any
import
torch
import
torch
from
vllm.attention.layer
import
Attention
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
from
vllm.distributed.kv_events
import
BlockRemoved
,
BlockStored
,
KVCacheEvent
from
vllm.distributed.kv_events
import
BlockRemoved
,
BlockStored
,
KVCacheEvent
from
vllm.distributed.kv_transfer.kv_connector.utils
import
yield_req_data
from
vllm.distributed.kv_transfer.kv_connector.utils
import
yield_req_data
...
@@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
...
@@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
)
)
from
vllm.forward_context
import
ForwardContext
from
vllm.forward_context
import
ForwardContext
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.v1.attention.backend
import
AttentionBackend
,
AttentionMetadata
from
vllm.v1.attention.backend
import
AttentionBackend
,
AttentionMetadata
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.core.kv_cache_utils
import
BlockHash
from
vllm.v1.core.kv_cache_utils
import
BlockHash
...
...
vllm/model_executor/layers/attention/__init__.py
View file @
a608b4c6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.model_executor.layers.attention.attention
import
Attention
from
vllm.model_executor.layers.attention.chunked_local_attention
import
(
ChunkedLocalAttention
,
)
from
vllm.model_executor.layers.attention.cross_attention
import
CrossAttention
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.attention.mla_attention
import
MLAAttention
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.attention.static_sink_attention
import
(
StaticSinkAttention
,
)
__all__
=
[
"Attention"
,
"ChunkedLocalAttention"
,
"CrossAttention"
,
"EncoderOnlyAttention"
,
"MLAAttention"
,
"MMEncoderAttention"
,
"StaticSinkAttention"
,
]
Prev
1
2
3
4
5
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment