Unverified commit a608b4c6, authored by Matthew Bonanni and committed by GitHub
Browse files

[5/N][Attention] Finish eliminating `vllm/attention` folder (#32064)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
parent 1f3a2c29
...@@ -640,8 +640,9 @@ steps: ...@@ -640,8 +640,9 @@ steps:
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
......
...@@ -568,8 +568,9 @@ steps: ...@@ -568,8 +568,9 @@ steps:
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
......
...@@ -15,8 +15,9 @@ steps: ...@@ -15,8 +15,9 @@ steps:
timeout_in_minutes: 35 timeout_in_minutes: 35
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This list covers the "core" components of vLLM that require careful review # This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
......
...@@ -29,7 +29,7 @@ The initialization code should look like this: ...@@ -29,7 +29,7 @@ The initialization code should look like this:
```python ```python
from torch import nn from torch import nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.attention.layer import Attention from vllm.model_executor.layers.attention import Attention
class MyAttention(nn.Module): class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):
......
...@@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example: ...@@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example:
??? code ??? code
```python ```python
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
......
...@@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import ( ...@@ -21,7 +21,6 @@ from tests.compile.fusion_test_utils import (
from tests.utils import flat_product from tests.utils import flat_product
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention.layer import Attention
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.matcher_utils import QUANT_OPS from vllm.compilation.matcher_utils import QUANT_OPS
...@@ -40,6 +39,7 @@ from vllm.config import ( ...@@ -40,6 +39,7 @@ from vllm.config import (
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.forward_context import get_forward_context, set_forward_context from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, QuantKey,
kFp8StaticTensorSym, kFp8StaticTensorSym,
......
...@@ -5,7 +5,6 @@ import pytest ...@@ -5,7 +5,6 @@ import pytest
import torch import torch
from tests.compile.backend import TestBackend from tests.compile.backend import TestBackend
from vllm.attention.layer import Attention
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.post_cleanup import PostCleanupPass
...@@ -21,6 +20,7 @@ from vllm.config import ( ...@@ -21,6 +20,7 @@ from vllm.config import (
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
...@@ -9,8 +9,7 @@ import torch ...@@ -9,8 +9,7 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.layer import Attention from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
......
...@@ -12,7 +12,7 @@ from unittest.mock import patch ...@@ -12,7 +12,7 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform from vllm.platforms.cuda import CudaPlatform
......
...@@ -5,7 +5,6 @@ import numpy as np ...@@ -5,7 +5,6 @@ import numpy as np
import pytest import pytest
import torch import torch
from vllm.attention.layer import Attention
from vllm.config import ( from vllm.config import (
AttentionConfig, AttentionConfig,
CacheConfig, CacheConfig,
...@@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import ( ...@@ -19,6 +18,7 @@ from vllm.distributed.parallel_state import (
init_distributed_environment, init_distributed_environment,
initialize_model_parallel, initialize_model_parallel,
) )
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
......
...@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache ...@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
def test_bind_kv_cache(default_vllm_config): def test_bind_kv_cache(default_vllm_config):
from vllm.attention.layer import Attention from vllm.model_executor.layers.attention import Attention
ctx = { ctx = {
"layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"), "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
...@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config): ...@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):
def test_bind_kv_cache_non_attention(default_vllm_config): def test_bind_kv_cache_non_attention(default_vllm_config):
from vllm.attention.layer import Attention from vllm.model_executor.layers.attention import Attention
# example from Jamba PP=2 # example from Jamba PP=2
ctx = { ctx = {
...@@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config): ...@@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config):
def test_bind_kv_cache_draft_model(default_vllm_config): def test_bind_kv_cache_draft_model(default_vllm_config):
from vllm.attention.layer import Attention from vllm.model_executor.layers.attention import Attention
layer_names = [ layer_names = [
"model.layers.0.attn", "model.layers.0.attn",
......
...@@ -58,7 +58,6 @@ FILES = [ ...@@ -58,7 +58,6 @@ FILES = [
SEPARATE_GROUPS = [ SEPARATE_GROUPS = [
"tests", "tests",
# v0 related # v0 related
"vllm/attention",
"vllm/compilation", "vllm/compilation",
"vllm/lora", "vllm/lora",
"vllm/model_executor", "vllm/model_executor",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def validate_kv_sharing_target(
    current_layer_name, target_layer_name, static_forward_context
):
    """Validate that ``target_layer_name`` can be the KV sharing target.

    Args:
        current_layer_name: Name of the layer that wants to share KV.
        target_layer_name: Name of the layer whose KV would be shared.
        static_forward_context: Mapping from layer name to a layer object
            exposing an ``attn_type`` attribute. Both the target and the
            current layer are looked up in it.

    Returns:
        None when every check passes.

    Raises:
        ValueError: If the target is the current layer itself, is missing
            from ``static_forward_context``, or has a different
            ``attn_type`` than the current layer.
    """
    prefix = (
        f"Specified KV sharing target layer for {current_layer_name} "
        f"is not valid: target layer {target_layer_name} "
    )

    def _invalid(reason):
        # Every failure shares the same message prefix; only the tail varies.
        return ValueError(prefix + reason)

    if current_layer_name == target_layer_name:
        raise _invalid("cannot be the same as the current layer.")

    if target_layer_name not in static_forward_context:
        # Imported lazily: only this failure path needs the helper.
        from vllm.model_executor.models.utils import extract_layer_index

        # A target absent from the static forward context either
        #   a) does not come BEFORE the current layer, or
        #   b) is not an Attention layer that exists in the model.
        # The layer indices decide which of the two messages is reported.
        current_idx = extract_layer_index(current_layer_name)
        target_idx = extract_layer_index(target_layer_name)
        reason = (
            "must come before the current layer."
            if current_idx <= target_idx
            else "is not a valid Attention layer in the model."
        )
        raise _invalid(reason)

    # KV sharing is currently only supported between layers of the same type.
    target_type = static_forward_context[target_layer_name].attn_type
    current_type = static_forward_context[current_layer_name].attn_type
    if target_type != current_type:
        raise _invalid(
            f"must be the same type as the current layer ({current_type})."
        )
...@@ -11,9 +11,9 @@ from torch import fx ...@@ -11,9 +11,9 @@ from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass from torch._inductor.pattern_matcher import PatternMatcherPass
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, QuantKey,
kNvfp4Dynamic, kNvfp4Dynamic,
......
...@@ -10,9 +10,9 @@ from torch import fx ...@@ -10,9 +10,9 @@ from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass from torch._inductor.pattern_matcher import PatternMatcherPass
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from .fusion import empty_bf16, empty_fp32, empty_i64 from .fusion import empty_bf16, empty_fp32, empty_i64
......
...@@ -8,7 +8,6 @@ from typing import Any ...@@ -8,7 +8,6 @@ from typing import Any
import torch import torch
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data
...@@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( ...@@ -25,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
) )
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import BlockHash from vllm.v1.core.kv_cache_utils import BlockHash
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.model_executor.layers.attention.attention import Attention
from vllm.model_executor.layers.attention.chunked_local_attention import (
ChunkedLocalAttention,
)
from vllm.model_executor.layers.attention.cross_attention import CrossAttention
from vllm.model_executor.layers.attention.encoder_only_attention import (
EncoderOnlyAttention,
)
from vllm.model_executor.layers.attention.mla_attention import MLAAttention
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention.static_sink_attention import (
StaticSinkAttention,
)
# Public API of this package: the attention layer classes imported above,
# re-exported so callers can import them from the package directly.
__all__ = [
"Attention",
"ChunkedLocalAttention",
"CrossAttention",
"EncoderOnlyAttention",
"MLAAttention",
"MMEncoderAttention",
"StaticSinkAttention",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment