Unverified commit 430dd4d9, authored by Matthew Bonanni and committed by GitHub
Browse files

[Attention] Remove imports from `vllm/attention/__init__.py` (#29342)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
parent c4c0354e
......@@ -30,7 +30,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
from vllm.attention import Attention
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
......
......@@ -29,7 +29,7 @@ import torch
from torch import nn
from transformers import StableLmConfig
from vllm.attention import Attention
from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
......
......@@ -28,7 +28,7 @@ import torch
from torch import nn
from transformers import Starcoder2Config
from vllm.attention import Attention
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
......
......@@ -9,7 +9,7 @@ from typing import Any
import torch
from torch import nn
from vllm.attention import Attention
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (
......
......@@ -27,7 +27,8 @@ from torch import nn
from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from vllm.attention import Attention, AttentionType
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group
......
......@@ -16,8 +16,8 @@ from transformers import (
)
from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention import Attention, AttentionType
from vllm.attention.layer import MultiHeadAttention
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layers.cross_attention import CrossAttention
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
......
......@@ -335,7 +335,7 @@ class CudaPlatformBase(Platform):
use_sparse: bool,
attn_type: str | None = None,
) -> str:
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
if attn_type is None:
attn_type = AttentionType.DECODER
......
......@@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""CPU attention supports decoder and encoder-only attention."""
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
return attn_type in (
AttentionType.DECODER,
......
......@@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""FlashAttention supports all attention types."""
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
return attn_type in (
AttentionType.DECODER,
......
......@@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""FlexAttention supports both decoder and encoder-only attention."""
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
......
......@@ -4,7 +4,7 @@ from collections.abc import Iterator
import torch
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.platforms import current_platform
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
......
......@@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
if TYPE_CHECKING:
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
logger = init_logger(__name__)
......
......@@ -5,7 +5,7 @@ import numpy as np
import torch
from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
......
......@@ -19,12 +19,13 @@ import torch.nn as nn
from tqdm import tqdm
import vllm.envs as envs
from vllm.attention import Attention, AttentionType
from vllm.attention.backends.abstract import (
AttentionBackend,
AttentionMetadata,
AttentionType,
MultipleOf,
)
from vllm.attention.layer import Attention
from vllm.compilation.counter import compilation_counter
from vllm.compilation.cuda_graph import CUDAGraphWrapper
from vllm.compilation.monitor import set_cudagraph_capturing_enabled
......
......@@ -13,7 +13,7 @@ from typing import (
import torch
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.config.cache import CacheDType
from vllm.distributed.kv_transfer import (
......
......@@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs
import torch_xla.runtime as xr
import vllm.envs as envs
from vllm.attention import Attention
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import MLAAttention
from vllm.attention.layer import Attention, MLAAttention
from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
from vllm.config import (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment