Unverified commit 430dd4d9, authored by Matthew Bonanni and committed by GitHub
Browse files

[Attention] Remove imports from `vllm/attention/__init__.py` (#29342)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
parent c4c0354e
...@@ -30,7 +30,7 @@ import torch ...@@ -30,7 +30,7 @@ import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
......
...@@ -29,7 +29,7 @@ import torch ...@@ -29,7 +29,7 @@ import torch
from torch import nn from torch import nn
from transformers import StableLmConfig from transformers import StableLmConfig
from vllm.attention import Attention from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
......
...@@ -28,7 +28,7 @@ import torch ...@@ -28,7 +28,7 @@ import torch
from torch import nn from torch import nn
from transformers import Starcoder2Config from transformers import Starcoder2Config
from vllm.attention import Attention from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
......
...@@ -9,7 +9,7 @@ from typing import Any ...@@ -9,7 +9,7 @@ from typing import Any
import torch import torch
from torch import nn from torch import nn
from vllm.attention import Attention from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import ( from vllm.distributed import (
......
...@@ -27,7 +27,8 @@ from torch import nn ...@@ -27,7 +27,8 @@ from torch import nn
from transformers import AutoModel from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from vllm.attention import Attention, AttentionType from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.config.utils import getattr_iter from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed import get_pp_group, get_tp_group
......
...@@ -16,8 +16,8 @@ from transformers import ( ...@@ -16,8 +16,8 @@ from transformers import (
) )
from transformers.models.whisper.modeling_whisper import sinusoids from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention import Attention, AttentionType from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import MultiHeadAttention from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layers.cross_attention import CrossAttention from vllm.attention.layers.cross_attention import CrossAttention
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
......
...@@ -335,7 +335,7 @@ class CudaPlatformBase(Platform): ...@@ -335,7 +335,7 @@ class CudaPlatformBase(Platform):
use_sparse: bool, use_sparse: bool,
attn_type: str | None = None, attn_type: str | None = None,
) -> str: ) -> str:
from vllm.attention import AttentionType from vllm.attention.backends.abstract import AttentionType
if attn_type is None: if attn_type is None:
attn_type = AttentionType.DECODER attn_type = AttentionType.DECODER
......
...@@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend): ...@@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend):
@classmethod @classmethod
def supports_attn_type(cls, attn_type: str) -> bool: def supports_attn_type(cls, attn_type: str) -> bool:
"""CPU attention supports decoder and encoder-only attention.""" """CPU attention supports decoder and encoder-only attention."""
from vllm.attention import AttentionType from vllm.attention.backends.abstract import AttentionType
return attn_type in ( return attn_type in (
AttentionType.DECODER, AttentionType.DECODER,
......
...@@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend): ...@@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend):
@classmethod @classmethod
def supports_attn_type(cls, attn_type: str) -> bool: def supports_attn_type(cls, attn_type: str) -> bool:
"""FlashAttention supports all attention types.""" """FlashAttention supports all attention types."""
from vllm.attention import AttentionType from vllm.attention.backends.abstract import AttentionType
return attn_type in ( return attn_type in (
AttentionType.DECODER, AttentionType.DECODER,
......
...@@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend): ...@@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend):
@classmethod @classmethod
def supports_attn_type(cls, attn_type: str) -> bool: def supports_attn_type(cls, attn_type: str) -> bool:
"""FlexAttention supports both decoder and encoder-only attention.""" """FlexAttention supports both decoder and encoder-only attention."""
from vllm.attention import AttentionType from vllm.attention.backends.abstract import AttentionType
return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
......
...@@ -4,7 +4,7 @@ from collections.abc import Iterator ...@@ -4,7 +4,7 @@ from collections.abc import Iterator
import torch import torch
from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
......
...@@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager ...@@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.worker.worker import OffloadingHandler from vllm.v1.kv_offload.worker.worker import OffloadingHandler
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig from vllm.config import VllmConfig
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -5,7 +5,7 @@ import numpy as np ...@@ -5,7 +5,7 @@ import numpy as np
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
......
...@@ -19,12 +19,13 @@ import torch.nn as nn ...@@ -19,12 +19,13 @@ import torch.nn as nn
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs import vllm.envs as envs
from vllm.attention import Attention, AttentionType
from vllm.attention.backends.abstract import ( from vllm.attention.backends.abstract import (
AttentionBackend, AttentionBackend,
AttentionMetadata, AttentionMetadata,
AttentionType,
MultipleOf, MultipleOf,
) )
from vllm.attention.layer import Attention
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.cuda_graph import CUDAGraphWrapper
from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.compilation.monitor import set_cudagraph_capturing_enabled
......
...@@ -13,7 +13,7 @@ from typing import ( ...@@ -13,7 +13,7 @@ from typing import (
import torch import torch
from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.cache import CacheDType from vllm.config.cache import CacheDType
from vllm.distributed.kv_transfer import ( from vllm.distributed.kv_transfer import (
......
...@@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs ...@@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs
import torch_xla.runtime as xr import torch_xla.runtime as xr
import vllm.envs as envs import vllm.envs as envs
from vllm.attention import Attention
from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import MLAAttention from vllm.attention.layer import Attention, MLAAttention
from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
from vllm.config import ( from vllm.config import (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.