Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2612ba92
Unverified
Commit
2612ba92
authored
Jan 09, 2026
by
Matthew Bonanni
Committed by
GitHub
Jan 09, 2026
Browse files
[1/N][Attention] Restructure attention: move files (#31916)
Signed-off-by:
Matthew Bonanni
<
mbonanni@redhat.com
>
parent
1f8b7c53
Changes
195
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
44 additions
and
38 deletions
+44
-38
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+1
-1
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+4
-2
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+2
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+2
-2
vllm/model_executor/models/qwen3.py
vllm/model_executor/models/qwen3.py
+1
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+1
-1
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
+1
-1
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+1
-1
vllm/model_executor/models/seed_oss.py
vllm/model_executor/models/seed_oss.py
+1
-1
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+4
-2
vllm/model_executor/models/siglip2.py
vllm/model_executor/models/siglip2.py
+1
-1
vllm/model_executor/models/siglip2navit.py
vllm/model_executor/models/siglip2navit.py
+1
-1
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+1
-1
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/base.py
+4
-2
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+1
-1
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+5
-5
vllm/model_executor/models/whisper_utils.py
vllm/model_executor/models/whisper_utils.py
+5
-5
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+2
-2
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+4
-4
vllm/platforms/interface.py
vllm/platforms/interface.py
+2
-2
No files found.
vllm/model_executor/models/plamo2.py
View file @
2612ba92
...
...
@@ -9,7 +9,6 @@ import torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
...
...
@@ -66,6 +65,7 @@ from vllm.model_executor.models.utils import (
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.mamba2_attn
import
Mamba2AttentionMetadata
...
...
vllm/model_executor/models/qwen2.py
View file @
2612ba92
...
...
@@ -33,13 +33,14 @@ import torch
from
torch
import
nn
from
transformers
import
Qwen2Config
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -59,6 +60,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
is_interleaved
,
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
2612ba92
...
...
@@ -41,8 +41,6 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLVisionConfig
,
)
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.distributed
import
parallel_state
...
...
@@ -50,6 +48,7 @@ from vllm.distributed import utils as dist_utils
from
vllm.forward_context
import
set_forward_context
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
...
...
@@ -83,6 +82,7 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.platform_utils
import
is_pin_memory_available
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
2612ba92
...
...
@@ -43,14 +43,13 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
transformers.models.qwen2_vl.video_processing_qwen2_vl
import
Qwen2VLVideoProcessor
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -90,6 +89,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen3.py
View file @
2612ba92
...
...
@@ -30,7 +30,6 @@ import torch
from
torch
import
nn
from
transformers
import
Qwen3Config
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -44,6 +43,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.qwen2
import
Qwen2MLP
as
Qwen3MLP
...
...
vllm/model_executor/models/qwen3_next.py
View file @
2612ba92
...
...
@@ -10,7 +10,6 @@ from einops import rearrange
from
torch
import
nn
from
transformers.activations
import
ACT2FN
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
...
...
@@ -75,6 +74,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.transformers_utils.configs
import
Qwen3NextConfig
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.gdn_attn
import
GDNAttentionMetadata
from
.interfaces
import
(
...
...
vllm/model_executor/models/qwen3_omni_moe_thinker.py
View file @
2612ba92
...
...
@@ -46,7 +46,6 @@ from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
)
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
...
...
@@ -75,6 +74,7 @@ from vllm.multimodal.processing import (
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
2612ba92
...
...
@@ -48,7 +48,6 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
)
from
transformers.video_utils
import
VideoMetadata
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
...
...
@@ -92,6 +91,7 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/seed_oss.py
View file @
2612ba92
...
...
@@ -30,7 +30,6 @@ import torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
as
SeedOssConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -56,6 +55,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/siglip.py
View file @
2612ba92
...
...
@@ -15,12 +15,14 @@ from transformers import (
SiglipVisionConfig
,
)
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
MultiModalConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/siglip2.py
View file @
2612ba92
...
...
@@ -10,11 +10,11 @@ from torch import nn
from
torch.nn
import
functional
as
F
from
transformers
import
Siglip2VisionConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
...
...
vllm/model_executor/models/siglip2navit.py
View file @
2612ba92
...
...
@@ -11,10 +11,10 @@ from torch.nn import functional as F
from
transformers
import
Siglip2VisionConfig
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
MultiModalConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
2612ba92
...
...
@@ -15,11 +15,11 @@ from torchvision import transforms
from
torchvision.transforms.functional
import
InterpolationMode
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/transformers/base.py
View file @
2612ba92
...
...
@@ -27,13 +27,14 @@ from torch import nn
from
transformers
import
AutoModel
from
transformers.modeling_utils
import
ALL_ATTENTION_FUNCTIONS
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.config.utils
import
getattr_iter
from
vllm.distributed
import
get_pp_group
,
get_tp_group
from
vllm.distributed.utils
import
get_pp_indices
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
from
vllm.model_executor.models.interfaces
import
(
SupportsEagle
,
...
...
@@ -59,6 +60,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedModel
...
...
vllm/model_executor/models/vision.py
View file @
2612ba92
...
...
@@ -10,7 +10,6 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
import
torch
from
transformers
import
PretrainedConfig
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
...
...
@@ -19,6 +18,7 @@ from vllm.distributed import (
)
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/whisper.py
View file @
2612ba92
...
...
@@ -18,18 +18,15 @@ from transformers import (
)
from
transformers.models.whisper.modeling_whisper
import
sinusoids
from
vllm.attention.backends.abstract
import
(
AttentionType
,
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.cross_attention
import
CrossAttention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.inputs.data
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.cross_attention
import
CrossAttention
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
...
...
@@ -62,6 +59,9 @@ from vllm.transformers_utils.processor import cached_processor_from_config
from
vllm.utils.jsontree
import
json_map_leaves
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
vllm.v1.attention.backend
import
(
AttentionType
,
)
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsTranscription
from
.utils
import
(
...
...
vllm/model_executor/models/whisper_utils.py
View file @
2612ba92
...
...
@@ -9,20 +9,20 @@ import torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
vllm.attention.backends.abstract
import
(
from
vllm.attention.layer
import
Attention
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.v1.attention.backend
import
(
AttentionBackend
,
AttentionMetadata
,
AttentionType
,
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.selector
import
get_attn_backend
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.v1.attention.backends.flash_attn
import
FlashAttentionBackend
from
vllm.v1.attention.backends.utils
import
(
CommonAttentionMetadata
,
subclass_attention_backend_with_overrides
,
)
from
vllm.v1.attention.selector
import
get_attn_backend
from
vllm.v1.kv_cache_interface
import
AttentionSpec
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
...
...
vllm/platforms/cpu.py
View file @
2612ba92
...
...
@@ -15,16 +15,16 @@ import regex as re
import
torch
from
vllm
import
envs
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
logger
=
init_logger
(
__name__
)
if
TYPE_CHECKING
:
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
VllmConfig
=
None
...
...
vllm/platforms/cuda.py
View file @
2612ba92
...
...
@@ -14,17 +14,17 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import
vllm._C
# noqa
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.utils.import_utils
import
import_pynvml
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
if
TYPE_CHECKING
:
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.config.cache
import
CacheDType
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
VllmConfig
=
None
CacheDType
=
None
...
...
@@ -148,7 +148,7 @@ class CudaPlatformBase(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.
v1.
attention.backends.registry
import
AttentionBackendEnum
parallel_config
=
vllm_config
.
parallel_config
model_config
=
vllm_config
.
model_config
...
...
@@ -200,7 +200,7 @@ class CudaPlatformBase(Platform):
use_cutlass_mla
=
backend
==
AttentionBackendEnum
.
CUTLASS_MLA
use_flashinfer_mla
=
backend
==
AttentionBackendEnum
.
FLASHINFER_MLA
from
vllm.attention.ops.flashmla
import
is_flashmla_dense_supported
from
vllm.
v1.
attention.ops.flashmla
import
is_flashmla_dense_supported
if
(
use_flashmla
...
...
vllm/platforms/interface.py
View file @
2612ba92
...
...
@@ -13,18 +13,18 @@ import numpy as np
import
torch
from
typing_extensions
import
deprecated
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
if
TYPE_CHECKING
:
from
torch.distributed
import
PrefixStore
,
ProcessGroup
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
FlexibleArgumentParser
=
object
...
...
Prev
1
…
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment