Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2612ba92
Unverified
Commit
2612ba92
authored
Jan 09, 2026
by
Matthew Bonanni
Committed by
GitHub
Jan 09, 2026
Browse files
[1/N][Attention] Restructure attention: move files (#31916)
Signed-off-by:
Matthew Bonanni
<
mbonanni@redhat.com
>
parent
1f8b7c53
Changes
195
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
44 additions
and
34 deletions
+44
-34
vllm/model_executor/layers/mamba/short_conv.py
vllm/model_executor/layers/mamba/short_conv.py
+1
-1
vllm/model_executor/models/afmoe.py
vllm/model_executor/models/afmoe.py
+1
-1
vllm/model_executor/models/aimv2.py
vllm/model_executor/models/aimv2.py
+1
-1
vllm/model_executor/models/apertus.py
vllm/model_executor/models/apertus.py
+4
-2
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert.py
+3
-1
vllm/model_executor/models/bert_with_rope.py
vllm/model_executor/models/bert_with_rope.py
+3
-1
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip.py
+1
-1
vllm/model_executor/models/clip.py
vllm/model_executor/models/clip.py
+1
-1
vllm/model_executor/models/config.py
vllm/model_executor/models/config.py
+1
-1
vllm/model_executor/models/deepencoder.py
vllm/model_executor/models/deepencoder.py
+1
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+6
-4
vllm/model_executor/models/dots_ocr.py
vllm/model_executor/models/dots_ocr.py
+4
-4
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+4
-4
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3.py
+4
-2
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+1
-1
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+4
-4
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+1
-1
vllm/model_executor/models/glmasr.py
vllm/model_executor/models/glmasr.py
+1
-1
vllm/model_executor/models/gpt_oss.py
vllm/model_executor/models/gpt_oss.py
+1
-1
vllm/model_executor/models/hunyuan_v1.py
vllm/model_executor/models/hunyuan_v1.py
+1
-1
No files found.
vllm/model_executor/layers/mamba/short_conv.py
View file @
2612ba92
...
...
@@ -4,7 +4,6 @@
import
torch
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
ModelConfig
,
get_current_vllm_config
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.forward_context
import
ForwardContext
,
get_forward_context
...
...
@@ -24,6 +23,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_update
,
)
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.short_conv_attn
import
ShortConvAttentionMetadata
...
...
vllm/model_executor/models/afmoe.py
View file @
2612ba92
...
...
@@ -9,7 +9,6 @@ from itertools import islice
import
torch
from
torch
import
nn
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
,
get_current_vllm_config
...
...
@@ -50,6 +49,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/aimv2.py
View file @
2612ba92
...
...
@@ -8,10 +8,10 @@ from collections.abc import Iterable
import
torch
import
torch.nn
as
nn
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed.utils
import
divide
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
...
...
vllm/model_executor/models/apertus.py
View file @
2612ba92
...
...
@@ -32,13 +32,14 @@ import torch
from
torch
import
nn
from
transformers
import
ApertusConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
XIELU
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -57,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/bert.py
View file @
2612ba92
...
...
@@ -7,11 +7,13 @@ import torch
from
torch
import
nn
from
transformers
import
BertConfig
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
PoolerConfig
,
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
...
...
vllm/model_executor/models/bert_with_rope.py
View file @
2612ba92
...
...
@@ -6,7 +6,6 @@ import torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
...
...
@@ -16,6 +15,9 @@ from vllm.distributed import (
tensor_model_parallel_all_reduce
,
)
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
,
get_act_fn
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.fused_moe
import
activation_without_mul
,
fused_topk
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/blip.py
View file @
2612ba92
...
...
@@ -9,9 +9,9 @@ import torch
import
torch.nn
as
nn
from
transformers
import
Blip2VisionConfig
,
BlipVisionConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/clip.py
View file @
2612ba92
...
...
@@ -15,11 +15,11 @@ from transformers import (
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
MultiModalConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/config.py
View file @
2612ba92
...
...
@@ -4,12 +4,12 @@ from copy import deepcopy
from
math
import
lcm
from
typing
import
TYPE_CHECKING
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
,
MambaSpec
,
MLAAttentionSpec
if
TYPE_CHECKING
:
...
...
vllm/model_executor/models/deepencoder.py
View file @
2612ba92
...
...
@@ -18,8 +18,8 @@ import torch.nn as nn
import
torch.nn.functional
as
F
from
transformers
import
CLIPVisionConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
MultiModalConfig
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
2612ba92
...
...
@@ -33,9 +33,7 @@ from torch import nn
from
transformers
import
DeepseekV2Config
,
DeepseekV3Config
from
vllm._aiter_ops
import
rocm_aiter_ops
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.layer
import
Attention
from
vllm.attention.ops.common
import
pack_seq_triton
,
unpack_seq_triton
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
ParallelConfig
,
VllmConfig
,
get_current_vllm_config
from
vllm.distributed
import
(
...
...
@@ -78,10 +76,12 @@ from vllm.platforms import current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.deep_gemm
import
fp8_mqa_logits
,
fp8_paged_mqa_logits
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionBackend
from
vllm.v1.attention.backends.mla.indexer
import
(
DeepseekV32IndexerBackend
,
DeepseekV32IndexerMetadata
,
)
from
vllm.v1.attention.ops.common
import
pack_seq_triton
,
unpack_seq_triton
from
vllm.v1.kv_cache_interface
import
KVCacheSpec
,
MLAAttentionSpec
from
vllm.v1.worker.workspace
import
current_workspace_manager
...
...
@@ -679,7 +679,9 @@ def sparse_attn_indexer(
)
fp8_mqa_logits_func
=
fp8_mqa_logits
if
current_platform
.
is_rocm
():
from
vllm.attention.ops.rocm_aiter_mla_sparse
import
rocm_fp8_mqa_logits
from
vllm.v1.attention.ops.rocm_aiter_mla_sparse
import
(
rocm_fp8_mqa_logits
,
)
fp8_mqa_logits_func
=
rocm_fp8_mqa_logits
logits
=
fp8_mqa_logits_func
(
...
...
@@ -729,7 +731,7 @@ def sparse_attn_indexer(
num_padded_tokens
=
batch_size
*
next_n
fp8_paged_mqa_logits_func
=
fp8_paged_mqa_logits
if
current_platform
.
is_rocm
():
from
vllm.attention.ops.rocm_aiter_mla_sparse
import
(
from
vllm.
v1.
attention.ops.rocm_aiter_mla_sparse
import
(
rocm_fp8_paged_mqa_logits
,
)
...
...
vllm/model_executor/models/dots_ocr.py
View file @
2612ba92
...
...
@@ -8,10 +8,6 @@ import torch.nn as nn
from
torch.nn
import
LayerNorm
from
transformers.models.qwen2_vl
import
Qwen2VLProcessor
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
utils
as
dist_utils
...
...
@@ -20,6 +16,9 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size
,
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
...
...
@@ -59,6 +58,7 @@ from vllm.multimodal.inputs import MultiModalDataDict
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.dotsocr
import
DotsOCRConfig
,
DotsVisionConfig
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.vision
import
run_dp_sharded_mrope_vision_model
...
...
vllm/model_executor/models/ernie45_vl.py
View file @
2612ba92
...
...
@@ -36,16 +36,15 @@ import torch.nn.functional as F
from
einops
import
rearrange
from
transformers
import
BatchFeature
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -74,6 +73,7 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.ernie45_vl_moe
import
Ernie4_5_VLMoeForCausalLM
from
.interfaces
import
(
...
...
vllm/model_executor/models/gemma3.py
View file @
2612ba92
...
...
@@ -22,13 +22,15 @@ import torch
from
torch
import
nn
from
transformers
import
Gemma3TextConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
GeluAndMul
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.layernorm
import
GemmaRMSNorm
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -47,8 +49,8 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
...attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
...
...
vllm/model_executor/models/glm4.py
View file @
2612ba92
...
...
@@ -29,7 +29,6 @@ import torch
from
torch
import
nn
from
transformers
import
Glm4Config
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -41,6 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.llama
import
LlamaMLP
as
Glm4MLP
...
...
vllm/model_executor/models/glm4_1v.py
View file @
2612ba92
...
...
@@ -46,15 +46,14 @@ from transformers.models.glm4v.image_processing_glm4v import (
from
transformers.models.glm4v.video_processing_glm4v
import
Glm4vVideoProcessor
from
transformers.video_utils
import
VideoMetadata
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
,
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.model_executor.layers.conv
import
Conv2dLayer
,
Conv3dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
...
...
@@ -89,6 +88,7 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
..layers.activation
import
SiluAndMul
from
.interfaces
import
(
...
...
vllm/model_executor/models/glm4v.py
View file @
2612ba92
...
...
@@ -19,11 +19,11 @@ from transformers import BatchFeature, PreTrainedTokenizer, TensorType
from
transformers.image_utils
import
ImageInput
from
transformers.tokenization_utils_base
import
TextInput
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
vllm/model_executor/models/glmasr.py
View file @
2612ba92
...
...
@@ -11,12 +11,12 @@ from transformers import BatchFeature
from
transformers.models.glmasr
import
GlmAsrConfig
,
GlmAsrProcessor
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed.parallel_state
import
get_tensor_model_parallel_world_size
from
vllm.inputs.data
import
PromptType
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
...
...
vllm/model_executor/models/gpt_oss.py
View file @
2612ba92
...
...
@@ -7,7 +7,6 @@ import torch.distributed as dist
from
torch
import
nn
from
transformers
import
GptOssConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -37,6 +36,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.math_utils
import
cdiv
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/hunyuan_v1.py
View file @
2612ba92
...
...
@@ -33,7 +33,6 @@ import torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
,
get_current_vllm_config
...
...
@@ -65,6 +64,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
MixtureOfExperts
,
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment