Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2612ba92
Unverified
Commit
2612ba92
authored
Jan 09, 2026
by
Matthew Bonanni
Committed by
GitHub
Jan 09, 2026
Browse files
[1/N][Attention] Restructure attention: move files (#31916)
Signed-off-by:
Matthew Bonanni
<
mbonanni@redhat.com
>
parent
1f8b7c53
Changes
195
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
44 additions
and
38 deletions
+44
-38
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+1
-1
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+4
-2
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+2
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+2
-2
vllm/model_executor/models/qwen3.py
vllm/model_executor/models/qwen3.py
+1
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+1
-1
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
+1
-1
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl.py
+1
-1
vllm/model_executor/models/seed_oss.py
vllm/model_executor/models/seed_oss.py
+1
-1
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+4
-2
vllm/model_executor/models/siglip2.py
vllm/model_executor/models/siglip2.py
+1
-1
vllm/model_executor/models/siglip2navit.py
vllm/model_executor/models/siglip2navit.py
+1
-1
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+1
-1
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/base.py
+4
-2
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+1
-1
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+5
-5
vllm/model_executor/models/whisper_utils.py
vllm/model_executor/models/whisper_utils.py
+5
-5
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+2
-2
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+4
-4
vllm/platforms/interface.py
vllm/platforms/interface.py
+2
-2
No files found.
vllm/model_executor/models/plamo2.py
View file @
2612ba92
...
@@ -9,7 +9,6 @@ import torch
...
@@ -9,7 +9,6 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
...
@@ -66,6 +65,7 @@ from vllm.model_executor.models.utils import (
...
@@ -66,6 +65,7 @@ from vllm.model_executor.models.utils import (
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.mamba2_attn
import
Mamba2AttentionMetadata
from
vllm.v1.attention.backends.mamba2_attn
import
Mamba2AttentionMetadata
...
...
vllm/model_executor/models/qwen2.py
View file @
2612ba92
...
@@ -33,13 +33,14 @@ import torch
...
@@ -33,13 +33,14 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
Qwen2Config
from
transformers
import
Qwen2Config
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -59,6 +60,7 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -59,6 +60,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
is_interleaved
,
set_default_rope_theta
from
vllm.transformers_utils.config
import
is_interleaved
,
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.utils
import
(
from
.utils
import
(
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
2612ba92
...
@@ -41,8 +41,6 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
...
@@ -41,8 +41,6 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLVisionConfig
,
Qwen2_5_VLVisionConfig
,
)
)
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
parallel_state
...
@@ -50,6 +48,7 @@ from vllm.distributed import utils as dist_utils
...
@@ -50,6 +48,7 @@ from vllm.distributed import utils as dist_utils
from
vllm.forward_context
import
set_forward_context
from
vllm.forward_context
import
set_forward_context
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
...
@@ -83,6 +82,7 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate
...
@@ -83,6 +82,7 @@ from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.platform_utils
import
is_pin_memory_available
from
vllm.utils.platform_utils
import
is_pin_memory_available
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
from
.interfaces
import
(
MultiModalEmbeddings
,
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
2612ba92
...
@@ -43,14 +43,13 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
...
@@ -43,14 +43,13 @@ from transformers.models.qwen2_vl.configuration_qwen2_vl import (
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
transformers.models.qwen2_vl.image_processing_qwen2_vl
import
smart_resize
from
transformers.models.qwen2_vl.video_processing_qwen2_vl
import
Qwen2VLVideoProcessor
from
transformers.models.qwen2_vl.video_processing_qwen2_vl
import
Qwen2VLVideoProcessor
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -90,6 +89,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
...
@@ -90,6 +89,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
from
.interfaces
import
(
MultiModalEmbeddings
,
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen3.py
View file @
2612ba92
...
@@ -30,7 +30,6 @@ import torch
...
@@ -30,7 +30,6 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
Qwen3Config
from
transformers
import
Qwen3Config
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
...
@@ -44,6 +43,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -44,6 +43,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.qwen2
import
Qwen2MLP
as
Qwen3MLP
from
.qwen2
import
Qwen2MLP
as
Qwen3MLP
...
...
vllm/model_executor/models/qwen3_next.py
View file @
2612ba92
...
@@ -10,7 +10,6 @@ from einops import rearrange
...
@@ -10,7 +10,6 @@ from einops import rearrange
from
torch
import
nn
from
torch
import
nn
from
transformers.activations
import
ACT2FN
from
transformers.activations
import
ACT2FN
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
from
vllm.config
import
(
...
@@ -75,6 +74,7 @@ from vllm.sequence import IntermediateTensors
...
@@ -75,6 +74,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.transformers_utils.configs
import
Qwen3NextConfig
from
vllm.transformers_utils.configs
import
Qwen3NextConfig
from
vllm.triton_utils
import
tl
,
triton
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.attention.backends.gdn_attn
import
GDNAttentionMetadata
from
vllm.v1.attention.backends.gdn_attn
import
GDNAttentionMetadata
from
.interfaces
import
(
from
.interfaces
import
(
...
...
vllm/model_executor/models/qwen3_omni_moe_thinker.py
View file @
2612ba92
...
@@ -46,7 +46,6 @@ from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
...
@@ -46,7 +46,6 @@ from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
)
)
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
from
vllm.distributed
import
get_pp_group
...
@@ -75,6 +74,7 @@ from vllm.multimodal.processing import (
...
@@ -75,6 +74,7 @@ from vllm.multimodal.processing import (
PromptUpdateDetails
,
PromptUpdateDetails
,
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
from
.interfaces
import
(
MultiModalEmbeddings
,
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/qwen3_vl.py
View file @
2612ba92
...
@@ -48,7 +48,6 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
...
@@ -48,7 +48,6 @@ from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
)
)
from
transformers.video_utils
import
VideoMetadata
from
transformers.video_utils
import
VideoMetadata
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
...
@@ -92,6 +91,7 @@ from vllm.multimodal.processing import (
...
@@ -92,6 +91,7 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interfaces
import
(
from
.interfaces
import
(
MultiModalEmbeddings
,
MultiModalEmbeddings
,
...
...
vllm/model_executor/models/seed_oss.py
View file @
2612ba92
...
@@ -30,7 +30,6 @@ import torch
...
@@ -30,7 +30,6 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
PretrainedConfig
as
SeedOssConfig
from
transformers
import
PretrainedConfig
as
SeedOssConfig
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
...
@@ -56,6 +55,7 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -56,6 +55,7 @@ from vllm.model_executor.model_loader.weight_utils import (
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
from
.utils
import
(
...
...
vllm/model_executor/models/siglip.py
View file @
2612ba92
...
@@ -15,12 +15,14 @@ from transformers import (
...
@@ -15,12 +15,14 @@ from transformers import (
SiglipVisionConfig
,
SiglipVisionConfig
,
)
)
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
MultiModalConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
MultiModalConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
...
vllm/model_executor/models/siglip2.py
View file @
2612ba92
...
@@ -10,11 +10,11 @@ from torch import nn
...
@@ -10,11 +10,11 @@ from torch import nn
from
torch.nn
import
functional
as
F
from
torch.nn
import
functional
as
F
from
transformers
import
Siglip2VisionConfig
from
transformers
import
Siglip2VisionConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
...
vllm/model_executor/models/siglip2navit.py
View file @
2612ba92
...
@@ -11,10 +11,10 @@ from torch.nn import functional as F
...
@@ -11,10 +11,10 @@ from torch.nn import functional as F
from
transformers
import
Siglip2VisionConfig
from
transformers
import
Siglip2VisionConfig
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
2612ba92
...
@@ -15,11 +15,11 @@ from torchvision import transforms
...
@@ -15,11 +15,11 @@ from torchvision import transforms
from
torchvision.transforms.functional
import
InterpolationMode
from
torchvision.transforms.functional
import
InterpolationMode
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
...
vllm/model_executor/models/transformers/base.py
View file @
2612ba92
...
@@ -27,13 +27,14 @@ from torch import nn
...
@@ -27,13 +27,14 @@ from torch import nn
from
transformers
import
AutoModel
from
transformers
import
AutoModel
from
transformers.modeling_utils
import
ALL_ATTENTION_FUNCTIONS
from
transformers.modeling_utils
import
ALL_ATTENTION_FUNCTIONS
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.encoder_only_attention
import
EncoderOnlyAttention
from
vllm.config.utils
import
getattr_iter
from
vllm.config.utils
import
getattr_iter
from
vllm.distributed
import
get_pp_group
,
get_tp_group
from
vllm.distributed
import
get_pp_group
,
get_tp_group
from
vllm.distributed.utils
import
get_pp_indices
from
vllm.distributed.utils
import
get_pp_indices
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention.encoder_only_attention
import
(
EncoderOnlyAttention
,
)
from
vllm.model_executor.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
from
vllm.model_executor.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
from
vllm.model_executor.models.interfaces
import
(
from
vllm.model_executor.models.interfaces
import
(
SupportsEagle
,
SupportsEagle
,
...
@@ -59,6 +60,7 @@ from vllm.model_executor.models.utils import (
...
@@ -59,6 +60,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
maybe_prefix
,
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedModel
from
transformers
import
PreTrainedModel
...
...
vllm/model_executor/models/vision.py
View file @
2612ba92
...
@@ -10,7 +10,6 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
...
@@ -10,7 +10,6 @@ from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
import
torch
import
torch
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
(
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_rank
,
...
@@ -19,6 +18,7 @@ from vllm.distributed import (
...
@@ -19,6 +18,7 @@ from vllm.distributed import (
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/whisper.py
View file @
2612ba92
...
@@ -18,18 +18,15 @@ from transformers import (
...
@@ -18,18 +18,15 @@ from transformers import (
)
)
from
transformers.models.whisper.modeling_whisper
import
sinusoids
from
transformers.models.whisper.modeling_whisper
import
sinusoids
from
vllm.attention.backends.abstract
import
(
AttentionType
,
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.attention.layers.cross_attention
import
CrossAttention
from
vllm.attention.layers.mm_encoder_attention
import
MMEncoderAttention
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.inputs.data
import
PromptType
from
vllm.inputs.data
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention.cross_attention
import
CrossAttention
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
MMEncoderAttention
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -62,6 +59,9 @@ from vllm.transformers_utils.processor import cached_processor_from_config
...
@@ -62,6 +59,9 @@ from vllm.transformers_utils.processor import cached_processor_from_config
from
vllm.utils.jsontree
import
json_map_leaves
from
vllm.utils.jsontree
import
json_map_leaves
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
vllm.v1.attention.backend
import
(
AttentionType
,
)
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsTranscription
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsTranscription
from
.utils
import
(
from
.utils
import
(
...
...
vllm/model_executor/models/whisper_utils.py
View file @
2612ba92
...
@@ -9,20 +9,20 @@ import torch
...
@@ -9,20 +9,20 @@ import torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch
import
nn
from
vllm.attention.backends.abstract
import
(
from
vllm.attention.layer
import
Attention
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.v1.attention.backend
import
(
AttentionBackend
,
AttentionBackend
,
AttentionMetadata
,
AttentionMetadata
,
AttentionType
,
AttentionType
,
)
)
from
vllm.attention.layer
import
Attention
from
vllm.attention.selector
import
get_attn_backend
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.v1.attention.backends.flash_attn
import
FlashAttentionBackend
from
vllm.v1.attention.backends.flash_attn
import
FlashAttentionBackend
from
vllm.v1.attention.backends.utils
import
(
from
vllm.v1.attention.backends.utils
import
(
CommonAttentionMetadata
,
CommonAttentionMetadata
,
subclass_attention_backend_with_overrides
,
subclass_attention_backend_with_overrides
,
)
)
from
vllm.v1.attention.selector
import
get_attn_backend
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.v1.kv_cache_interface
import
AttentionSpec
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
...
...
vllm/platforms/cpu.py
View file @
2612ba92
...
@@ -15,16 +15,16 @@ import regex as re
...
@@ -15,16 +15,16 @@ import regex as re
import
torch
import
torch
from
vllm
import
envs
from
vllm
import
envs
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
else
:
VllmConfig
=
None
VllmConfig
=
None
...
...
vllm/platforms/cuda.py
View file @
2612ba92
...
@@ -14,17 +14,17 @@ from typing_extensions import ParamSpec
...
@@ -14,17 +14,17 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
# import custom ops, trigger op registration
import
vllm._C
# noqa
import
vllm._C
# noqa
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils.import_utils
import
import_pynvml
from
vllm.utils.import_utils
import
import_pynvml
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.cache
import
CacheDType
from
vllm.config.cache
import
CacheDType
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
else
:
VllmConfig
=
None
VllmConfig
=
None
CacheDType
=
None
CacheDType
=
None
...
@@ -148,7 +148,7 @@ class CudaPlatformBase(Platform):
...
@@ -148,7 +148,7 @@ class CudaPlatformBase(Platform):
@
classmethod
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
def
check_and_update_config
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.
v1.
attention.backends.registry
import
AttentionBackendEnum
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
model_config
=
vllm_config
.
model_config
model_config
=
vllm_config
.
model_config
...
@@ -200,7 +200,7 @@ class CudaPlatformBase(Platform):
...
@@ -200,7 +200,7 @@ class CudaPlatformBase(Platform):
use_cutlass_mla
=
backend
==
AttentionBackendEnum
.
CUTLASS_MLA
use_cutlass_mla
=
backend
==
AttentionBackendEnum
.
CUTLASS_MLA
use_flashinfer_mla
=
backend
==
AttentionBackendEnum
.
FLASHINFER_MLA
use_flashinfer_mla
=
backend
==
AttentionBackendEnum
.
FLASHINFER_MLA
from
vllm.attention.ops.flashmla
import
is_flashmla_dense_supported
from
vllm.
v1.
attention.ops.flashmla
import
is_flashmla_dense_supported
if
(
if
(
use_flashmla
use_flashmla
...
...
vllm/platforms/interface.py
View file @
2612ba92
...
@@ -13,18 +13,18 @@ import numpy as np
...
@@ -13,18 +13,18 @@ import numpy as np
import
torch
import
torch
from
typing_extensions
import
deprecated
from
typing_extensions
import
deprecated
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
torch.distributed
import
PrefixStore
,
ProcessGroup
from
torch.distributed
import
PrefixStore
,
ProcessGroup
from
vllm.attention.selector
import
AttentionSelectorConfig
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.pooling_params
import
PoolingParams
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.v1.attention.selector
import
AttentionSelectorConfig
else
:
else
:
FlexibleArgumentParser
=
object
FlexibleArgumentParser
=
object
...
...
Prev
1
…
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment