Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
91 additions
and
147 deletions
+91
-147
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+0
-1
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+0
-1
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3.py
+0
-1
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+3
-2
vllm/model_executor/models/gemma3n.py
vllm/model_executor/models/gemma3n.py
+0
-1
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+0
-2
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+57
-94
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe.py
+0
-1
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_j.py
+3
-2
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/gpt_neox.py
+0
-1
vllm/model_executor/models/gpt_oss.py
vllm/model_executor/models/gpt_oss.py
+0
-1
vllm/model_executor/models/granite.py
vllm/model_executor/models/granite.py
+0
-1
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/granitemoe.py
+0
-1
vllm/model_executor/models/granitemoehybrid.py
vllm/model_executor/models/granitemoehybrid.py
+0
-1
vllm/model_executor/models/grok1.py
vllm/model_executor/models/grok1.py
+0
-1
vllm/model_executor/models/hunyuan_v1.py
vllm/model_executor/models/hunyuan_v1.py
+0
-2
vllm/model_executor/models/hunyuan_vision.py
vllm/model_executor/models/hunyuan_vision.py
+9
-2
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces.py
+18
-15
vllm/model_executor/models/interfaces_base.py
vllm/model_executor/models/interfaces_base.py
+1
-16
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+0
-1
No files found.
vllm/model_executor/models/gemma.py
View file @
a3f8d5dd
...
...
@@ -174,7 +174,6 @@ class GemmaAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/gemma2.py
View file @
a3f8d5dd
...
...
@@ -152,7 +152,6 @@ class Gemma2Attention(nn.Module):
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/gemma3.py
View file @
a3f8d5dd
...
...
@@ -176,7 +176,6 @@ class Gemma3Attention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
a3f8d5dd
...
...
@@ -237,8 +237,9 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
)
max_num_crops
=
images_kwargs
[
"pan_and_scan_max_num_crops"
]
# Result in the max possible feature size (h:w = max_num_crops:1)
return
ImageSize
(
height
=
50
*
max_num_crops
,
width
=
50
)
vision_config
=
self
.
get_hf_config
().
vision_config
native_size
=
vision_config
.
image_size
return
ImageSize
(
height
=
native_size
*
max_num_crops
,
width
=
native_size
)
class
Gemma3DummyInputsBuilder
(
BaseDummyInputsBuilder
[
Gemma3ProcessingInfo
]):
...
...
vllm/model_executor/models/gemma3n.py
View file @
a3f8d5dd
...
...
@@ -384,7 +384,6 @@ class Gemma3nAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/glm4.py
View file @
a3f8d5dd
...
...
@@ -81,7 +81,6 @@ class Glm4Attention(nn.Module):
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
num_kv_heads
=
max
(
1
,
self
.
total_num_kv_heads
//
tp_size
)
self
.
head_dim
=
head_dim
or
hidden_size
//
self
.
total_num_heads
self
.
rotary_dim
=
self
.
head_dim
self
.
q_size
=
self
.
num_heads
*
self
.
head_dim
self
.
kv_size
=
self
.
num_kv_heads
*
self
.
head_dim
self
.
scaling
=
self
.
head_dim
**-
0.5
...
...
@@ -103,7 +102,6 @@ class Glm4Attention(nn.Module):
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
rotary_dim
,
max_position
=
max_position
,
rope_parameters
=
config
.
rope_parameters
,
is_neox_style
=
False
,
...
...
vllm/model_executor/models/glm4_1v.py
View file @
a3f8d5dd
...
...
@@ -47,8 +47,10 @@ from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
from
transformers.video_utils
import
VideoMetadata
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.attention.layer
import
maybe_get_vit_flash_attn_backend
from
vllm.config
import
VllmConfig
from
vllm.attention.layers.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.config
import
MultiModalConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
,
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
...
...
@@ -63,6 +65,9 @@ from vllm.model_executor.layers.linear import (
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding.common
import
(
ApplyRotaryEmb
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
...
@@ -93,7 +98,7 @@ from .interfaces import (
SupportsMultiModal
,
SupportsPP
,
)
from
.qwen2_vl
import
_create_qwen2vl_field_factory
,
apply_rotary_pos_emb_vision
from
.qwen2_vl
import
_create_qwen2vl_field_factory
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
...
...
@@ -191,10 +196,15 @@ class Glm4vVisionMLP(nn.Module):
hidden_features
:
int
,
bias
:
bool
=
False
,
quant_config
:
QuantizationConfig
|
None
=
None
,
multimodal_config
:
MultiModalConfig
|
None
=
None
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
,
):
super
().
__init__
()
use_data_parallel
=
(
multimodal_config
.
mm_encoder_tp_mode
==
"data"
if
multimodal_config
else
False
)
self
.
gate_up_proj
=
MergedColumnParallelLinear
(
input_size
=
in_features
,
output_sizes
=
[
hidden_features
]
*
2
,
...
...
@@ -248,12 +258,16 @@ class Glm4vVisionAttention(nn.Module):
num_heads
:
int
,
projection_size
:
int
,
quant_config
:
QuantizationConfig
|
None
=
None
,
multimodal_config
:
MultiModalConfig
|
None
=
None
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
,
attn_backend_override
:
AttentionBackendEnum
|
None
=
None
,
)
->
None
:
super
().
__init__
()
# Per attention head and per partition values.
use_data_parallel
=
(
multimodal_config
.
mm_encoder_tp_mode
==
"data"
if
multimodal_config
else
False
)
self
.
tp_size
=
(
1
if
use_data_parallel
else
get_tensor_model_parallel_world_size
()
)
...
...
@@ -287,33 +301,13 @@ class Glm4vVisionAttention(nn.Module):
disable_tp
=
use_data_parallel
,
)
# Detect attention implem
ent
at
ion
.
self
.
attn_backend
=
get_vit_attn_backend
(
self
.
attn
=
MMEncoderAtt
ention
(
num_heads
=
self
.
num_attention_heads_per_partition
,
head_size
=
self
.
hidden_size_per_attention_head
,
dtype
=
torch
.
get_default_dtype
(),
attn_backend_override
=
attn_backend_override
,
multimodal_config
=
multimodal_config
,
)
self
.
attn_backend
,
self
.
flash_attn_varlen_func
=
(
maybe_get_vit_flash_attn_backend
(
self
.
attn_backend
,
attn_backend_override
=
attn_backend_override
,
)
)
if
self
.
attn_backend
not
in
{
AttentionBackendEnum
.
FLASH_ATTN
,
AttentionBackendEnum
.
TORCH_SDPA
,
AttentionBackendEnum
.
ROCM_AITER_FA
,
}:
raise
RuntimeError
(
f
"GLM-4V does not support
{
self
.
attn_backend
}
backend now."
)
self
.
is_flash_attn_backend
=
self
.
attn_backend
in
{
AttentionBackendEnum
.
FLASH_ATTN
,
AttentionBackendEnum
.
ROCM_AITER_FA
,
}
self
.
apply_rotary_emb
=
ApplyRotaryEmb
(
enforce_enable
=
True
)
def
split_qkv
(
self
,
qkv
:
torch
.
Tensor
)
->
tuple
[
torch
.
Tensor
,
...]:
# [s, b, 3 * head * head_dim]
...
...
@@ -338,61 +332,33 @@ class Glm4vVisionAttention(nn.Module):
cu_seqlens
:
torch
.
Tensor
,
rotary_pos_emb_cos
:
torch
.
Tensor
,
rotary_pos_emb_sin
:
torch
.
Tensor
,
max_seqlen
:
int
|
None
=
None
,
# Only used for Flash Attention
max_seqlen
:
torch
.
Tensor
|
None
=
None
,
# Only used for Flash Attention
)
->
torch
.
Tensor
:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x
,
_
=
self
.
qkv
(
x
)
# [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
q
,
k
,
v
=
self
.
split_qkv
(
x
)
batch_size
=
q
.
shape
[
1
]
q
,
k
,
v
=
(
rearrange
(
x
,
"s b ... -> b s ..."
).
contiguous
()
for
x
in
(
q
,
k
,
v
))
if
rotary_pos_emb_cos
is
not
None
and
rotary_pos_emb_sin
is
not
None
:
# [2 * b, s, heads, head_dim]
qk_concat
=
torch
.
cat
([
q
,
k
],
dim
=
0
)
qk_rotated
=
apply_rotary_pos_emb_vision
(
qk_concat
,
rotary_pos_emb_cos
,
rotary_pos_emb_sin
qk_rotated
=
self
.
apply_rotary_emb
(
qk_concat
,
rotary_pos_emb_cos
,
rotary_pos_emb_sin
,
)
q
,
k
=
torch
.
chunk
(
qk_rotated
,
2
,
dim
=
0
)
if
self
.
is_flash_attn_backend
:
q
,
k
,
v
=
(
rearrange
(
x
,
"b s ... -> (b s) ..."
)
for
x
in
[
q
,
k
,
v
])
output
=
self
.
flash_attn_varlen_func
(
q
,
k
,
v
,
cu_seqlens_q
=
cu_seqlens
,
cu_seqlens_k
=
cu_seqlens
,
max_seqlen_q
=
max_seqlen
,
max_seqlen_k
=
max_seqlen
,
dropout_p
=
0.0
,
causal
=
False
,
)
context_layer
=
rearrange
(
output
,
"(b s) h d -> s b (h d)"
,
b
=
batch_size
).
contiguous
()
elif
self
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
:
# Execute attention entry by entry for speed & less VRAM.
outputs
=
[]
lens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
q_chunks
=
torch
.
split
(
q
,
lens
,
dim
=
1
)
k_chunks
=
torch
.
split
(
k
,
lens
,
dim
=
1
)
v_chunks
=
torch
.
split
(
v
,
lens
,
dim
=
1
)
for
q_i
,
k_i
,
v_i
in
zip
(
q_chunks
,
k_chunks
,
v_chunks
):
q_i
,
k_i
,
v_i
=
(
rearrange
(
x
,
"b s h d -> b h s d"
)
for
x
in
[
q_i
,
k_i
,
v_i
]
)
output_i
=
F
.
scaled_dot_product_attention
(
q_i
,
k_i
,
v_i
,
dropout_p
=
0.0
)
output_i
=
rearrange
(
output_i
,
"b h s d -> b s h d "
)
outputs
.
append
(
output_i
)
context_layer
=
torch
.
cat
(
outputs
,
dim
=
1
)
context_layer
=
rearrange
(
context_layer
,
"b s h d -> s b (h d)"
).
contiguous
()
context_layer
=
self
.
attn
(
query
=
q
,
key
=
k
,
value
=
v
,
cu_seqlens
=
cu_seqlens
,
max_seqlen
=
max_seqlen
,
)
context_layer
=
rearrange
(
context_layer
,
"b s h d -> s b (h d)"
).
contiguous
()
output
,
_
=
self
.
proj
(
context_layer
)
return
output
...
...
@@ -406,9 +372,8 @@ class Glm4vVisionBlock(nn.Module):
mlp_hidden_dim
:
int
,
norm_layer
:
Callable
[[
int
],
nn
.
Module
]
|
None
=
None
,
quant_config
:
QuantizationConfig
|
None
=
None
,
multimodal_config
:
MultiModalConfig
|
None
=
None
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
,
attn_backend_override
:
AttentionBackendEnum
|
None
=
None
,
)
->
None
:
super
().
__init__
()
if
norm_layer
is
None
:
...
...
@@ -420,17 +385,16 @@ class Glm4vVisionBlock(nn.Module):
num_heads
=
num_heads
,
projection_size
=
dim
,
quant_config
=
quant_config
,
multimodal_config
=
multimodal_config
,
prefix
=
f
"
{
prefix
}
.attn"
,
use_data_parallel
=
use_data_parallel
,
attn_backend_override
=
attn_backend_override
,
)
self
.
mlp
=
Glm4vVisionMLP
(
dim
,
mlp_hidden_dim
,
bias
=
False
,
quant_config
=
quant_config
,
multimodal_config
=
multimodal_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
use_data_parallel
=
use_data_parallel
,
)
def
forward
(
...
...
@@ -489,11 +453,16 @@ class Glm4vPatchMerger(nn.Module):
d_model
:
int
,
context_dim
:
int
,
quant_config
:
QuantizationConfig
|
None
=
None
,
multimodal_config
:
MultiModalConfig
|
None
=
None
,
bias
:
bool
=
False
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
,
)
->
None
:
super
().
__init__
()
use_data_parallel
=
(
multimodal_config
.
mm_encoder_tp_mode
==
"data"
if
multimodal_config
else
False
)
self
.
hidden_size
=
d_model
self
.
proj
=
ColumnParallelLinear
(
self
.
hidden_size
,
...
...
@@ -649,19 +618,19 @@ class Glm4vVisionTransformer(nn.Module):
vision_config
:
Glm4vVisionConfig
,
norm_eps
:
float
=
1e-6
,
quant_config
:
QuantizationConfig
|
None
=
None
,
multimodal_config
:
MultiModalConfig
|
None
=
None
,
prefix
:
str
=
""
,
use_data_parallel
:
bool
=
False
,
attn_backend_override
:
AttentionBackendEnum
|
None
=
None
,
)
->
None
:
super
().
__init__
()
assert
multimodal_config
is
not
None
,
"multimodal_config must be provided"
patch_size
=
vision_config
.
patch_size
temporal_patch_size
=
vision_config
.
temporal_patch_size
in_channels
=
vision_config
.
in_channels
depth
=
vision_config
.
depth
self
.
hidden_size
=
vision_config
.
hidden_size
self
.
num_heads
=
vision_config
.
num_heads
self
.
use_data_parallel
=
use_data_parallel
self
.
patch_size
=
vision_config
.
patch_size
self
.
spatial_merge_size
=
vision_config
.
spatial_merge_size
...
...
@@ -678,9 +647,9 @@ class Glm4vVisionTransformer(nn.Module):
head_dim
=
self
.
hidden_size
//
self
.
num_heads
self
.
rotary_pos_emb
=
get_rope
(
head_size
=
head_dim
,
rotary_dim
=
head_dim
//
2
,
max_position
=
8192
,
is_neox_style
=
True
,
rope_parameters
=
{
"partial_rotary_factor"
:
0.5
},
)
self
.
blocks
=
nn
.
ModuleList
(
[
...
...
@@ -690,9 +659,8 @@ class Glm4vVisionTransformer(nn.Module):
mlp_hidden_dim
=
vision_config
.
out_hidden_size
,
norm_layer
=
norm_layer
,
quant_config
=
quant_config
,
multimodal_config
=
multimodal_config
,
prefix
=
f
"
{
prefix
}
.blocks.
{
layer_idx
}
"
,
use_data_parallel
=
self
.
use_data_parallel
,
attn_backend_override
=
attn_backend_override
,
)
for
layer_idx
in
range
(
depth
)
]
...
...
@@ -701,9 +669,9 @@ class Glm4vVisionTransformer(nn.Module):
d_model
=
vision_config
.
out_hidden_size
,
context_dim
=
vision_config
.
intermediate_size
,
quant_config
=
quant_config
,
multimodal_config
=
multimodal_config
,
bias
=
False
,
prefix
=
f
"
{
prefix
}
.merger"
,
use_data_parallel
=
self
.
use_data_parallel
,
)
self
.
embeddings
=
Glm4vVisionEmbeddings
(
vision_config
)
...
...
@@ -723,7 +691,7 @@ class Glm4vVisionTransformer(nn.Module):
self
.
attn_backend
=
get_vit_attn_backend
(
head_size
=
head_dim
,
dtype
=
torch
.
get_default_dtype
(),
attn_backend_override
=
attn_backend
_override
,
attn_backend_override
=
multimodal_config
.
mm_encoder_
attn_backend
,
)
@
property
...
...
@@ -775,13 +743,13 @@ class Glm4vVisionTransformer(nn.Module):
def
compute_attn_mask_seqlen
(
self
,
cu_seqlens
:
torch
.
Tensor
,
)
->
int
|
None
:
)
->
torch
.
Tensor
|
None
:
max_seqlen
=
None
if
(
self
.
attn_backend
==
AttentionBackendEnum
.
FLASH_ATTN
or
self
.
attn_backend
==
AttentionBackendEnum
.
ROCM_AITER_FA
):
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
()
.
item
()
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
()
return
max_seqlen
def
forward
(
...
...
@@ -1257,6 +1225,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
)
height
=
min
(
height
,
overrides
.
height
)
num_frames
=
max
(
num_frames
,
2
)
# GLM 4.6V requires 2 frames
video
=
np
.
full
((
num_frames
,
width
,
height
,
3
),
255
,
dtype
=
np
.
uint8
)
video_items
=
[]
for
i
in
range
(
num_videos
):
...
...
@@ -1464,18 +1433,12 @@ class Glm4vForConditionalGeneration(
self
.
multimodal_config
=
multimodal_config
self
.
use_data_parallel
=
multimodal_config
.
mm_encoder_tp_mode
==
"data"
attn_backend_override
=
(
multimodal_config
.
mm_encoder_attn_backend
if
multimodal_config
is
not
None
else
None
)
self
.
visual
=
Glm4vVisionTransformer
(
config
.
vision_config
,
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-5
),
quant_config
=
quant_config
,
multimodal_config
=
multimodal_config
,
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
use_data_parallel
=
self
.
use_data_parallel
,
attn_backend_override
=
attn_backend_override
,
)
if
config
.
model_type
==
"glm4v"
:
...
...
vllm/model_executor/models/glm4_moe.py
View file @
a3f8d5dd
...
...
@@ -285,7 +285,6 @@ class Glm4MoeAttention(nn.Module):
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
)
...
...
vllm/model_executor/models/gpt_j.py
View file @
a3f8d5dd
...
...
@@ -95,12 +95,13 @@ class GPTJAttention(nn.Module):
scaling
=
self
.
head_size
**-
0.5
assert
getattr
(
config
,
"rotary"
,
True
)
assert
config
.
rotary_dim
%
2
==
0
rope_parameters
=
getattr
(
config
,
"rope_parameters"
,
{})
rope_parameters
[
"partial_rotary_factor"
]
=
config
.
rotary_dim
/
self
.
head_size
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
8192
)
self
.
rotary_emb
=
get_rope
(
self
.
head_size
,
rotary_dim
=
config
.
rotary_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
getattr
(
config
,
"
rope_parameters
"
,
None
)
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
False
,
)
self
.
attn
=
Attention
(
...
...
vllm/model_executor/models/gpt_neox.py
View file @
a3f8d5dd
...
...
@@ -92,7 +92,6 @@ class GPTNeoXAttention(nn.Module):
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
8192
)
self
.
rotary_emb
=
get_rope
(
self
.
head_size
,
rotary_dim
=
self
.
head_size
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
)
...
...
vllm/model_executor/models/gpt_oss.py
View file @
a3f8d5dd
...
...
@@ -67,7 +67,6 @@ class OAIAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
config
.
max_position_embeddings
,
dtype
=
torch
.
float32
,
rope_parameters
=
{
...
...
vllm/model_executor/models/granite.py
View file @
a3f8d5dd
...
...
@@ -160,7 +160,6 @@ class GraniteAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
)
...
...
vllm/model_executor/models/granitemoe.py
View file @
a3f8d5dd
...
...
@@ -190,7 +190,6 @@ class GraniteMoeAttention(nn.Module):
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/granitemoehybrid.py
View file @
a3f8d5dd
...
...
@@ -271,7 +271,6 @@ class GraniteMoeHybridAttention(nn.Module):
if
config
.
position_embedding_type
==
"rope"
:
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
config
.
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/grok1.py
View file @
a3f8d5dd
...
...
@@ -181,7 +181,6 @@ class Grok1Attention(nn.Module):
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position
,
rope_parameters
=
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/hunyuan_v1.py
View file @
a3f8d5dd
...
...
@@ -199,7 +199,6 @@ class HunYuanAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
is_neox_style
=
True
,
...
...
@@ -305,7 +304,6 @@ class HunYuanCrossAttention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
is_neox_style
=
True
,
...
...
vllm/model_executor/models/hunyuan_vision.py
View file @
a3f8d5dd
...
...
@@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
cu_seqlens
:
list
=
[
0
]
hidden_states
=
x
.
to
(
device
=
self
.
device
,
dtype
=
self
.
dtype
)
# embeddings = patch_embeds + patch_pos_embed
hidden_states
=
self
.
embeddings
(
hidden_states
,
grid_thw
)
for
t
,
h
,
w
in
grid_thw
:
...
...
@@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
hidden_states
=
hidden_states
.
reshape
(
seq_len
,
-
1
)
hidden_states
=
hidden_states
.
unsqueeze
(
0
)
for
layer_num
,
layer
in
enumerate
(
self
.
layers
):
hidden_states
=
layer
(
hidden_states
)
# build per-image lengths once
split_lengths
=
[
int
(
h
)
*
int
(
w
)
for
(
_
,
h
,
w
)
in
grid_thw
]
for
layer
in
self
.
layers
:
# hidden_states: (1, T_total, D)
parts
=
hidden_states
.
split
(
split_lengths
,
dim
=
1
)
# list of (1, L_i, D)
parts
=
[
layer
(
p
)
for
p
in
parts
]
hidden_states
=
torch
.
cat
(
parts
,
dim
=
1
)
# adapter
split_lengths
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
...
...
vllm/model_executor/models/interfaces.py
View file @
a3f8d5dd
...
...
@@ -53,6 +53,22 @@ The output embeddings must be one of the following formats:
"""
def
_require_is_multimodal
(
is_multimodal
:
Tensor
|
None
)
->
Tensor
:
"""
A helper function to be used in the context of
[vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][]
to provide a better error message.
"""
if
is_multimodal
is
None
:
raise
ValueError
(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
return
is_multimodal
@
runtime_checkable
class
SupportsMultiModal
(
Protocol
):
"""The interface required for all multi-modal models."""
...
...
@@ -111,13 +127,7 @@ class SupportsMultiModal(Protocol):
the appearances of their corresponding multimodal data item in the
input prompt.
"""
if
hasattr
(
self
,
"get_multimodal_embeddings"
):
logger
.
warning_once
(
"`get_multimodal_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_multimodal`."
)
return
self
.
get_multimodal_embeddings
(
**
kwargs
)
...
def
get_language_model
(
self
)
->
VllmModel
:
"""
...
...
@@ -196,17 +206,10 @@ class SupportsMultiModal(Protocol):
if
multimodal_embeddings
is
None
or
len
(
multimodal_embeddings
)
==
0
:
return
inputs_embeds
if
is_multimodal
is
None
:
raise
ValueError
(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
return
_merge_multimodal_embeddings
(
inputs_embeds
=
inputs_embeds
,
multimodal_embeddings
=
multimodal_embeddings
,
is_multimodal
=
is_multimodal
,
is_multimodal
=
_require_
is_multimodal
(
is_multimodal
)
,
)
...
...
vllm/model_executor/models/interfaces_base.py
View file @
a3f8d5dd
...
...
@@ -49,13 +49,7 @@ class VllmModel(Protocol[T_co]):
def
embed_input_ids
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""Apply token embeddings to `input_ids`."""
if
hasattr
(
self
,
"get_input_embeddings"
):
logger
.
warning_once
(
"`get_input_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_input_ids`."
)
return
self
.
get_input_embeddings
(
input_ids
)
...
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
)
->
T_co
:
...
...
...
@@ -68,15 +62,6 @@ def _check_vllm_model_init(model: type[object] | object) -> bool:
def
_check_vllm_model_embed_input_ids
(
model
:
type
[
object
]
|
object
)
->
bool
:
model_embed_input_ids
=
getattr
(
model
,
"embed_input_ids"
,
None
)
if
not
callable
(
model_embed_input_ids
):
model_get_input_embeddings
=
getattr
(
model
,
"get_input_embeddings"
,
None
)
if
callable
(
model_get_input_embeddings
):
logger
.
warning
(
"`get_input_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_input_ids`."
)
model
.
embed_input_ids
=
model_get_input_embeddings
return
True
logger
.
warning
(
"The model (%s) is missing the `embed_input_ids` method."
,
model
,
...
...
vllm/model_executor/models/internlm2.py
View file @
a3f8d5dd
...
...
@@ -140,7 +140,6 @@ class InternLM2Attention(nn.Module):
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
rope_parameters
,
)
...
...
Prev
1
…
13
14
15
16
17
18
19
20
21
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment