Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d75f22e
Commit
8d75f22e
authored
Dec 13, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori
parents
ce888aa4
7d80c73d
Changes
656
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
58 additions
and
100 deletions
+58
-100
vllm/model_executor/models/bamba.py
vllm/model_executor/models/bamba.py
+1
-3
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+0
-2
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+0
-2
vllm/model_executor/models/clip.py
vllm/model_executor/models/clip.py
+0
-1
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere2_vision.py
+0
-2
vllm/model_executor/models/config.py
vllm/model_executor/models/config.py
+24
-30
vllm/model_executor/models/deepseek_ocr.py
vllm/model_executor/models/deepseek_ocr.py
+2
-4
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+3
-3
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+0
-2
vllm/model_executor/models/dots_ocr.py
vllm/model_executor/models/dots_ocr.py
+0
-2
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+6
-8
vllm/model_executor/models/falcon_h1.py
vllm/model_executor/models/falcon_h1.py
+1
-3
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+0
-2
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+0
-2
vllm/model_executor/models/gemma3n_mm.py
vllm/model_executor/models/gemma3n_mm.py
+0
-1
vllm/model_executor/models/glm.py
vllm/model_executor/models/glm.py
+2
-1
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+1
-2
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+17
-26
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe.py
+1
-2
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+0
-2
No files found.
Too many changes to show.
To preserve performance only
656 of 656+
files are displayed.
Plain diff
Email patch
vllm/model_executor/models/bamba.py
View file @
8d75f22e
...
...
@@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
self
.
scaling
=
self
.
head_dim
**-
0.5
self
.
max_position_embeddings
=
max_position_embeddings
if
hasattr
(
config
,
"partial_rotary_factor"
):
rotary_dim
=
int
(
self
.
head_dim
*
config
.
partial_rotary_factor
)
elif
hasattr
(
config
,
"attn_rotary_emb"
):
if
hasattr
(
config
,
"attn_rotary_emb"
):
rotary_dim
=
config
.
attn_rotary_emb
# for backward compatibility
else
:
rotary_dim
=
self
.
head_dim
# default
...
...
vllm/model_executor/models/blip2.py
View file @
8d75f22e
...
...
@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
class
Blip2ForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
):
merge_by_field_config
=
True
@
classmethod
def
get_placeholder_str
(
cls
,
modality
:
str
,
i
:
int
)
->
str
|
None
:
if
modality
.
startswith
(
"image"
):
...
...
vllm/model_executor/models/chameleon.py
View file @
8d75f22e
...
...
@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
class
ChameleonForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
):
merge_by_field_config
=
True
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
"k_proj"
,
"v_proj"
],
"gate_up_proj"
:
[
"gate_proj"
,
"up_proj"
],
...
...
vllm/model_executor/models/clip.py
View file @
8d75f22e
...
...
@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
is_pooling_model
=
True
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
"k_proj"
,
"v_proj"
]}
merge_by_field_config
=
True
@
classmethod
def
get_placeholder_str
(
cls
,
modality
:
str
,
i
:
int
)
->
str
|
None
:
...
...
vllm/model_executor/models/cohere2_vision.py
View file @
8d75f22e
...
...
@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
dummy_inputs
=
Cohere2VisionDummyInputsBuilder
,
)
class
Cohere2VisionForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model.vision_tower."
:
"vision_tower."
,
...
...
vllm/model_executor/models/config.py
View file @
8d75f22e
...
...
@@ -4,11 +4,10 @@ from copy import deepcopy
from
math
import
lcm
from
typing
import
TYPE_CHECKING
import
vllm.envs
as
envs
from
vllm.attention.backends.registry
import
AttentionBackendEnum
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.platforms
import
current_platform
from
vllm.transformers_utils.config
import
set_default_rope_theta
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.v1.kv_cache_interface
import
FullAttentionSpec
,
MambaSpec
,
MLAAttentionSpec
...
...
@@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if
not
model_config
.
enforce_eager
:
max_position
=
round_up
(
max_position
,
8
)
set_default_rope_theta
(
config
,
default_theta
=
config
.
rotary_emb_base
)
config
.
rotary_kwargs
=
{
"head_size"
:
head_dim
,
"rotary_dim"
:
getattr
(
config
,
"rotary_emb_dim"
,
head_dim
),
...
...
@@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
rotary_emb_dim
=
int
(
head_dim
*
config
.
rotary_emb_fraction
)
max_trained_positions
=
getattr
(
config
,
"max_trained_positions"
,
2048
)
set_default_rope_theta
(
config
,
default_theta
=
config
.
rotary_emb_base
)
config
.
rotary_kwargs
=
{
"head_size"
:
head_dim
,
"rotary_dim"
:
rotary_emb_dim
,
...
...
@@ -336,6 +331,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
# Enable FULL_AND_PIECEWISE by default
MambaModelConfig
.
verify_and_update_config
(
vllm_config
)
attention_config
=
vllm_config
.
attention_config
cache_config
=
vllm_config
.
cache_config
model_config
=
vllm_config
.
model_config
parallel_config
=
vllm_config
.
parallel_config
...
...
@@ -352,7 +348,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
# * CUTLASS_MLA backend: kernel_block_size 128 alignment
# * Other MLA backends: kernel_block_size 64 alignment
if
model_config
.
use_mla
:
use_cutlass_mla
=
envs
.
VLLM_ATTENTION_BACKEND
==
"CUTLASS_MLA"
use_cutlass_mla
=
(
attention_config
.
backend
==
AttentionBackendEnum
.
CUTLASS_MLA
)
kernel_block_alignment_size
=
128
if
use_cutlass_mla
else
64
attn_page_size_1_token
=
MLAAttentionSpec
(
block_size
=
1
,
...
...
@@ -366,8 +364,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
current_platform
.
is_device_capability
(
100
)
and
model_config
.
get_head_size
()
==
256
and
(
envs
.
VLLM_ATTENTION_BACKEND
is
None
or
envs
.
VLLM_ATTENTION_BACKEND
==
"
FLASHINFER
"
attention_config
.
backend
is
None
or
attention_config
.
backend
==
AttentionBackendEnum
.
FLASHINFER
)
):
# https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
...
...
@@ -490,29 +488,24 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
logger
.
info
(
"Using bfloat16 kv-cache for DeepSeekV3.2"
)
class
DeepseekV32
ForCausalLM
(
VerifyAndUpdateConfig
):
@
classmethod
def
verify_and_update_config
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
"""
Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
class
NemotronH
ForCausalLM
Config
(
VerifyAndUpdateConfig
):
@
staticmethod
def
verify_and_update_config
(
vllm_config
:
"VllmConfig"
)
->
None
:
"""Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
(or not explicitly set), to the value specified in the HF config, or to
float16 if not specified.
"""
hf_config
=
vllm_config
.
model_config
.
hf_config
# Mirror the check in vllm/model_executor/models/deepseek_v2.py
# is_v32 = hasattr(hf_config, "index_topk")
# assert is_v32
# For DeepSeekV3.2, we use a custom fp8 format as default (i.e.
# "auto")
cache_config
=
vllm_config
.
cache_config
if
cache_config
.
cache_dtype
==
"auto"
or
\
cache_config
.
cache_dtype
.
startswith
(
"fp8"
):
cache_config
.
cache_dtype
=
"fp8_ds_mla"
logger
.
info
(
"Using custom fp8 kv-cache format for DeepSeekV3.2"
)
if
cache_config
.
cache_dtype
==
"bfloat16"
:
cache_config
.
cache_dtype
=
"auto"
logger
.
info
(
"Using bfloat16 kv-cache for DeepSeekV3.2"
)
if
cache_config
.
mamba_ssm_cache_dtype
==
"auto"
:
hf_config
=
vllm_config
.
model_config
.
hf_config
mamba_ssm_cache_dtype
=
getattr
(
hf_config
,
"mamba_ssm_cache_dtype"
,
"float16"
)
logger
.
info
(
"Updating mamba_ssm_cache_dtype to '%s' for NemotronH model"
,
mamba_ssm_cache_dtype
,
)
cache_config
.
mamba_ssm_cache_dtype
=
mamba_ssm_cache_dtype
MODELS_CONFIG_MAP
:
dict
[
str
,
type
[
VerifyAndUpdateConfig
]]
=
{
...
...
@@ -532,4 +525,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Mamba2ForCausalLM"
:
MambaModelConfig
,
"FalconMambaForCausalLM"
:
MambaModelConfig
,
"DeepseekV32ForCausalLM"
:
DeepseekV32ForCausalLM
,
"NemotronHForCausalLM"
:
NemotronHForCausalLMConfig
,
}
vllm/model_executor/models/deepseek_ocr.py
View file @
8d75f22e
...
...
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalKwargs
Items
,
NestedTensors
,
)
from
vllm.multimodal.parse
import
(
...
...
@@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
self
,
mm_items
:
MultiModalDataItems
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
out_mm_kwargs
:
MultiModalKwargs
,
out_mm_kwargs
:
MultiModalKwargs
Items
,
)
->
Sequence
[
PromptUpdate
]:
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
...
...
@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
dummy_inputs
=
DeepseekOCRDummyInputsBuilder
,
)
class
DeepseekOCRForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
# map prefix for language backbone
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
8d75f22e
...
...
@@ -686,11 +686,10 @@ def sparse_attn_indexer(
chunk
.
cu_seqlen_ke
,
)
num_rows
=
logits
.
shape
[
0
]
assert
topk_tokens
==
2048
,
"top_k_per_row assumes size 2048"
topk_indices
=
topk_indices_buffer
[
chunk
.
token_start
:
chunk
.
token_end
,
:
topk_tokens
]
torch
.
ops
.
_C
.
top_k_per_row
(
torch
.
ops
.
_C
.
top_k_per_row
_prefill
(
logits
,
chunk
.
cu_seqlen_ks
,
chunk
.
cu_seqlen_ke
,
...
...
@@ -698,6 +697,7 @@ def sparse_attn_indexer(
num_rows
,
logits
.
stride
(
0
),
logits
.
stride
(
1
),
topk_tokens
,
)
if
has_decode
:
...
...
@@ -740,7 +740,6 @@ def sparse_attn_indexer(
max_model_len
=
max_model_len
,
)
num_rows
=
logits
.
shape
[
0
]
assert
topk_tokens
==
2048
,
"top_k_per_row assumes size 2048"
topk_indices
=
topk_indices_buffer
[:
num_decode_tokens
,
:
topk_tokens
]
torch
.
ops
.
_C
.
top_k_per_row_decode
(
...
...
@@ -751,6 +750,7 @@ def sparse_attn_indexer(
num_rows
,
logits
.
stride
(
0
),
logits
.
stride
(
1
),
topk_tokens
,
)
if
decode_metadata
.
requires_padding
:
# if padded, we need to unpack
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
8d75f22e
...
...
@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
dummy_inputs
=
DeepseekVL2DummyInputsBuilder
,
)
class
DeepseekVLV2ForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"language."
:
"language_model."
,
...
...
vllm/model_executor/models/dots_ocr.py
View file @
8d75f22e
...
...
@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
dummy_inputs
=
DotsOCRDummyInputsBuilder
,
)
class
DotsOCRForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsLoRA
):
merge_by_field_config
=
True
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_substr
=
{
".attn.qkv_proj."
:
".attn.qkv."
,
...
...
vllm/model_executor/models/ernie45_vl.py
View file @
8d75f22e
...
...
@@ -289,12 +289,12 @@ class Ernie4_5_VisionAttention(nn.Module):
elif
self
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
:
# Execute attention entry by entry for speed & less VRAM.
outputs
=
[]
for
i
in
range
(
1
,
len
(
cu_seqlens
)):
start_idx
=
cu_seqlens
[
i
-
1
]
end_idx
=
cu_seqlens
[
i
]
q_i
=
q
[:,
start_idx
:
end_idx
]
k_i
=
k
[:,
start_idx
:
end_idx
]
v_i
=
v
[:,
start_idx
:
end_idx
]
lens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
q_chunks
=
torch
.
split
(
q
,
lens
,
dim
=
1
)
k_chunks
=
torch
.
split
(
k
,
lens
,
dim
=
1
)
v_chunks
=
torch
.
split
(
v
,
lens
,
dim
=
1
)
for
q_i
,
k_i
,
v_i
in
zip
(
q_chunks
,
k_chunks
,
v_chunks
):
q_i
,
k_i
,
v_i
=
(
rearrange
(
x
,
"b s h d -> b h s d"
)
for
x
in
[
q_i
,
k_i
,
v_i
]
)
...
...
@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
class
Ernie4_5_VLMoeForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsLoRA
,
SupportsPP
,
SupportsMRoPE
):
merge_by_field_config
=
True
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
...
...
vllm/model_executor/models/falcon_h1.py
View file @
8d75f22e
...
...
@@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self
.
scaling
=
self
.
head_dim
**-
0.5
self
.
max_position_embeddings
=
max_position_embeddings
if
hasattr
(
config
,
"partial_rotary_factor"
):
rotary_dim
=
self
.
head_dim
*
config
.
partial_rotary_factor
elif
hasattr
(
config
,
"attn_rotary_emb"
):
if
hasattr
(
config
,
"attn_rotary_emb"
):
rotary_dim
=
config
.
attn_rotary_emb
# for backward compatibility
else
:
rotary_dim
=
self
.
head_dim
# default
...
...
vllm/model_executor/models/fuyu.py
View file @
8d75f22e
...
...
@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
dummy_inputs
=
FuyuDummyInputsBuilder
,
)
class
FuyuForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model.vision_embed_tokens."
:
"vision_embed_tokens."
,
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
8d75f22e
...
...
@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
class
Gemma3ForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsLoRA
):
merge_by_field_config
=
True
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
...
...
vllm/model_executor/models/gemma3n_mm.py
View file @
8d75f22e
...
...
@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
class
Gemma3nForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsTranscription
):
merge_by_field_config
=
True
supported_languages
=
ISO639_1_SUPPORTED_LANGS
packed_modules_mapping
=
{
...
...
vllm/model_executor/models/glm.py
View file @
8d75f22e
...
...
@@ -10,7 +10,8 @@ from .utils import PPMissingLayer
class
GlmForCausalLM
(
LlamaForCausalLM
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
vllm_config
.
model_config
.
hf_config
.
partial_rotary_factor
=
0.5
hf_config
=
vllm_config
.
model_config
.
hf_config
hf_config
.
rope_parameters
[
"partial_rotary_factor"
]
=
0.5
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
# Hack Llama model to fit HF format GLM implementation
# Attention difference between GLM and Llama:
...
...
vllm/model_executor/models/glm4.py
View file @
8d75f22e
...
...
@@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
# Number of KV heads is less than TP size, so we replicate
# the KV heads across multiple tensor parallel GPUs.
assert
tp_size
%
self
.
total_num_kv_heads
==
0
partial_rotary_factor
=
getattr
(
config
,
"partial_rotary_factor"
,
0.5
)
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
num_kv_heads
=
max
(
1
,
self
.
total_num_kv_heads
//
tp_size
)
self
.
head_dim
=
head_dim
or
hidden_size
//
self
.
total_num_heads
self
.
rotary_dim
=
self
.
head_dim
...
...
@@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
rotary_dim
=
self
.
rotary_dim
,
max_position
=
max_position
,
rope_parameters
=
config
.
rope_parameters
,
partial_rotary_factor
=
partial_rotary_factor
,
is_neox_style
=
False
,
)
self
.
attn
=
Attention
(
...
...
vllm/model_executor/models/glm4_1v.py
View file @
8d75f22e
...
...
@@ -377,12 +377,12 @@ class Glm4vVisionAttention(nn.Module):
elif
self
.
attn_backend
==
AttentionBackendEnum
.
TORCH_SDPA
:
# Execute attention entry by entry for speed & less VRAM.
outputs
=
[]
for
i
in
range
(
1
,
len
(
cu_seqlens
)):
start_idx
=
cu_seqlens
[
i
-
1
]
end_idx
=
cu_seqlens
[
i
]
q_i
=
q
[:,
start_idx
:
end_idx
]
k_i
=
k
[:,
start_idx
:
end_idx
]
v_i
=
v
[:,
start_idx
:
end_idx
]
lens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
q_chunks
=
torch
.
split
(
q
,
lens
,
dim
=
1
)
k_chunks
=
torch
.
split
(
k
,
lens
,
dim
=
1
)
v_chunks
=
torch
.
split
(
v
,
lens
,
dim
=
1
)
for
q_i
,
k_i
,
v_i
in
zip
(
q_chunks
,
k_chunks
,
v_chunks
):
q_i
,
k_i
,
v_i
=
(
rearrange
(
x
,
"b s h d -> b h s d"
)
for
x
in
[
q_i
,
k_i
,
v_i
]
)
...
...
@@ -787,10 +787,10 @@ class Glm4vVisionTransformer(nn.Module):
def
forward
(
self
,
x
:
torch
.
Tensor
,
grid_thw
:
list
[
list
[
int
]],
grid_thw
:
torch
.
Tensor
|
list
[
list
[
int
]],
)
->
torch
.
Tensor
:
# Convert grid_thw to tensor (always expecting list format now)
grid_thw
=
torch
.
tensor
(
grid_thw
,
device
=
x
.
device
,
dtype
=
torch
.
long
)
if
isinstance
(
grid_thw
,
list
):
grid_thw
=
torch
.
tensor
(
grid_thw
,
dtype
=
torch
.
int32
)
# patchify
x
=
x
.
to
(
device
=
self
.
device
,
dtype
=
self
.
dtype
)
...
...
@@ -805,7 +805,8 @@ class Glm4vVisionTransformer(nn.Module):
cu_seqlens
=
torch
.
repeat_interleave
(
grid_thw
[:,
1
]
*
grid_thw
[:,
2
],
grid_thw
[:,
0
]
).
cumsum
(
dim
=
0
,
dtype
=
torch
.
int32
)
cu_seqlens
=
F
.
pad
(
cu_seqlens
,
(
1
,
0
),
"constant"
,
0
)
cu_seqlens
=
torch
.
cat
([
cu_seqlens
.
new_zeros
(
1
),
cu_seqlens
])
cu_seqlens
=
cu_seqlens
.
to
(
self
.
device
,
non_blocking
=
True
)
# pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
max_seqlen
=
self
.
compute_attn_mask_seqlen
(
cu_seqlens
)
...
...
@@ -1424,8 +1425,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
class
Glm4vForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsLoRA
,
SupportsPP
,
SupportsMRoPE
):
merge_by_field_config
=
True
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
...
...
@@ -1550,7 +1549,6 @@ class Glm4vForConditionalGeneration(
)
->
tuple
[
torch
.
Tensor
,
...]:
grid_thw
=
image_input
[
"image_grid_thw"
]
assert
grid_thw
.
ndim
==
2
grid_thw_list
=
grid_thw
.
tolist
()
if
image_input
[
"type"
]
==
"image_embeds"
:
image_embeds
=
image_input
[
"image_embeds"
].
type
(
self
.
visual
.
dtype
)
...
...
@@ -1561,12 +1559,10 @@ class Glm4vForConditionalGeneration(
self
.
visual
,
pixel_values
,
grid_thw
.
tolist
(),
rope_type
=
"rope_3d"
)
else
:
image_embeds
=
self
.
visual
(
pixel_values
,
grid_thw
=
grid_thw
.
tolist
())
image_embeds
=
self
.
visual
(
pixel_values
,
grid_thw
=
grid_thw
)
merge_size
=
self
.
visual
.
spatial_merge_size
sizes
=
(
torch
.
tensor
(
grid_thw_list
,
dtype
=
torch
.
long
).
prod
(
-
1
)
//
(
merge_size
*
merge_size
)
).
tolist
()
sizes
=
(
grid_thw
.
prod
(
-
1
)
//
merge_size
//
merge_size
).
tolist
()
return
image_embeds
.
split
(
sizes
)
def
_process_video_input
(
...
...
@@ -1574,7 +1570,6 @@ class Glm4vForConditionalGeneration(
)
->
tuple
[
torch
.
Tensor
,
...]:
grid_thw
=
video_input
[
"video_grid_thw"
]
assert
grid_thw
.
ndim
==
2
grid_thw_list
=
grid_thw
.
tolist
()
if
video_input
[
"type"
]
==
"video_embeds"
:
video_embeds
=
video_input
[
"video_embeds"
].
type
(
self
.
visual
.
dtype
)
...
...
@@ -1590,15 +1585,11 @@ class Glm4vForConditionalGeneration(
rope_type
=
"rope_3d"
,
)
else
:
video_embeds
=
self
.
visual
(
pixel_values_videos
,
grid_thw
=
grid_thw
.
tolist
()
)
video_embeds
=
self
.
visual
(
pixel_values_videos
,
grid_thw
=
grid_thw
)
# Split concatenated embeddings for each video item.
merge_size
=
self
.
visual
.
spatial_merge_size
sizes
=
(
torch
.
tensor
(
grid_thw_list
,
dtype
=
torch
.
long
).
prod
(
-
1
)
//
(
merge_size
*
merge_size
)
).
tolist
()
sizes
=
(
grid_thw
.
prod
(
-
1
)
//
merge_size
//
merge_size
).
tolist
()
return
video_embeds
.
split
(
sizes
)
def
_parse_and_validate_multimodal_inputs
(
self
,
**
kwargs
:
object
)
->
dict
:
...
...
vllm/model_executor/models/glm4_moe.py
View file @
8d75f22e
...
...
@@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
prefix
=
f
"
{
prefix
}
.o_proj"
,
)
partial_rotary_factor
=
getattr
(
config
,
"partial_rotary_factor"
,
0.5
)
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
rope_parameters
=
config
.
rope_parameters
,
partial_rotary_factor
=
partial_rotary_factor
,
)
self
.
attn
=
Attention
(
self
.
num_heads
,
...
...
vllm/model_executor/models/glm4v.py
View file @
8d75f22e
...
...
@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
class
GLM4VForCausalLM
(
ChatGLMBaseModel
,
SupportsMultiModal
,
SupportsLoRA
,
SupportsPP
,
SupportsMRoPE
):
merge_by_field_config
=
True
packed_modules_mapping
=
{
"query_key_value"
:
[
"query_key_value"
],
"dense_h_to_4h"
:
[
"dense_h_to_4h"
],
...
...
Prev
1
…
23
24
25
26
27
28
29
30
31
…
33
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment