Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eefa41c1
Commit
eefa41c1
authored
Mar 24, 2026
by
zhuwenwen
Browse files
sync v0.18.0
parent
82155c76
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
184 additions
and
49 deletions
+184
-49
vllm/model_executor/models/dots_ocr.py
vllm/model_executor/models/dots_ocr.py
+2
-2
vllm/model_executor/models/eagle2_5_vl.py
vllm/model_executor/models/eagle2_5_vl.py
+2
-3
vllm/model_executor/models/ernie45_moe.py
vllm/model_executor/models/ernie45_moe.py
+3
-3
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+2
-2
vllm/model_executor/models/ernie45_vl_moe.py
vllm/model_executor/models/ernie45_vl_moe.py
+3
-3
vllm/model_executor/models/ernie_mtp.py
vllm/model_executor/models/ernie_mtp.py
+2
-2
vllm/model_executor/models/exaone.py
vllm/model_executor/models/exaone.py
+2
-2
vllm/model_executor/models/exaone4.py
vllm/model_executor/models/exaone4.py
+2
-2
vllm/model_executor/models/exaone_moe.py
vllm/model_executor/models/exaone_moe.py
+2
-2
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+2
-2
vllm/model_executor/models/falcon_h1.py
vllm/model_executor/models/falcon_h1.py
+2
-2
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+2
-2
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+3
-3
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+2
-2
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3.py
+2
-2
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+40
-2
vllm/model_executor/models/gemma3n.py
vllm/model_executor/models/gemma3n.py
+4
-4
vllm/model_executor/models/gemma3n_mm.py
vllm/model_executor/models/gemma3n_mm.py
+2
-2
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+101
-4
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+4
-3
No files found.
vllm/model_executor/models/dots_ocr.py
View file @
eefa41c1
...
...
@@ -754,7 +754,7 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -790,4 +790,4 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
language_model
=
"language_model"
,
connector
=
"vision_tower.merger"
,
tower_model
=
"vision_tower."
,
)
)
\ No newline at end of file
vllm/model_executor/models/eagle2_5_vl.py
View file @
eefa41c1
...
...
@@ -429,7 +429,7 @@ class Eagle2_5_VLForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -437,7 +437,6 @@ class Eagle2_5_VLForConditionalGeneration(
)
->
IntermediateTensors
:
"""Forward pass through the model."""
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
forward_kwargs
=
{
...
...
@@ -468,4 +467,4 @@ class Eagle2_5_VLForConditionalGeneration(
language_model
=
"language_model"
,
connector
=
"mlp1"
,
tower_model
=
"vision_model"
,
)
)
\ No newline at end of file
vllm/model_executor/models/ernie45_moe.py
View file @
eefa41c1
...
...
@@ -465,7 +465,7 @@ class Ernie4_5_MoeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -727,7 +727,7 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -752,4 +752,4 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
return
loader
.
load_weights
(
weights
)
def
get_expert_mapping
(
self
)
->
list
[
tuple
[
str
,
str
,
int
,
str
]]:
return
self
.
model
.
get_expert_mapping
()
return
self
.
model
.
get_expert_mapping
()
\ No newline at end of file
vllm/model_executor/models/ernie45_vl.py
View file @
eefa41c1
...
...
@@ -1680,7 +1680,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1716,4 +1716,4 @@ class Ernie4_5_VLMoeForConditionalGeneration(
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/models/ernie45_vl_moe.py
View file @
eefa41c1
...
...
@@ -563,7 +563,7 @@ class Ernie4_5_VLMoeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -644,7 +644,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -798,4 +798,4 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
return
loaded_params
\ No newline at end of file
vllm/model_executor/models/ernie_mtp.py
View file @
eefa41c1
...
...
@@ -164,7 +164,7 @@ class ErnieMTP(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
@@ -275,4 +275,4 @@ class ErnieMTP(nn.Module):
name
=
name
.
replace
(
"model.mtp_block.0."
,
f
"model.layers.
{
layer_idx
}
.mtp_block."
)
return
name
return
name
\ No newline at end of file
vllm/model_executor/models/exaone.py
View file @
eefa41c1
...
...
@@ -496,7 +496,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -521,4 +521,4 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# processed with quantization, LoRA, fine-tuning, etc.
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/exaone4.py
View file @
eefa41c1
...
...
@@ -490,7 +490,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -515,4 +515,4 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# processed with quantization, LoRA, fine-tuning, etc.
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/exaone_moe.py
View file @
eefa41c1
...
...
@@ -549,7 +549,7 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -576,4 +576,4 @@ class ExaoneMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
[
"lm_head."
,
"mtp."
]
if
self
.
config
.
tie_word_embeddings
else
[
"mtp."
]
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/falcon.py
View file @
eefa41c1
...
...
@@ -402,7 +402,7 @@ class FalconModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -540,4 +540,4 @@ class FalconForCausalLM(nn.Module, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/falcon_h1.py
View file @
eefa41c1
...
...
@@ -465,7 +465,7 @@ class FalconH1Model(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -608,7 +608,7 @@ class FalconH1ForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/fuyu.py
View file @
eefa41c1
...
...
@@ -340,7 +340,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -365,4 +365,4 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gemma.py
View file @
eefa41c1
...
...
@@ -297,7 +297,7 @@ class GemmaModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -399,7 +399,7 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -421,4 +421,4 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gemma2.py
View file @
eefa41c1
...
...
@@ -406,7 +406,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -428,4 +428,4 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gemma3.py
View file @
eefa41c1
...
...
@@ -494,7 +494,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -517,4 +517,4 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gemma3_mm.py
View file @
eefa41c1
...
...
@@ -606,7 +606,7 @@ class Gemma3ForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -643,4 +643,42 @@ class Gemma3ForConditionalGeneration(
language_model
=
"language_model"
,
connector
=
"multi_modal_projector"
,
tower_model
=
"vision_tower"
,
)
\ No newline at end of file
)
def
get_num_mm_encoder_tokens
(
self
,
num_image_tokens
:
int
)
->
int
:
"""
Calculate the number of tokens output by the vision encoder.
The vision encoder processes images into patch embeddings. For Gemma3,
the relationship between prompt placeholder tokens and actual vision
encoder output tokens depends on the patch grid size.
Args:
num_image_tokens: Number of image placeholder tokens in the prompt
(typically mm_tokens_per_image per image)
Returns:
Number of tokens output by the vision encoder
"""
# For Gemma3, the vision encoder outputs tokens_per_side x tokens_per_side
# tokens per image. Since num_image_tokens represents the number of
# connector output tokens (mm_tokens_per_image = 256), and tokens_per_side
# is sqrt(256) = 16, we need to account for the token expansion.
# Based on empirical testing, the multiplier of 16 works correctly.
return
num_image_tokens
*
16
def
get_num_mm_connector_tokens
(
self
,
num_vision_tokens
:
int
)
->
int
:
"""
Calculate the number of tokens output by the multimodal connector.
The connector applies projection and normalization but maintains the
token count for Gemma3.
Args:
num_vision_tokens: Number of tokens from vision encoder
Returns:
Number of tokens after connector processing
"""
# The Gemma3 connector maintains a 1:1 token mapping
return
num_vision_tokens
\ No newline at end of file
vllm/model_executor/models/gemma3n.py
View file @
eefa41c1
...
...
@@ -704,7 +704,7 @@ class Gemma3nSelfDecoder(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -887,7 +887,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def
fast_prefill_forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -964,7 +964,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def
normal_forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1131,7 +1131,7 @@ class Gemma3nForCausalLM(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
*
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/gemma3n_mm.py
View file @
eefa41c1
...
...
@@ -713,7 +713,7 @@ class Gemma3nForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -822,4 +822,4 @@ class Gemma3nForConditionalGeneration(
sample_rate
=
16000
,
# TODO enable chunking after more thorough testing.
min_energy_split_window_size
=
None
,
)
)
\ No newline at end of file
vllm/model_executor/models/glm4.py
View file @
eefa41c1
...
...
@@ -39,13 +39,22 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.llama
import
LlamaMLP
as
Glm4MLP
from
.llama
import
LlamaModel
from
.utils
import
AutoWeightsLoader
,
PPMissingLayer
,
maybe_prefix
from
.utils
import
(
AutoWeightsLoader
,
PPMissingLayer
,
is_pp_missing_parameter
,
maybe_prefix
,
)
class
Glm4Attention
(
nn
.
Module
):
...
...
@@ -78,7 +87,15 @@ class Glm4Attention(nn.Module):
# Number of KV heads is less than TP size, so we replicate
# the KV heads across multiple tensor parallel GPUs.
assert
tp_size
%
self
.
total_num_kv_heads
==
0
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
rope_params
=
getattr
(
config
,
"rope_parameters"
,
None
)
if
isinstance
(
rope_params
,
dict
)
and
"partial_rotary_factor"
in
rope_params
:
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
rope_params
[
"partial_rotary_factor"
]
)
else
:
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
num_kv_heads
=
max
(
1
,
self
.
total_num_kv_heads
//
tp_size
)
self
.
head_dim
=
head_dim
or
hidden_size
//
self
.
total_num_heads
self
.
q_size
=
self
.
num_heads
*
self
.
head_dim
...
...
@@ -220,6 +237,73 @@ class Glm4Model(LlamaModel):
vllm_config
=
vllm_config
,
prefix
=
prefix
,
layer_type
=
Glm4DecoderLayer
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
(
".qkv_proj"
,
".q_proj"
,
"q"
),
(
".qkv_proj"
,
".k_proj"
,
"k"
),
(
".qkv_proj"
,
".v_proj"
,
"v"
),
(
".gate_up_proj"
,
".gate_proj"
,
0
),
(
".gate_up_proj"
,
".up_proj"
,
1
),
]
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
set
[
str
]
=
set
()
for
name
,
loaded_weight
in
weights
:
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
if
spec_layer
is
not
None
:
continue
if
"rotary_emb.inv_freq"
in
name
:
continue
if
"rotary_emb.cos_cached"
in
name
or
"rotary_emb.sin_cached"
in
name
:
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if
self
.
quant_config
is
not
None
and
(
scale_name
:
=
self
.
quant_config
.
get_cache_scale
(
name
)
):
# Loading kv cache quantization scales
param
=
params_dict
[
scale_name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
loaded_weight
=
(
loaded_weight
if
loaded_weight
.
dim
()
==
0
else
loaded_weight
[
0
]
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
continue
if
"scale"
in
name
or
"zero_point"
in
name
:
# Remapping the name of FP8 kv-scale or zero point.
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
break
else
:
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
class
Glm4ForCausalLM
(
nn
.
Module
,
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
...
...
@@ -270,7 +354,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -292,4 +376,17 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
return
loader
.
load_weights
(
weights
)
def
get_spec_layer_idx_from_weight_name
(
config
:
Glm4Config
,
weight_name
:
str
)
->
int
|
None
:
if
hasattr
(
config
,
"num_nextn_predict_layers"
)
and
(
config
.
num_nextn_predict_layers
>
0
):
layer_idx
=
config
.
num_hidden_layers
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
\ No newline at end of file
vllm/model_executor/models/glm4_1v.py
View file @
eefa41c1
...
...
@@ -24,7 +24,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GLM-4V model compatible with HuggingFace weights."""
"""Inference-only GLM-4.1V & GLM-4.6V-Flash, AutoGLM-Phone-9B model
compatible with HuggingFace weights."""
import
math
from
collections.abc
import
Callable
,
Iterable
,
Iterator
,
Mapping
,
Sequence
...
...
@@ -1447,7 +1448,7 @@ class Glm4vForConditionalGeneration(
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
)
if
config
.
model_type
==
"glm4v"
:
if
config
.
model_type
in
(
"glm4v"
,
"glm_ocr"
)
:
architectures
=
[
"Glm4ForCausalLM"
]
elif
config
.
model_type
==
"glm4v_moe"
:
architectures
=
[
"Glm4MoeForCausalLM"
]
...
...
@@ -1664,7 +1665,7 @@ class Glm4vForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment