Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
df704163
Commit
df704163
authored
Feb 06, 2026
by
zhuwenwen
Browse files
sync v0.15.1 (models)
parent
d7db129a
Changes
169
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
46 additions
and
855 deletions
+46
-855
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+2
-40
vllm/model_executor/models/gemma3n.py
vllm/model_executor/models/gemma3n.py
+5
-5
vllm/model_executor/models/gemma3n_mm.py
vllm/model_executor/models/gemma3n_mm.py
+2
-2
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+3
-97
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+2
-3
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe.py
+3
-3
vllm/model_executor/models/glm4_moe_lite.py
vllm/model_executor/models/glm4_moe_lite.py
+3
-3
vllm/model_executor/models/glm4_moe_lite_mtp.py
vllm/model_executor/models/glm4_moe_lite_mtp.py
+2
-2
vllm/model_executor/models/glm4_moe_mtp.py
vllm/model_executor/models/glm4_moe_mtp.py
+1
-1
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+2
-2
vllm/model_executor/models/glm_ocr.py
vllm/model_executor/models/glm_ocr.py
+0
-389
vllm/model_executor/models/glm_ocr_mtp.py
vllm/model_executor/models/glm_ocr_mtp.py
+0
-285
vllm/model_executor/models/glmasr.py
vllm/model_executor/models/glmasr.py
+2
-2
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt2.py
+4
-4
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_bigcode.py
+3
-3
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_j.py
+3
-3
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/gpt_neox.py
+2
-3
vllm/model_executor/models/gpt_oss.py
vllm/model_executor/models/gpt_oss.py
+3
-4
vllm/model_executor/models/granite.py
vllm/model_executor/models/granite.py
+2
-2
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+2
-2
No files found.
vllm/model_executor/models/gemma3_mm.py
View file @
df704163
...
...
@@ -618,7 +618,7 @@ class Gemma3ForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -655,42 +655,4 @@ class Gemma3ForConditionalGeneration(
language_model
=
"language_model"
,
connector
=
"multi_modal_projector"
,
tower_model
=
"vision_tower"
,
)
def
get_num_mm_encoder_tokens
(
self
,
num_image_tokens
:
int
)
->
int
:
"""
Calculate the number of tokens output by the vision encoder.
The vision encoder processes images into patch embeddings. For Gemma3,
the relationship between prompt placeholder tokens and actual vision
encoder output tokens depends on the patch grid size.
Args:
num_image_tokens: Number of image placeholder tokens in the prompt
(typically mm_tokens_per_image per image)
Returns:
Number of tokens output by the vision encoder
"""
# For Gemma3, the vision encoder outputs tokens_per_side x tokens_per_side
# tokens per image. Since num_image_tokens represents the number of
# connector output tokens (mm_tokens_per_image = 256), and tokens_per_side
# is sqrt(256) = 16, we need to account for the token expansion.
# Based on empirical testing, the multiplier of 16 works correctly.
return
num_image_tokens
*
16
def
get_num_mm_connector_tokens
(
self
,
num_vision_tokens
:
int
)
->
int
:
"""
Calculate the number of tokens output by the multimodal connector.
The connector applies projection and normalization but maintains the
token count for Gemma3.
Args:
num_vision_tokens: Number of tokens from vision encoder
Returns:
Number of tokens after connector processing
"""
# The Gemma3 connector maintains a 1:1 token mapping
return
num_vision_tokens
)
\ No newline at end of file
vllm/model_executor/models/gemma3n.py
View file @
df704163
...
...
@@ -704,7 +704,7 @@ class Gemma3nSelfDecoder(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -887,7 +887,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def
fast_prefill_forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -964,7 +964,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
def
normal_forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1131,7 +1131,7 @@ class Gemma3nForCausalLM(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
*
,
per_layer_inputs
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1163,4 +1163,4 @@ class Gemma3nForCausalLM(nn.Module):
[
"embed_audio."
,
"embed_vision."
,
"audio_tower."
,
"vision_tower."
]
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gemma3n_mm.py
View file @
df704163
...
...
@@ -707,7 +707,7 @@ class Gemma3nForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -815,4 +815,4 @@ class Gemma3nForConditionalGeneration(
sample_rate
=
16000
,
# TODO enable chunking after more thorough testing.
min_energy_split_window_size
=
None
,
)
)
\ No newline at end of file
vllm/model_executor/models/glm4.py
View file @
df704163
...
...
@@ -41,22 +41,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.v1.attention.backend
import
AttentionType
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.llama
import
LlamaMLP
as
Glm4MLP
from
.llama
import
LlamaModel
from
.utils
import
(
AutoWeightsLoader
,
PPMissingLayer
,
is_pp_missing_parameter
,
maybe_prefix
,
)
from
.utils
import
AutoWeightsLoader
,
PPMissingLayer
,
maybe_prefix
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm
import
_custom_ops
as
ops
...
...
@@ -94,12 +85,7 @@ class Glm4Attention(nn.Module):
assert
tp_size
%
self
.
total_num_kv_heads
==
0
rope_params
=
getattr
(
config
,
"rope_parameters"
,
None
)
if
isinstance
(
rope_params
,
dict
)
and
"partial_rotary_factor"
in
rope_params
:
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
rope_params
[
"partial_rotary_factor"
]
)
else
:
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
config
.
rope_parameters
.
setdefault
(
"partial_rotary_factor"
,
0.5
)
self
.
num_kv_heads
=
max
(
1
,
self
.
total_num_kv_heads
//
tp_size
)
self
.
head_dim
=
head_dim
or
hidden_size
//
self
.
total_num_heads
...
...
@@ -242,73 +228,6 @@ class Glm4Model(LlamaModel):
vllm_config
=
vllm_config
,
prefix
=
prefix
,
layer_type
=
Glm4DecoderLayer
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
(
".qkv_proj"
,
".q_proj"
,
"q"
),
(
".qkv_proj"
,
".k_proj"
,
"k"
),
(
".qkv_proj"
,
".v_proj"
,
"v"
),
(
".gate_up_proj"
,
".gate_proj"
,
0
),
(
".gate_up_proj"
,
".up_proj"
,
1
),
]
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
set
[
str
]
=
set
()
for
name
,
loaded_weight
in
weights
:
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
if
spec_layer
is
not
None
:
continue
if
"rotary_emb.inv_freq"
in
name
:
continue
if
"rotary_emb.cos_cached"
in
name
or
"rotary_emb.sin_cached"
in
name
:
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
continue
if
self
.
quant_config
is
not
None
and
(
scale_name
:
=
self
.
quant_config
.
get_cache_scale
(
name
)
):
# Loading kv cache quantization scales
param
=
params_dict
[
scale_name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
loaded_weight
=
(
loaded_weight
if
loaded_weight
.
dim
()
==
0
else
loaded_weight
[
0
]
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
continue
if
"scale"
in
name
or
"zero_point"
in
name
:
# Remapping the name of FP8 kv-scale or zero point.
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
break
else
:
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
class
Glm4ForCausalLM
(
nn
.
Module
,
SupportsLoRA
,
SupportsPP
):
packed_modules_mapping
=
{
...
...
@@ -371,7 +290,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -394,16 +313,3 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
def
get_spec_layer_idx_from_weight_name
(
config
:
Glm4Config
,
weight_name
:
str
)
->
int
|
None
:
if
hasattr
(
config
,
"num_nextn_predict_layers"
)
and
(
config
.
num_nextn_predict_layers
>
0
):
layer_idx
=
config
.
num_hidden_layers
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
vllm/model_executor/models/glm4_1v.py
View file @
df704163
...
...
@@ -24,8 +24,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GLM-4.1V & GLM-4.6V-Flash, AutoGLM-Phone-9B model
compatible with HuggingFace weights."""
"""Inference-only GLM-4V model compatible with HuggingFace weights."""
import
os
import
itertools
...
...
@@ -1739,7 +1738,7 @@ class Glm4vForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/glm4_moe.py
View file @
df704163
...
...
@@ -451,7 +451,7 @@ class Glm4MoeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -687,7 +687,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExper
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -722,4 +722,4 @@ def get_spec_layer_idx_from_weight_name(
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
vllm/model_executor/models/glm4_moe_lite.py
View file @
df704163
...
...
@@ -264,7 +264,7 @@ class Glm4MoeLiteModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -596,7 +596,7 @@ class Glm4MoeLiteForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -639,4 +639,4 @@ def get_spec_layer_idx_from_weight_name(
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
vllm/model_executor/models/glm4_moe_lite_mtp.py
View file @
df704163
...
...
@@ -230,7 +230,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
@@ -461,4 +461,4 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
elif
shared_weight
:
# treat shared weights as top level weights
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
"model."
)
return
name
return
name
\ No newline at end of file
vllm/model_executor/models/glm4_moe_mtp.py
View file @
df704163
...
...
@@ -216,7 +216,7 @@ class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
vllm/model_executor/models/glm4v.py
View file @
df704163
...
...
@@ -769,7 +769,7 @@ class GLM4VForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -782,4 +782,4 @@ class GLM4VForCausalLM(
input_ids
,
positions
,
intermediate_tensors
,
inputs_embeds
)
return
hidden_states
return
hidden_states
\ No newline at end of file
vllm/model_executor/models/glm_ocr.py
deleted
100644 → 0
View file @
d7db129a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/Glm4v/modeling_Glm4v.py
# Copyright 2026 The ZhipuAI Team.
# Copyright 2026 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
# All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GLM-OCR model compatible with HuggingFace weights."""
from
collections.abc
import
Callable
from
functools
import
partial
from
typing
import
TYPE_CHECKING
import
torch
import
torch.nn
as
nn
from
einops
import
rearrange
if
TYPE_CHECKING
:
from
transformers.models.glm_ocr.configuration_glm_ocr
import
GlmOcrVisionConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
,
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention.mm_encoder_attention
import
(
MMEncoderAttention
,
)
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
RowParallelLinear
,
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding.common
import
(
ApplyRotaryEmb
,
)
from
vllm.model_executor.models.glm4_1v
import
(
Glm4vDummyInputsBuilder
,
Glm4vForConditionalGeneration
,
Glm4vMultiModalProcessor
,
Glm4vPatchMerger
,
Glm4vProcessingInfo
,
Glm4vVisionBlock
,
Glm4vVisionMLP
,
Glm4vVisionPatchEmbed
,
Glm4vVisionTransformer
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
.utils
import
(
maybe_prefix
,
)
from
.vision
import
(
get_vit_attn_backend
,
is_vit_use_data_parallel
,
)
logger
=
init_logger
(
__name__
)
class
GlmOcrVisionMLP
(
Glm4vVisionMLP
):
pass
class
GlmOcrVisionAttention
(
nn
.
Module
):
def
__init__
(
self
,
embed_dim
:
int
,
num_heads
:
int
,
projection_size
:
int
,
quant_config
:
QuantizationConfig
|
None
=
None
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
# Per attention head and per partition values.
use_data_parallel
=
is_vit_use_data_parallel
()
self
.
tp_size
=
(
1
if
use_data_parallel
else
get_tensor_model_parallel_world_size
()
)
self
.
tp_rank
=
(
0
if
use_data_parallel
else
parallel_state
.
get_tensor_model_parallel_rank
()
)
self
.
hidden_size_per_attention_head
=
dist_utils
.
divide
(
projection_size
,
num_heads
)
self
.
num_attention_heads_per_partition
=
dist_utils
.
divide
(
num_heads
,
self
.
tp_size
)
self
.
head_dim
=
embed_dim
//
num_heads
self
.
q_norm
=
RMSNorm
(
self
.
head_dim
,
eps
=
1e-5
)
self
.
k_norm
=
RMSNorm
(
self
.
head_dim
,
eps
=
1e-5
)
self
.
qkv
=
QKVParallelLinear
(
hidden_size
=
embed_dim
,
head_size
=
self
.
hidden_size_per_attention_head
,
total_num_heads
=
num_heads
,
total_num_kv_heads
=
num_heads
,
bias
=
True
,
quant_config
=
quant_config
,
# Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
prefix
=
f
"
{
prefix
}
.qkv_proj"
if
quant_config
else
f
"
{
prefix
}
.qkv"
,
disable_tp
=
use_data_parallel
,
)
self
.
proj
=
RowParallelLinear
(
input_size
=
projection_size
,
output_size
=
embed_dim
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.proj"
,
bias
=
True
,
disable_tp
=
use_data_parallel
,
)
self
.
attn
=
MMEncoderAttention
(
num_heads
=
self
.
num_attention_heads_per_partition
,
head_size
=
self
.
hidden_size_per_attention_head
,
scale
=
self
.
hidden_size_per_attention_head
**-
0.5
,
)
self
.
apply_rotary_emb
=
ApplyRotaryEmb
(
enforce_enable
=
True
)
def
split_qkv
(
self
,
qkv
:
torch
.
Tensor
)
->
tuple
[
torch
.
Tensor
,
...]:
# [s, b, 3 * head * head_dim]
seq_len
,
bs
,
_
=
qkv
.
shape
# [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
q
,
k
,
v
=
qkv
.
chunk
(
3
,
dim
=
2
)
# 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
new_shape
=
(
seq_len
,
bs
,
self
.
num_attention_heads_per_partition
,
self
.
hidden_size_per_attention_head
,
)
q
,
k
,
v
=
(
x
.
view
(
*
new_shape
)
for
x
in
(
q
,
k
,
v
))
return
q
,
k
,
v
def
forward
(
self
,
x
:
torch
.
Tensor
,
cu_seqlens
:
torch
.
Tensor
,
rotary_pos_emb_cos
:
torch
.
Tensor
,
rotary_pos_emb_sin
:
torch
.
Tensor
,
max_seqlen
:
torch
.
Tensor
|
None
=
None
,
# Only used for Flash Attention
)
->
torch
.
Tensor
:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x
,
_
=
self
.
qkv
(
x
)
# [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
q
,
k
,
v
=
self
.
split_qkv
(
x
)
# RMSNorm on q, k
q_shape
,
k_shape
=
q
.
shape
,
k
.
shape
q
=
self
.
q_norm
(
q
.
reshape
(
-
1
,
self
.
head_dim
)).
view
(
q_shape
)
k
=
self
.
k_norm
(
k
.
reshape
(
-
1
,
self
.
head_dim
)).
view
(
k_shape
)
q
,
k
,
v
=
(
rearrange
(
x
,
"s b ... -> b s ..."
).
contiguous
()
for
x
in
(
q
,
k
,
v
))
if
rotary_pos_emb_cos
is
not
None
and
rotary_pos_emb_sin
is
not
None
:
# [2 * b, s, heads, head_dim]
qk_concat
=
torch
.
cat
([
q
,
k
],
dim
=
0
)
qk_rotated
=
self
.
apply_rotary_emb
(
qk_concat
,
rotary_pos_emb_cos
,
rotary_pos_emb_sin
,
)
q
,
k
=
torch
.
chunk
(
qk_rotated
,
2
,
dim
=
0
)
context_layer
=
self
.
attn
(
query
=
q
,
key
=
k
,
value
=
v
,
cu_seqlens
=
cu_seqlens
,
max_seqlen
=
max_seqlen
,
)
context_layer
=
rearrange
(
context_layer
,
"b s h d -> s b (h d)"
).
contiguous
()
output
,
_
=
self
.
proj
(
context_layer
)
return
output
class
GlmOcrVisionBlock
(
Glm4vVisionBlock
):
def
__init__
(
self
,
dim
:
int
,
num_heads
:
int
,
mlp_hidden_dim
:
int
,
norm_layer
:
Callable
[[
int
],
nn
.
Module
]
|
None
=
None
,
quant_config
:
QuantizationConfig
|
None
=
None
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
(
dim
,
num_heads
,
mlp_hidden_dim
,
norm_layer
,
quant_config
,
prefix
,
)
if
norm_layer
is
None
:
norm_layer
=
partial
(
nn
.
LayerNorm
,
eps
=
1e-6
)
self
.
norm1
=
norm_layer
(
dim
)
self
.
norm2
=
norm_layer
(
dim
)
self
.
attn
=
GlmOcrVisionAttention
(
embed_dim
=
dim
,
num_heads
=
num_heads
,
projection_size
=
dim
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.attn"
,
)
self
.
mlp
=
GlmOcrVisionMLP
(
dim
,
mlp_hidden_dim
,
bias
=
True
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
class
GlmOcrVisionPatchEmbed
(
Glm4vVisionPatchEmbed
):
pass
class
GlmOcrPatchMerger
(
Glm4vPatchMerger
):
pass
class
GlmOcrVisionTransformer
(
Glm4vVisionTransformer
):
def
__init__
(
self
,
vision_config
:
GlmOcrVisionConfig
,
norm_eps
:
float
=
1e-5
,
quant_config
:
QuantizationConfig
|
None
=
None
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
(
vision_config
,
norm_eps
,
quant_config
,
prefix
)
del
self
.
post_conv_layernorm
del
self
.
embeddings
patch_size
=
vision_config
.
patch_size
temporal_patch_size
=
vision_config
.
temporal_patch_size
in_channels
=
vision_config
.
in_channels
depth
=
vision_config
.
depth
self
.
hidden_size
=
vision_config
.
hidden_size
self
.
num_heads
=
vision_config
.
num_heads
self
.
patch_size
=
vision_config
.
patch_size
self
.
spatial_merge_size
=
vision_config
.
spatial_merge_size
self
.
out_hidden_size
=
vision_config
.
out_hidden_size
self
.
patch_embed
=
Glm4vVisionPatchEmbed
(
patch_size
=
patch_size
,
temporal_patch_size
=
temporal_patch_size
,
in_channels
=
in_channels
,
hidden_size
=
self
.
hidden_size
,
)
norm_layer
=
partial
(
RMSNorm
,
eps
=
norm_eps
)
head_dim
=
self
.
hidden_size
//
self
.
num_heads
self
.
rotary_pos_emb
=
get_rope
(
head_size
=
head_dim
,
max_position
=
8192
,
is_neox_style
=
True
,
rope_parameters
=
{
"partial_rotary_factor"
:
0.5
},
)
self
.
blocks
=
nn
.
ModuleList
(
[
GlmOcrVisionBlock
(
dim
=
self
.
hidden_size
,
num_heads
=
self
.
num_heads
,
mlp_hidden_dim
=
vision_config
.
intermediate_size
,
norm_layer
=
norm_layer
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.blocks.
{
layer_idx
}
"
,
)
for
layer_idx
in
range
(
depth
)
]
)
self
.
merger
=
GlmOcrPatchMerger
(
d_model
=
vision_config
.
out_hidden_size
,
context_dim
=
vision_config
.
out_hidden_size
*
vision_config
.
in_channels
,
quant_config
=
quant_config
,
bias
=
False
,
prefix
=
f
"
{
prefix
}
.merger"
,
)
self
.
downsample
=
Conv2dLayer
(
in_channels
=
vision_config
.
hidden_size
,
out_channels
=
vision_config
.
out_hidden_size
,
kernel_size
=
vision_config
.
spatial_merge_size
,
stride
=
vision_config
.
spatial_merge_size
,
)
self
.
post_layernorm
=
RMSNorm
(
vision_config
.
hidden_size
,
eps
=
vision_config
.
rms_norm_eps
)
self
.
attn_backend
=
get_vit_attn_backend
(
head_size
=
head_dim
,
dtype
=
torch
.
get_default_dtype
(),
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
grid_thw
:
torch
.
Tensor
|
list
[
list
[
int
]],
)
->
torch
.
Tensor
:
if
isinstance
(
grid_thw
,
list
):
grid_thw
=
torch
.
tensor
(
grid_thw
,
dtype
=
torch
.
int32
)
# patchify
x
=
x
.
to
(
device
=
self
.
device
,
dtype
=
self
.
dtype
)
x
=
self
.
patch_embed
(
x
)
# compute position embedding
rotary_pos_emb_cos
,
rotary_pos_emb_sin
,
image_type_ids
=
self
.
rot_pos_emb
(
grid_thw
)
# compute cu_seqlens
cu_seqlens
=
torch
.
repeat_interleave
(
grid_thw
[:,
1
]
*
grid_thw
[:,
2
],
grid_thw
[:,
0
]
).
cumsum
(
dim
=
0
,
dtype
=
torch
.
int32
)
cu_seqlens
=
torch
.
cat
([
cu_seqlens
.
new_zeros
(
1
),
cu_seqlens
])
cu_seqlens
=
cu_seqlens
.
to
(
self
.
device
,
non_blocking
=
True
)
# pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
max_seqlen
=
self
.
compute_attn_mask_seqlen
(
cu_seqlens
)
# transformers
x
=
x
.
unsqueeze
(
1
)
for
blk
in
self
.
blocks
:
x
=
blk
(
x
,
cu_seqlens
=
cu_seqlens
,
rotary_pos_emb_cos
=
rotary_pos_emb_cos
,
rotary_pos_emb_sin
=
rotary_pos_emb_sin
,
max_seqlen
=
max_seqlen
,
)
# adapter
x
=
self
.
post_layernorm
(
x
)
x
=
x
.
view
(
-
1
,
self
.
spatial_merge_size
,
self
.
spatial_merge_size
,
x
.
shape
[
-
1
])
x
=
x
.
permute
(
0
,
3
,
1
,
2
)
x
=
self
.
downsample
(
x
).
view
(
-
1
,
self
.
out_hidden_size
)
x
=
self
.
merger
(
x
)
return
x
@
MULTIMODAL_REGISTRY
.
register_processor
(
Glm4vMultiModalProcessor
,
info
=
Glm4vProcessingInfo
,
dummy_inputs
=
Glm4vDummyInputsBuilder
,
)
class
GlmOcrForConditionalGeneration
(
Glm4vForConditionalGeneration
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
with
self
.
_mark_tower_model
(
vllm_config
,
{
"image"
,
"video"
}):
self
.
visual
=
GlmOcrVisionTransformer
(
config
.
vision_config
,
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-5
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
)
vllm/model_executor/models/glm_ocr_mtp.py
deleted
100644 → 0
View file @
d7db129a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2026 The ZhipuAI Team.
# Copyright 2026 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GLM-OCR MTP model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
import
torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
,
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
,
)
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.glm4
import
Glm4DecoderLayer
,
get_spec_layer_idx_from_weight_name
from
.glm4_moe_lite_mtp
import
(
Glm4MoeLiteMultiTokenPredictor
,
SharedHead
,
)
from
.interfaces
import
SupportsPP
from
.utils
import
(
is_pp_missing_parameter
,
maybe_prefix
,
)
class
GlmOcrMultiTokenPredictorLayer
(
nn
.
Module
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
nn
.
Module
.
__init__
(
self
)
config
=
vllm_config
.
speculative_config
.
draft_model_config
.
hf_config
.
text_config
self
.
config
=
config
quant_config
=
vllm_config
.
quant_config
self
.
enorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
hnorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
eh_proj
=
nn
.
Linear
(
config
.
hidden_size
*
2
,
config
.
hidden_size
,
bias
=
False
)
self
.
device
=
current_platform
.
device_type
self
.
shared_head
=
SharedHead
(
config
=
config
,
prefix
=
prefix
,
quant_config
=
quant_config
)
self
.
mtp_block
=
Glm4DecoderLayer
(
vllm_config
=
vllm_config
,
prefix
=
prefix
,
config
=
self
.
config
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
previous_hidden_states
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
spec_step_index
:
int
=
0
,
)
->
torch
.
Tensor
:
assert
inputs_embeds
is
not
None
# masking inputs at position 0, as not needed by MTP
inputs_embeds
[
positions
[
0
]
==
0
]
=
0
inputs_embeds
=
self
.
enorm
(
inputs_embeds
)
previous_hidden_states
=
self
.
hnorm
(
previous_hidden_states
)
hidden_states
=
self
.
eh_proj
(
torch
.
cat
([
inputs_embeds
,
previous_hidden_states
],
dim
=-
1
)
)
hidden_states
,
residual
=
self
.
mtp_block
(
positions
=
positions
,
hidden_states
=
hidden_states
,
residual
=
None
)
hidden_states
=
residual
+
hidden_states
return
hidden_states
class
GlmOcrMultiTokenPredictor
(
Glm4MoeLiteMultiTokenPredictor
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
nn
.
Module
.
__init__
(
self
)
config
=
vllm_config
.
model_config
.
hf_config
.
text_config
self
.
mtp_start_layer_idx
=
config
.
num_hidden_layers
self
.
num_mtp_layers
=
config
.
num_nextn_predict_layers
self
.
layers
=
torch
.
nn
.
ModuleDict
(
{
str
(
idx
):
GlmOcrMultiTokenPredictorLayer
(
vllm_config
=
vllm_config
,
prefix
=
f
"
{
prefix
}
.layers.
{
idx
}
"
,
)
for
idx
in
range
(
self
.
mtp_start_layer_idx
,
self
.
mtp_start_layer_idx
+
self
.
num_mtp_layers
,
)
}
)
self
.
embed_tokens
=
VocabParallelEmbedding
(
config
.
vocab_size
,
config
.
hidden_size
,
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
class
GlmOcrMTP
(
nn
.
Module
,
SupportsPP
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
self
.
config
=
vllm_config
.
model_config
.
hf_config
.
text_config
quant_config
=
vllm_config
.
quant_config
self
.
quant_config
=
quant_config
self
.
model
=
GlmOcrMultiTokenPredictor
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
expert_weights
=
[]
self
.
num_layers
=
self
.
config
.
num_nextn_predict_layers
for
layer
in
self
.
model
.
layers
.
values
():
assert
isinstance
(
layer
,
GlmOcrMultiTokenPredictorLayer
)
layer
=
layer
.
mtp_block
assert
isinstance
(
layer
,
Glm4DecoderLayer
)
def
embed_input_ids
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
embed_input_ids
(
input_ids
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
spec_step_idx
:
int
=
0
,
)
->
torch
.
Tensor
:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
hidden_states
,
inputs_embeds
,
spec_step_idx
)
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
spec_step_idx
:
int
=
0
,
)
->
torch
.
Tensor
|
None
:
return
self
.
model
.
compute_logits
(
hidden_states
,
spec_step_idx
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
(
".qkv_proj"
,
".q_proj"
,
"q"
),
(
".qkv_proj"
,
".k_proj"
,
"k"
),
(
".qkv_proj"
,
".v_proj"
,
"v"
),
(
".gate_up_proj"
,
".gate_proj"
,
0
),
(
".gate_up_proj"
,
".up_proj"
,
1
),
]
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
set
[
str
]
=
set
()
for
name
,
loaded_weight
in
weights
:
if
name
==
"lm_head.weight"
:
spec_layer
=
self
.
model
.
mtp_start_layer_idx
name
=
f
"model.layers.
{
spec_layer
}
.shared_head.head.weight"
elif
name
==
"model.embed_tokens.weight"
:
spec_layer
=
self
.
model
.
mtp_start_layer_idx
else
:
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
if
spec_layer
is
None
:
continue
name
=
self
.
_rewrite_spec_layer_name
(
spec_layer
,
name
)
if
self
.
quant_config
is
not
None
and
(
scale_name
:
=
self
.
quant_config
.
get_cache_scale
(
name
)
):
# Loading kv cache quantization scales
param
=
params_dict
[
scale_name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
loaded_weight
=
(
loaded_weight
if
loaded_weight
.
dim
()
==
0
else
loaded_weight
[
0
]
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
continue
if
"scale"
in
name
or
"zero_point"
in
name
:
# Remapping the name of FP8 kv-scale or zero point.
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
break
else
:
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
# Some checkpoints include weight scale tensors for the
# LM head even when the quantized head isn't built. Skip
# them if the model does not expose a matching parameter
# to avoid KeyError during load.
if
name
.
endswith
(
".weight_scale"
)
and
name
not
in
params_dict
:
continue
# According to DeepSeek-V3 Technical Report, MTP modules
# shares embedding layer. We only load the first weights.
if
(
spec_layer
!=
self
.
model
.
mtp_start_layer_idx
and
".layers"
not
in
name
):
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
def
_rewrite_spec_layer_name
(
self
,
spec_layer
:
int
,
name
:
str
)
->
str
:
"""
Rewrite the weight name to match the format of the original model.
Add .mtp_block for modules in transformer layer block for spec layer
and rename shared layer weights to be top level.
"""
name
=
name
.
replace
(
"model.language_model.layers"
,
"model.layers"
)
spec_layer_weight_names
=
[
"embed_tokens"
,
"enorm"
,
"hnorm"
,
"eh_proj"
,
"shared_head"
,
]
shared_weight_names
=
[
"embed_tokens"
]
spec_layer_weight
=
False
shared_weight
=
False
for
weight_name
in
spec_layer_weight_names
:
if
weight_name
in
name
:
spec_layer_weight
=
True
if
weight_name
in
shared_weight_names
:
shared_weight
=
True
break
if
not
spec_layer_weight
:
# treat rest weights as weights for transformer layer block
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
f
"model.layers.
{
spec_layer
}
.mtp_block."
)
elif
shared_weight
:
# treat shared weights as top level weights
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
"model."
)
return
name
vllm/model_executor/models/glmasr.py
View file @
df704163
...
...
@@ -1075,7 +1075,7 @@ class GlmAsrForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1159,4 +1159,4 @@ class GlmAsrForConditionalGeneration(
"prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
{
"audio"
:
audio
},
}
return
cast
(
PromptType
,
prompt_dict
)
return
cast
(
PromptType
,
prompt_dict
)
\ No newline at end of file
vllm/model_executor/models/gpt2.py
View file @
df704163
...
...
@@ -218,7 +218,7 @@ class GPT2Model(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
,
...
...
@@ -298,7 +298,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -362,7 +362,7 @@ class GPT2ForSequenceClassification(nn.Module, SupportsCrossEncoding):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -382,4 +382,4 @@ def _add_transformer_prefix(
for
name
,
tensor
in
weights
:
if
not
name
.
startswith
(
"transformer."
)
and
not
name
.
startswith
(
"lm_head"
):
name
=
"transformer."
+
name
yield
name
,
tensor
yield
name
,
tensor
\ No newline at end of file
vllm/model_executor/models/gpt_bigcode.py
View file @
df704163
...
...
@@ -235,7 +235,7 @@ class GPTBigCodeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -311,7 +311,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -336,4 +336,4 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
skip_prefixes
,
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gpt_j.py
View file @
df704163
...
...
@@ -220,7 +220,7 @@ class GPTJModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -324,7 +324,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -343,4 +343,4 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gpt_neox.py
View file @
df704163
...
...
@@ -237,7 +237,7 @@ class GPTNeoXModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -294,7 +294,6 @@ class GPTNeoXModel(nn.Module):
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
...
...
@@ -326,7 +325,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/gpt_oss.py
View file @
df704163
...
...
@@ -33,7 +33,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.utils
import
sequence_parallel_chunk
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.math_utils
import
cdiv
...
...
@@ -276,7 +275,7 @@ class GptOssModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -715,7 +714,7 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -743,4 +742,4 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/models/granite.py
View file @
df704163
...
...
@@ -437,7 +437,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -472,4 +472,4 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
skip_prefixes
,
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/granite_speech.py
View file @
df704163
...
...
@@ -806,7 +806,7 @@ class GraniteSpeechForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -915,4 +915,4 @@ class GraniteSpeechForConditionalGeneration(
# Default settings are reasonable for this model and we don't currently
# expose this information in the model configs, but this may change in
# the future
return
SpeechToTextConfig
()
return
SpeechToTextConfig
()
\ No newline at end of file
Prev
1
2
3
4
5
6
7
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment