Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eefa41c1
Commit
eefa41c1
authored
Mar 24, 2026
by
zhuwenwen
Browse files
sync v0.18.0
parent
82155c76
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
335 additions
and
50 deletions
+335
-50
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe.py
+3
-3
vllm/model_executor/models/glm4_moe_lite.py
vllm/model_executor/models/glm4_moe_lite.py
+3
-3
vllm/model_executor/models/glm4_moe_lite_mtp.py
vllm/model_executor/models/glm4_moe_lite_mtp.py
+2
-2
vllm/model_executor/models/glm4_moe_mtp.py
vllm/model_executor/models/glm4_moe_mtp.py
+2
-2
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+2
-2
vllm/model_executor/models/glm_ocr_mtp.py
vllm/model_executor/models/glm_ocr_mtp.py
+285
-0
vllm/model_executor/models/glmasr.py
vllm/model_executor/models/glmasr.py
+2
-2
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt2.py
+4
-4
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_bigcode.py
+3
-3
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_j.py
+3
-3
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/gpt_neox.py
+3
-3
vllm/model_executor/models/gpt_oss.py
vllm/model_executor/models/gpt_oss.py
+3
-3
vllm/model_executor/models/granite.py
vllm/model_executor/models/granite.py
+2
-2
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+2
-2
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/granitemoe.py
+3
-3
vllm/model_executor/models/granitemoehybrid.py
vllm/model_executor/models/granitemoehybrid.py
+3
-3
vllm/model_executor/models/granitemoeshared.py
vllm/model_executor/models/granitemoeshared.py
+3
-3
vllm/model_executor/models/grok1.py
vllm/model_executor/models/grok1.py
+3
-3
vllm/model_executor/models/hunyuan_v1.py
vllm/model_executor/models/hunyuan_v1.py
+2
-2
vllm/model_executor/models/hunyuan_vision.py
vllm/model_executor/models/hunyuan_vision.py
+2
-2
No files found.
vllm/model_executor/models/glm4_moe.py
View file @
eefa41c1
...
...
@@ -451,7 +451,7 @@ class Glm4MoeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -687,7 +687,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExper
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -722,4 +722,4 @@ def get_spec_layer_idx_from_weight_name(
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
vllm/model_executor/models/glm4_moe_lite.py
View file @
eefa41c1
...
...
@@ -264,7 +264,7 @@ class Glm4MoeLiteModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -596,7 +596,7 @@ class Glm4MoeLiteForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -640,4 +640,4 @@ def get_spec_layer_idx_from_weight_name(
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
f
"layers.
{
layer_idx
+
i
}
."
in
weight_name
:
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
vllm/model_executor/models/glm4_moe_lite_mtp.py
View file @
eefa41c1
...
...
@@ -230,7 +230,7 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
@@ -461,4 +461,4 @@ class Glm4MoeLiteMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts):
elif
shared_weight
:
# treat shared weights as top level weights
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
"model."
)
return
name
return
name
\ No newline at end of file
vllm/model_executor/models/glm4_moe_mtp.py
View file @
eefa41c1
...
...
@@ -216,7 +216,7 @@ class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
...
...
@@ -363,4 +363,4 @@ class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts):
elif
shared_weight
:
# treat shared weights as top level weights
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
"model."
)
return
name
return
name
\ No newline at end of file
vllm/model_executor/models/glm4v.py
View file @
eefa41c1
...
...
@@ -625,7 +625,7 @@ class GLM4VForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -638,4 +638,4 @@ class GLM4VForCausalLM(
input_ids
,
positions
,
intermediate_tensors
,
inputs_embeds
)
return
hidden_states
return
hidden_states
\ No newline at end of file
vllm/model_executor/models/glm_ocr_mtp.py
0 → 100644
View file @
eefa41c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2026 The ZhipuAI Team.
# Copyright 2026 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GLM-OCR MTP model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
import
torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
,
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
,
)
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.glm4
import
Glm4DecoderLayer
,
get_spec_layer_idx_from_weight_name
from
.glm4_moe_lite_mtp
import
(
Glm4MoeLiteMultiTokenPredictor
,
SharedHead
,
)
from
.interfaces
import
SupportsPP
from
.utils
import
(
is_pp_missing_parameter
,
maybe_prefix
,
)
class
GlmOcrMultiTokenPredictorLayer
(
nn
.
Module
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
nn
.
Module
.
__init__
(
self
)
config
=
vllm_config
.
speculative_config
.
draft_model_config
.
hf_config
.
text_config
self
.
config
=
config
quant_config
=
vllm_config
.
quant_config
self
.
enorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
hnorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
eh_proj
=
nn
.
Linear
(
config
.
hidden_size
*
2
,
config
.
hidden_size
,
bias
=
False
)
self
.
device
=
current_platform
.
device_type
self
.
shared_head
=
SharedHead
(
config
=
config
,
prefix
=
prefix
,
quant_config
=
quant_config
)
self
.
mtp_block
=
Glm4DecoderLayer
(
vllm_config
=
vllm_config
,
prefix
=
prefix
,
config
=
self
.
config
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
previous_hidden_states
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
spec_step_index
:
int
=
0
,
)
->
torch
.
Tensor
:
assert
inputs_embeds
is
not
None
# masking inputs at position 0, as not needed by MTP
inputs_embeds
[
positions
[
0
]
==
0
]
=
0
inputs_embeds
=
self
.
enorm
(
inputs_embeds
)
previous_hidden_states
=
self
.
hnorm
(
previous_hidden_states
)
hidden_states
=
self
.
eh_proj
(
torch
.
cat
([
inputs_embeds
,
previous_hidden_states
],
dim
=-
1
)
)
hidden_states
,
residual
=
self
.
mtp_block
(
positions
=
positions
,
hidden_states
=
hidden_states
,
residual
=
None
)
hidden_states
=
residual
+
hidden_states
return
hidden_states
class
GlmOcrMultiTokenPredictor
(
Glm4MoeLiteMultiTokenPredictor
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
nn
.
Module
.
__init__
(
self
)
config
=
vllm_config
.
model_config
.
hf_config
.
text_config
self
.
mtp_start_layer_idx
=
config
.
num_hidden_layers
self
.
num_mtp_layers
=
config
.
num_nextn_predict_layers
self
.
layers
=
torch
.
nn
.
ModuleDict
(
{
str
(
idx
):
GlmOcrMultiTokenPredictorLayer
(
vllm_config
=
vllm_config
,
prefix
=
f
"
{
prefix
}
.layers.
{
idx
}
"
,
)
for
idx
in
range
(
self
.
mtp_start_layer_idx
,
self
.
mtp_start_layer_idx
+
self
.
num_mtp_layers
,
)
}
)
self
.
embed_tokens
=
VocabParallelEmbedding
(
config
.
vocab_size
,
config
.
hidden_size
,
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
class
GlmOcrMTP
(
nn
.
Module
,
SupportsPP
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
self
.
config
=
vllm_config
.
model_config
.
hf_config
.
text_config
quant_config
=
vllm_config
.
quant_config
self
.
quant_config
=
quant_config
self
.
model
=
GlmOcrMultiTokenPredictor
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
self
.
expert_weights
=
[]
self
.
num_layers
=
self
.
config
.
num_nextn_predict_layers
for
layer
in
self
.
model
.
layers
.
values
():
assert
isinstance
(
layer
,
GlmOcrMultiTokenPredictorLayer
)
layer
=
layer
.
mtp_block
assert
isinstance
(
layer
,
Glm4DecoderLayer
)
def
embed_input_ids
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
embed_input_ids
(
input_ids
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
spec_step_idx
:
int
=
0
,
)
->
torch
.
Tensor
:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
hidden_states
,
inputs_embeds
,
spec_step_idx
)
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
spec_step_idx
:
int
=
0
,
)
->
torch
.
Tensor
|
None
:
return
self
.
model
.
compute_logits
(
hidden_states
,
spec_step_idx
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
(
".qkv_proj"
,
".q_proj"
,
"q"
),
(
".qkv_proj"
,
".k_proj"
,
"k"
),
(
".qkv_proj"
,
".v_proj"
,
"v"
),
(
".gate_up_proj"
,
".gate_proj"
,
0
),
(
".gate_up_proj"
,
".up_proj"
,
1
),
]
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
set
[
str
]
=
set
()
for
name
,
loaded_weight
in
weights
:
if
name
==
"lm_head.weight"
:
spec_layer
=
self
.
model
.
mtp_start_layer_idx
name
=
f
"model.layers.
{
spec_layer
}
.shared_head.head.weight"
elif
name
==
"model.embed_tokens.weight"
:
spec_layer
=
self
.
model
.
mtp_start_layer_idx
else
:
spec_layer
=
get_spec_layer_idx_from_weight_name
(
self
.
config
,
name
)
if
spec_layer
is
None
:
continue
name
=
self
.
_rewrite_spec_layer_name
(
spec_layer
,
name
)
if
self
.
quant_config
is
not
None
and
(
scale_name
:
=
self
.
quant_config
.
get_cache_scale
(
name
)
):
# Loading kv cache quantization scales
param
=
params_dict
[
scale_name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
loaded_weight
=
(
loaded_weight
if
loaded_weight
.
dim
()
==
0
else
loaded_weight
[
0
]
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
continue
if
"scale"
in
name
or
"zero_point"
in
name
:
# Remapping the name of FP8 kv-scale or zero point.
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
for
param_name
,
weight_name
,
shard_id
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
break
else
:
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
# Some checkpoints include weight scale tensors for the
# LM head even when the quantized head isn't built. Skip
# them if the model does not expose a matching parameter
# to avoid KeyError during load.
if
name
.
endswith
(
".weight_scale"
)
and
name
not
in
params_dict
:
continue
# According to DeepSeek-V3 Technical Report, MTP modules
# shares embedding layer. We only load the first weights.
if
(
spec_layer
!=
self
.
model
.
mtp_start_layer_idx
and
".layers"
not
in
name
):
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
return
loaded_params
def
_rewrite_spec_layer_name
(
self
,
spec_layer
:
int
,
name
:
str
)
->
str
:
"""
Rewrite the weight name to match the format of the original model.
Add .mtp_block for modules in transformer layer block for spec layer
and rename shared layer weights to be top level.
"""
name
=
name
.
replace
(
"model.language_model.layers"
,
"model.layers"
)
spec_layer_weight_names
=
[
"embed_tokens"
,
"enorm"
,
"hnorm"
,
"eh_proj"
,
"shared_head"
,
]
shared_weight_names
=
[
"embed_tokens"
]
spec_layer_weight
=
False
shared_weight
=
False
for
weight_name
in
spec_layer_weight_names
:
if
weight_name
in
name
:
spec_layer_weight
=
True
if
weight_name
in
shared_weight_names
:
shared_weight
=
True
break
if
not
spec_layer_weight
:
# treat rest weights as weights for transformer layer block
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
f
"model.layers.
{
spec_layer
}
.mtp_block."
)
elif
shared_weight
:
# treat shared weights as top level weights
name
=
name
.
replace
(
f
"model.layers.
{
spec_layer
}
."
,
"model."
)
return
name
\ No newline at end of file
vllm/model_executor/models/glmasr.py
View file @
eefa41c1
...
...
@@ -1081,7 +1081,7 @@ class GlmAsrForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1165,4 +1165,4 @@ class GlmAsrForConditionalGeneration(
return
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
{
"audio"
:
audio
},
)
)
\ No newline at end of file
vllm/model_executor/models/gpt2.py
View file @
eefa41c1
...
...
@@ -218,7 +218,7 @@ class GPT2Model(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
,
...
...
@@ -298,7 +298,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -362,7 +362,7 @@ class GPT2ForSequenceClassification(nn.Module, SupportsCrossEncoding):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -382,4 +382,4 @@ def _add_transformer_prefix(
for
name
,
tensor
in
weights
:
if
not
name
.
startswith
(
"transformer."
)
and
not
name
.
startswith
(
"lm_head"
):
name
=
"transformer."
+
name
yield
name
,
tensor
yield
name
,
tensor
\ No newline at end of file
vllm/model_executor/models/gpt_bigcode.py
View file @
eefa41c1
...
...
@@ -235,7 +235,7 @@ class GPTBigCodeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -311,7 +311,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -336,4 +336,4 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
skip_prefixes
,
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gpt_j.py
View file @
eefa41c1
...
...
@@ -220,7 +220,7 @@ class GPTJModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -324,7 +324,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -343,4 +343,4 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gpt_neox.py
View file @
eefa41c1
...
...
@@ -230,7 +230,7 @@ class GPTNeoXModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
position_ids
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -318,7 +318,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -337,4 +337,4 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/gpt_oss.py
View file @
eefa41c1
...
...
@@ -297,7 +297,7 @@ class GptOssModel(nn.Module, EagleModelMixin):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1210,7 +1210,7 @@ class GptOssForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1226,4 +1226,4 @@ class GptOssForCausalLM(
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/models/granite.py
View file @
eefa41c1
...
...
@@ -437,7 +437,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -472,4 +472,4 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
skip_prefixes
,
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/granite_speech.py
View file @
eefa41c1
...
...
@@ -812,7 +812,7 @@ class GraniteSpeechForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -921,4 +921,4 @@ class GraniteSpeechForConditionalGeneration(
# Default settings are reasonable for this model and we don't currently
# expose this information in the model configs, but this may change in
# the future
return
SpeechToTextConfig
()
return
SpeechToTextConfig
()
\ No newline at end of file
vllm/model_executor/models/granitemoe.py
View file @
eefa41c1
...
...
@@ -312,7 +312,7 @@ class GraniteMoeModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -528,7 +528,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -558,4 +558,4 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/granitemoehybrid.py
View file @
eefa41c1
...
...
@@ -368,7 +368,7 @@ class GraniteMoeHybridModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -685,7 +685,7 @@ class GraniteMoeHybridForCausalLM(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -706,4 +706,4 @@ class GraniteMoeHybridForCausalLM(
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/granitemoeshared.py
View file @
eefa41c1
...
...
@@ -182,7 +182,7 @@ class GraniteMoeSharedModel(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -294,7 +294,7 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -324,4 +324,4 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
,
skip_prefixes
=
([
"lm_head."
]
if
self
.
config
.
tie_word_embeddings
else
None
),
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/grok1.py
View file @
eefa41c1
...
...
@@ -490,7 +490,7 @@ class Grok1Model(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -704,7 +704,7 @@ class GrokBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -799,4 +799,4 @@ class GrokForCausalLM(GrokBaseForCausalLM):
cls
.
packed_modules_mapping
=
dict
(
cls
.
packed_modules_mapping
)
cls
.
packed_modules_mapping
.
update
(
instance_cls
.
packed_modules_mapping
)
return
instance_cls
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
return
instance_cls
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
\ No newline at end of file
vllm/model_executor/models/hunyuan_v1.py
View file @
eefa41c1
...
...
@@ -954,7 +954,7 @@ class HunyuanV1ModelBase(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1055,4 +1055,4 @@ class HunYuanDenseV1ForCausalLM(HunYuanDenseV1Base):
class
HunYuanMoEV1ForCausalLM
(
HunYuanMoEV1Base
):
pass
pass
\ No newline at end of file
vllm/model_executor/models/hunyuan_vision.py
View file @
eefa41c1
...
...
@@ -992,7 +992,7 @@ class HunYuanVLForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
,
...
...
@@ -1030,4 +1030,4 @@ class HunYuanVLForConditionalGeneration(
language_model
=
"language_model.model"
,
connector
=
"visual.perceive"
,
tower_model
=
"visual"
,
)
)
\ No newline at end of file
Prev
1
…
3
4
5
6
7
8
9
10
11
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment