Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb5624a
Commit
dcb5624a
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-dev
parents
55880ca2
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
53 additions
and
263 deletions
+53
-263
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+25
-28
vllm/model_executor/models/bloom.py
vllm/model_executor/models/bloom.py
+0
-11
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+0
-10
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+0
-11
vllm/model_executor/models/commandr.py
vllm/model_executor/models/commandr.py
+8
-11
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+0
-10
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+0
-10
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+1
-11
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+0
-10
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+0
-16
vllm/model_executor/models/eagle.py
vllm/model_executor/models/eagle.py
+0
-13
vllm/model_executor/models/exaone.py
vllm/model_executor/models/exaone.py
+0
-11
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+0
-10
vllm/model_executor/models/florence2.py
vllm/model_executor/models/florence2.py
+0
-21
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+0
-13
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+0
-10
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+2
-12
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3.py
+3
-11
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+12
-22
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+2
-12
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
vllm/model_executor/models/blip2.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
typing
import
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
import
torch
...
...
@@ -12,7 +11,6 @@ from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig,
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -62,6 +60,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
quant_config
:
Optional
[
QuantizationConfig
],
cache_config
:
Optional
[
CacheConfig
],
is_cross_attention
:
bool
=
False
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
...
...
@@ -141,7 +140,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
class
Blip2QFormerSelfOutput
(
nn
.
Module
):
def
__init__
(
self
,
config
:
Blip2QFormerConfig
)
->
None
:
def
__init__
(
self
,
config
:
Blip2QFormerConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
self
.
dense
=
nn
.
Linear
(
config
.
hidden_size
,
config
.
hidden_size
)
...
...
@@ -169,6 +168,7 @@ class Blip2QFormerAttention(nn.Module):
quant_config
:
Optional
[
QuantizationConfig
],
cache_config
:
Optional
[
CacheConfig
],
is_cross_attention
:
bool
=
False
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
...
...
@@ -177,9 +177,10 @@ class Blip2QFormerAttention(nn.Module):
quant_config
=
quant_config
,
cache_config
=
cache_config
,
is_cross_attention
=
is_cross_attention
,
prefix
=
f
"
{
prefix
}
.attention"
,
)
self
.
output
=
Blip2QFormerSelfOutput
(
config
)
self
.
output
=
Blip2QFormerSelfOutput
(
config
,
prefix
=
f
"
{
prefix
}
.output"
)
def
forward
(
self
,
...
...
@@ -197,7 +198,7 @@ class Blip2QFormerAttention(nn.Module):
class
Blip2QFormerIntermediate
(
nn
.
Module
):
def
__init__
(
self
,
config
:
Blip2QFormerConfig
)
->
None
:
def
__init__
(
self
,
config
:
Blip2QFormerConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
self
.
dense
=
nn
.
Linear
(
config
.
hidden_size
,
config
.
intermediate_size
)
...
...
@@ -211,7 +212,7 @@ class Blip2QFormerIntermediate(nn.Module):
class
Blip2QFormerOutput
(
nn
.
Module
):
def
__init__
(
self
,
config
:
Blip2QFormerConfig
)
->
None
:
def
__init__
(
self
,
config
:
Blip2QFormerConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
self
.
dense
=
nn
.
Linear
(
config
.
intermediate_size
,
config
.
hidden_size
)
...
...
@@ -239,6 +240,7 @@ class Blip2QFormerLayer(nn.Module):
quant_config
:
Optional
[
QuantizationConfig
],
cache_config
:
Optional
[
CacheConfig
],
layer_idx
:
int
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
...
...
@@ -246,7 +248,8 @@ class Blip2QFormerLayer(nn.Module):
self
.
seq_len_dim
=
1
self
.
attention
=
Blip2QFormerAttention
(
config
,
quant_config
=
quant_config
,
cache_config
=
cache_config
)
cache_config
=
cache_config
,
prefix
=
f
"
{
prefix
}
.attention"
)
self
.
layer_idx
=
layer_idx
...
...
@@ -255,13 +258,16 @@ class Blip2QFormerLayer(nn.Module):
config
,
quant_config
=
quant_config
,
cache_config
=
cache_config
,
is_cross_attention
=
True
)
is_cross_attention
=
True
,
prefix
=
f
"
{
prefix
}
.crossattention"
)
self
.
has_cross_attention
=
True
else
:
self
.
has_cross_attention
=
False
self
.
intermediate_query
=
Blip2QFormerIntermediate
(
config
)
self
.
output_query
=
Blip2QFormerOutput
(
config
)
self
.
intermediate_query
=
Blip2QFormerIntermediate
(
config
,
prefix
=
f
"
{
prefix
}
.intermediate_query"
)
self
.
output_query
=
Blip2QFormerOutput
(
config
,
prefix
=
f
"
{
prefix
}
.output_query"
)
def
forward
(
self
,
...
...
@@ -327,6 +333,7 @@ class Blip2QFormerEncoder(nn.Module):
*
,
quant_config
:
Optional
[
QuantizationConfig
],
cache_config
:
Optional
[
CacheConfig
],
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
...
...
@@ -336,7 +343,8 @@ class Blip2QFormerEncoder(nn.Module):
Blip2QFormerLayer
(
config
,
quant_config
=
quant_config
,
cache_config
=
cache_config
,
layer_idx
=
layer_idx
)
layer_idx
=
layer_idx
,
prefix
=
f
"
{
prefix
}
.layer.
{
layer_idx
}
"
)
for
layer_idx
in
range
(
config
.
num_hidden_layers
)
])
...
...
@@ -367,6 +375,7 @@ class Blip2QFormerModel(nn.Module):
*
,
quant_config
:
Optional
[
QuantizationConfig
],
cache_config
:
Optional
[
CacheConfig
],
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
...
...
@@ -378,7 +387,8 @@ class Blip2QFormerModel(nn.Module):
self
.
encoder
=
Blip2QFormerEncoder
(
config
,
quant_config
=
quant_config
,
cache_config
=
cache_config
)
cache_config
=
cache_config
,
prefix
=
f
"
{
prefix
}
.encoder"
)
def
forward
(
self
,
...
...
@@ -513,7 +523,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
self
.
qformer
=
Blip2QFormerModel
(
config
.
qformer_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.qformer"
)
self
.
language_projection
=
nn
.
Linear
(
config
.
qformer_config
.
hidden_size
,
...
...
@@ -530,13 +541,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
self
.
make_empty_intermediate_tensors
=
(
self
.
language_model
.
make_empty_intermediate_tensors
)
@
cached_property
def
sampler
(
self
):
if
hasattr
(
self
.
language_model
,
"sampler"
):
return
self
.
language_model
.
sampler
return
get_sampler
()
def
_validate_pixel_values
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
h
=
w
=
self
.
config
.
vision_config
.
image_size
expected_dims
=
(
3
,
h
,
w
)
...
...
@@ -649,7 +653,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]
:
)
->
IntermediateTensors
:
"""Run forward pass for BLIP-2.
One key thing to understand is the `input_ids` already accounts for the
...
...
@@ -707,13 +711,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
return
self
.
language_model
.
sample
(
logits
,
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
...
...
vllm/model_executor/models/bloom.py
View file @
dcb5624a
...
...
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -308,8 +307,6 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
self
.
config
.
hidden_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
@@ -345,14 +342,6 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
params_dict
=
dict
(
self
.
named_parameters
(
remove_duplicate
=
False
))
...
...
vllm/model_executor/models/chameleon.py
View file @
dcb5624a
...
...
@@ -22,7 +22,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -950,7 +949,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -1054,14 +1052,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
stacked_params_mapping
=
[
...
...
vllm/model_executor/models/chatglm.py
View file @
dcb5624a
...
...
@@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -494,8 +493,6 @@ class ChatGLMBaseModel(nn.Module):
self
.
transformer
.
embedding
.
weight
)
self
.
lm_head
=
self
.
transformer
.
output_layer
self
.
logits_processor
=
LogitsProcessor
(
config
.
padded_vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
@@ -508,14 +505,6 @@ class ChatGLMBaseModel(nn.Module):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
...
...
vllm/model_executor/models/commandr.py
View file @
dcb5624a
...
...
@@ -38,7 +38,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -89,6 +88,7 @@ class CohereMLP(nn.Module):
self
,
config
:
CohereConfig
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
):
super
().
__init__
()
self
.
config
=
config
...
...
@@ -99,12 +99,14 @@ class CohereMLP(nn.Module):
[
self
.
intermediate_size
]
*
2
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.gate_up_proj"
,
)
self
.
down_proj
=
RowParallelLinear
(
self
.
intermediate_size
,
self
.
hidden_size
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.down_proj"
,
)
self
.
act_fn
=
SiluAndMul
()
...
...
@@ -158,12 +160,14 @@ class CohereAttention(nn.Module):
self
.
total_num_kv_heads
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.qkv_proj"
,
)
self
.
o_proj
=
RowParallelLinear
(
self
.
total_num_heads
*
self
.
head_dim
,
self
.
hidden_size
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.o_proj"
,
)
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
...
...
@@ -244,7 +248,9 @@ class CohereDecoderLayer(nn.Module):
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.self_attn"
)
self
.
mlp
=
CohereMLP
(
config
,
quant_config
=
quant_config
)
self
.
mlp
=
CohereMLP
(
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
)
self
.
input_layernorm
=
LayerNorm
(
param_shape
=
(
config
.
hidden_size
),
eps
=
config
.
layer_norm_eps
)
...
...
@@ -365,7 +371,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
scale
=
config
.
logit_scale
)
self
.
model
=
CohereModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -399,14 +404,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
stacked_params_mapping
=
[
...
...
vllm/model_executor/models/dbrx.py
View file @
dcb5624a
...
...
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -390,7 +389,6 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
@@ -417,14 +415,6 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
Optional
[
torch
.
Tensor
],
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
expert_params_mapping
=
[(
...
...
vllm/model_executor/models/deepseek.py
View file @
dcb5624a
...
...
@@ -43,7 +43,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -453,7 +452,6 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -480,14 +478,6 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
Optional
[
torch
.
Tensor
],
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
...
...
vllm/model_executor/models/deepseek_mtp.py
View file @
dcb5624a
...
...
@@ -13,7 +13,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -165,10 +164,9 @@ class DeepSeekMTP(nn.Module):
self
.
model
=
DeepSeekMultiTokenPredictor
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
sampler
=
get_sampler
()
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
...
...
@@ -192,14 +190,6 @@ class DeepSeekMTP(nn.Module):
return
self
.
model
.
compute_logits
(
hidden_states
,
sampling_metadata
,
spec_step_idx
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
stacked_params_mapping
=
[
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
dcb5624a
...
...
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -712,7 +711,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
...
...
@@ -741,14 +739,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
Optional
[
torch
.
Tensor
],
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
make_empty_intermediate_tensors
(
self
,
batch_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
torch
.
device
)
->
IntermediateTensors
:
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
dcb5624a
...
...
@@ -4,7 +4,6 @@
"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
typing
import
List
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
import
torch
...
...
@@ -16,7 +15,6 @@ from transformers import BatchFeature
from
vllm.config
import
VllmConfig
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -393,13 +391,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
model
=
model
.
to
(
dtype
=
torch
.
get_default_dtype
())
return
model
@
cached_property
def
sampler
(
self
):
if
hasattr
(
self
.
language_model
,
"sampler"
):
return
self
.
language_model
.
sampler
return
get_sampler
()
def
_validate_pixel_values
(
self
,
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
)
->
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]:
...
...
@@ -647,13 +638,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
return
self
.
language_model
.
sample
(
logits
,
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
...
...
vllm/model_executor/models/eagle.py
View file @
dcb5624a
...
...
@@ -9,7 +9,6 @@ from vllm.config import VllmConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -131,10 +130,6 @@ class EAGLE(nn.Module):
# checkpoint file has token_map tensor.
self
.
token_map
=
None
@
property
def
sampler
(
self
):
return
self
.
model
.
sampler
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
model
.
get_input_embeddings
(
input_ids
)
...
...
@@ -188,14 +183,6 @@ class EAGLE(nn.Module):
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
# This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a
...
...
vllm/model_executor/models/exaone.py
View file @
dcb5624a
...
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -510,8 +509,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
@@ -538,14 +535,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/falcon.py
View file @
dcb5624a
...
...
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -527,7 +526,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):
quant_config
=
quant_config
,
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
@@ -554,14 +552,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/florence2.py
View file @
dcb5624a
...
...
@@ -3,7 +3,6 @@
import
math
from
collections
import
OrderedDict
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
typing
import
List
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
import
torch
...
...
@@ -14,7 +13,6 @@ from transformers import BartTokenizer, BatchFeature, PretrainedConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.bart
import
(
BartDecoder
,
BartEncoder
,
BartParallelLMHead
,
...
...
@@ -673,7 +671,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
self
.
logits_processor
=
LogitsProcessor
(
self
.
vocab_size
,
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
def
forward
(
self
,
...
...
@@ -716,11 +713,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
)
->
SamplerOutput
:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
stacked_params_mapping
=
[
...
...
@@ -929,12 +921,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
raise
NotImplementedError
(
'Florence2 only supports COSINE as temporal embedding.'
)
@
cached_property
def
sampler
(
self
):
if
hasattr
(
self
.
language_model
,
"sampler"
):
return
self
.
language_model
.
sampler
return
get_sampler
()
def
_validate_pixel_values
(
self
,
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
)
->
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]:
...
...
@@ -1110,13 +1096,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
SamplerOutput
:
return
self
.
language_model
.
sample
(
logits
,
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
...
...
vllm/model_executor/models/fuyu.py
View file @
dcb5624a
...
...
@@ -27,7 +27,6 @@ from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
...
@@ -270,10 +269,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
make_empty_intermediate_tensors
=
(
self
.
language_model
.
make_empty_intermediate_tensors
)
@
property
def
sampler
(
self
):
return
self
.
language_model
.
sampler
def
_validate_pixel_values
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
h
=
w
=
self
.
config
.
patch_size
...
...
@@ -387,14 +382,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
language_model
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
language_model
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
...
...
vllm/model_executor/models/gemma.py
View file @
dcb5624a
...
...
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -388,7 +387,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
model
=
GemmaModel
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -415,14 +413,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/gemma2.py
View file @
dcb5624a
...
...
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -146,8 +145,8 @@ class Gemma2Attention(nn.Module):
# reference:
# https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa
layer_idx
=
extract_layer_index
(
prefix
)
use_sliding_window
=
(
layer_idx
%
2
==
0
and
config
.
interleaved_sliding_window
is
not
None
)
use_sliding_window
=
(
layer_idx
%
2
==
0
and
getattr
(
config
,
"
interleaved_sliding_window
"
,
None
)
is
not
None
)
sliding_window
=
config
.
interleaved_sliding_window
if
\
use_sliding_window
else
None
self
.
attn
=
Attention
(
self
.
num_heads
,
...
...
@@ -388,7 +387,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
soft_cap
=
config
.
final_logit_softcapping
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -415,14 +413,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/gemma3.py
View file @
dcb5624a
...
...
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -147,7 +146,9 @@ class Gemma3Attention(nn.Module):
# TODO(woosuk): Add reference to the original HF implementation.
layer_idx
=
extract_layer_index
(
prefix
)
self
.
is_sliding
=
bool
((
layer_idx
+
1
)
%
config
.
sliding_window_pattern
)
self
.
is_sliding
=
(
getattr
(
config
,
"interleaved_sliding_window"
,
None
)
is
not
None
and
bool
(
(
layer_idx
+
1
)
%
config
.
sliding_window_pattern
))
# Initialize the rotary embedding.
if
self
.
is_sliding
:
# Local attention. Override the values in config.json.
...
...
@@ -493,7 +494,6 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
prefix
=
maybe_prefix
(
prefix
,
"model"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
soft_cap
=
config
.
final_logit_softcapping
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -521,14 +521,6 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
dcb5624a
# SPDX-License-Identifier: Apache-2.0
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Any
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
from
typing
import
Any
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
import
torch
from
torch
import
nn
...
...
@@ -12,7 +12,6 @@ import vllm.envs as envs
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.layernorm
import
GemmaRMSNorm
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
...
@@ -479,7 +478,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
multimodal_config
=
multimodal_config
self
.
sliding_window
=
config
.
text_config
.
interleaved_sliding_window
self
.
sliding_window
=
getattr
(
config
.
text_config
,
"interleaved_sliding_window"
,
None
)
self
.
vision_tower
=
SiglipVisionModel
(
config
.
vision_config
,
quant_config
,
...
...
@@ -503,10 +503,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
def
dtype
(
self
):
return
next
(
self
.
parameters
()).
dtype
@
property
def
sampler
(
self
):
return
self
.
language_model
.
sampler
def
_validate_pixel_values
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
h
=
w
=
self
.
config
.
vision_config
.
image_size
expected_dims
=
(
3
,
h
,
w
)
...
...
@@ -607,7 +603,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]
:
**
kwargs
:
object
)
->
IntermediateTensors
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
...
...
@@ -685,13 +681,14 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
global_attn_mask
=
torch
.
where
(
img_mask
==
2
,
0
,
global_attn_mask
)
global_attn_masks
.
append
(
global_attn_mask
)
# Create a local causal mask with sliding window (1024).
local_attn_mask
=
torch
.
ones_like
(
global_attn_mask
)
local_attn_mask
=
torch
.
tril
(
local_attn_mask
,
diagonal
=-
self
.
sliding_window
)
local_attn_mask
=
torch
.
where
(
local_attn_mask
==
0
,
global_attn_mask
,
float
(
"-inf"
))
local_attn_masks
.
append
(
local_attn_mask
)
if
self
.
sliding_window
is
not
None
:
# Create a local causal mask with sliding window (1024).
local_attn_mask
=
torch
.
ones_like
(
global_attn_mask
)
local_attn_mask
=
torch
.
tril
(
local_attn_mask
,
diagonal
=-
self
.
sliding_window
)
local_attn_mask
=
torch
.
where
(
local_attn_mask
==
0
,
global_attn_mask
,
float
(
"-inf"
))
local_attn_masks
.
append
(
local_attn_mask
)
kwargs
[
"global_attn_masks"
]
=
global_attn_masks
kwargs
[
"local_attn_masks"
]
=
local_attn_masks
return
kwargs
...
...
@@ -704,13 +701,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
return
self
.
language_model
.
sample
(
logits
,
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
...
...
vllm/model_executor/models/glm4.py
View file @
dcb5624a
...
...
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -200,8 +199,8 @@ class Glm4DecoderLayer(nn.Module):
hidden_states
=
self
.
post_self_attn_layernorm
(
hidden_states
)
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
post_attention_layernorm
(
hidden_states
)
hidden_states
,
residual
=
self
.
post_attention_layernorm
(
hidden_states
,
residual
)
hidden_states
=
self
.
mlp
(
hidden_states
)
hidden_states
=
self
.
post_mlp_layernorm
(
hidden_states
)
...
...
@@ -267,7 +266,6 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
@@ -295,14 +293,6 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
sampling_metadata
)
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
...
...
Prev
1
…
22
23
24
25
26
27
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment