Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
388 additions
and
32 deletions
+388
-32
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+7
-1
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+2
-0
vllm/model_executor/models/qwen2_rm.py
vllm/model_executor/models/qwen2_rm.py
+2
-0
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+25
-24
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+14
-2
vllm/model_executor/models/roberta.py
vllm/model_executor/models/roberta.py
+2
-0
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+1
-0
vllm/model_executor/models/solar.py
vllm/model_executor/models/solar.py
+2
-0
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/stablelm.py
+2
-0
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-0
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/telechat2.py
+2
-0
vllm/model_executor/models/transformers.py
vllm/model_executor/models/transformers.py
+266
-0
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+33
-3
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+2
-0
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+2
-0
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+17
-2
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+2
-0
vllm/model_executor/pooling_metadata.py
vllm/model_executor/pooling_metadata.py
+2
-0
vllm/model_executor/sampling_metadata.py
vllm/model_executor/sampling_metadata.py
+2
-0
vllm/model_executor/utils.py
vllm/model_executor/utils.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/model_executor/models/qwen2_audio.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
...
...
@@ -108,7 +110,11 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"audio"
:
None
}
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
)
->
Mapping
[
str
,
int
]:
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
int
]:
hf_config
=
self
.
get_hf_config
()
max_source_positions
=
hf_config
.
audio_config
.
max_source_positions
max_output_lengths
=
(
max_source_positions
-
2
)
//
2
+
1
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
# Copyright 2024 The Qwen team.
...
...
vllm/model_executor/models/qwen2_rm.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 The Qwen team.
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
# Copyright 2024 The Qwen team.
...
...
@@ -707,8 +709,8 @@ class Qwen2VisionTransformer(nn.Module):
return
loaded_params
class
Qwen2EmbeddingItems
(
ModalityDataItems
[
dict
[
str
,
torch
.
Tensor
],
dict
[
str
,
torch
.
Tensor
]]):
class
Qwen2
VL
EmbeddingItems
(
ModalityDataItems
[
dict
[
str
,
torch
.
Tensor
],
dict
[
str
,
torch
.
Tensor
]]):
def
__init__
(
self
,
data
:
dict
,
modality
:
str
)
->
None
:
super
().
__init__
(
data
,
modality
)
...
...
@@ -740,26 +742,26 @@ class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
return
self
.
data
class
Qwen2ImageEmbeddingItems
(
Qwen2EmbeddingItems
):
class
Qwen2
VL
ImageEmbeddingItems
(
Qwen2
VL
EmbeddingItems
):
def
__init__
(
self
,
data
:
dict
)
->
None
:
super
().
__init__
(
data
,
"image"
)
class
Qwen2VideoEmbeddingItems
(
Qwen2EmbeddingItems
):
class
Qwen2V
LV
ideoEmbeddingItems
(
Qwen2
VL
EmbeddingItems
):
def
__init__
(
self
,
data
:
dict
)
->
None
:
super
().
__init__
(
data
,
"video"
)
class
Qwen2MultiModalDataParser
(
MultiModalDataParser
):
class
Qwen2
VL
MultiModalDataParser
(
MultiModalDataParser
):
def
_parse_image_data
(
self
,
data
:
Union
[
dict
[
str
,
torch
.
Tensor
],
ModalityData
[
ImageItem
]],
)
->
ModalityDataItems
[
Any
,
Any
]:
if
isinstance
(
data
,
dict
):
return
Qwen2EmbeddingItems
(
data
,
modality
=
"image"
)
return
Qwen2
VL
EmbeddingItems
(
data
,
modality
=
"image"
)
return
super
().
_parse_image_data
(
data
)
...
...
@@ -768,7 +770,7 @@ class Qwen2MultiModalDataParser(MultiModalDataParser):
data
:
Union
[
dict
[
str
,
torch
.
Tensor
],
ModalityData
[
VideoItem
]],
)
->
ModalityDataItems
[
Any
,
Any
]:
if
isinstance
(
data
,
dict
):
return
Qwen2EmbeddingItems
(
data
,
modality
=
"video"
)
return
Qwen2
VL
EmbeddingItems
(
data
,
modality
=
"video"
)
return
super
().
_parse_video_data
(
data
)
...
...
@@ -815,7 +817,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
,
"video"
:
None
}
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
)
->
Mapping
[
str
,
int
]:
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
int
]:
return
{
"image"
:
self
.
get_max_image_tokens
(),
"video"
:
self
.
get_max_video_tokens
(
seq_len
),
...
...
@@ -1001,7 +1007,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
):
def
_get_data_parser
(
self
)
->
MultiModalDataParser
:
return
Qwen2MultiModalDataParser
()
return
Qwen2
VL
MultiModalDataParser
()
def
_get_prompt_replacements
(
self
,
...
...
@@ -1046,26 +1052,21 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
image_grid_thw
=
hf_inputs
.
get
(
"image_grid_thw"
,
torch
.
empty
((
0
,
3
)))
image_slice_idxs
=
[
0
]
+
image_grid_thw
.
prod
(
-
1
).
cumsum_
(
0
).
tolist
()
image_slices
=
[
slice
(
image_slice_idxs
[
i
],
image_slice_idxs
[
i
+
1
])
for
i
in
range
(
len
(
image_grid_thw
))
]
image_grid_sizes
=
image_grid_thw
.
prod
(
-
1
)
video_grid_thw
=
hf_inputs
.
get
(
"video_grid_thw"
,
torch
.
empty
((
0
,
3
)))
video_slice_idxs
=
[
0
]
+
video_grid_thw
.
prod
(
-
1
).
cumsum_
(
0
).
tolist
()
video_slices
=
[
slice
(
video_slice_idxs
[
i
],
video_slice_idxs
[
i
+
1
])
for
i
in
range
(
len
(
video_grid_thw
))
]
video_grid_sizes
=
video_grid_thw
.
prod
(
-
1
)
return
dict
(
pixel_values
=
MultiModalFieldConfig
.
flat
(
"image"
,
image_slices
),
image_embeds
=
MultiModalFieldConfig
.
flat
(
"image"
,
image_slices
),
pixel_values
=
MultiModalFieldConfig
.
flat_from_sizes
(
"image"
,
image_grid_sizes
),
image_embeds
=
MultiModalFieldConfig
.
flat_from_sizes
(
"image"
,
image_grid_sizes
),
image_grid_thw
=
MultiModalFieldConfig
.
batched
(
"image"
),
pixel_values_videos
=
MultiModalFieldConfig
.
flat
(
"video"
,
video_slices
),
video_embeds
=
MultiModalFieldConfig
.
flat
(
"video"
,
video_slices
),
pixel_values_videos
=
MultiModalFieldConfig
.
flat_from_sizes
(
"video"
,
video_grid_sizes
),
video_embeds
=
MultiModalFieldConfig
.
flat_from_sizes
(
"video"
,
video_grid_sizes
),
video_grid_thw
=
MultiModalFieldConfig
.
batched
(
"video"
),
)
...
...
vllm/model_executor/models/registry.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it.
...
...
@@ -44,7 +45,7 @@ _TEXT_GENERATION_MODELS = {
"DeciLMForCausalLM"
:
(
"decilm"
,
"DeciLMForCausalLM"
),
"DeepseekForCausalLM"
:
(
"deepseek"
,
"DeepseekForCausalLM"
),
"DeepseekV2ForCausalLM"
:
(
"deepseek_v2"
,
"DeepseekV2ForCausalLM"
),
"DeepseekV3ForCausalLM"
:
(
"deepseek_v
3
"
,
"DeepseekV3ForCausalLM"
),
"DeepseekV3ForCausalLM"
:
(
"deepseek_v
2
"
,
"DeepseekV3ForCausalLM"
),
"ExaoneForCausalLM"
:
(
"exaone"
,
"ExaoneForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"Fairseq2LlamaForCausalLM"
:
(
"fairseq2_llama"
,
"Fairseq2LlamaForCausalLM"
),
...
...
@@ -171,6 +172,7 @@ _MULTIMODAL_MODELS = {
"PixtralForConditionalGeneration"
:
(
"pixtral"
,
"PixtralForConditionalGeneration"
),
# noqa: E501
"QWenLMHeadModel"
:
(
"qwen"
,
"QWenLMHeadModel"
),
"Qwen2VLForConditionalGeneration"
:
(
"qwen2_vl"
,
"Qwen2VLForConditionalGeneration"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
(
"qwen2_5_vl"
,
"Qwen2_5_VLForConditionalGeneration"
),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
(
"qwen2_audio"
,
"Qwen2AudioForConditionalGeneration"
),
# noqa: E501
"UltravoxModel"
:
(
"ultravox"
,
"UltravoxModel"
),
# [Encoder-decoder]
...
...
@@ -183,6 +185,10 @@ _SPECULATIVE_DECODING_MODELS = {
"MedusaModel"
:
(
"medusa"
,
"Medusa"
),
"MLPSpeculatorPreTrainedModel"
:
(
"mlp_speculator"
,
"MLPSpeculator"
),
}
_FALLBACK_MODEL
=
{
"TransformersModel"
:
(
"transformers"
,
"TransformersModel"
),
}
# yapf: enable
_VLLM_MODELS
=
{
...
...
@@ -191,6 +197,7 @@ _VLLM_MODELS = {
**
_CROSS_ENCODER_MODELS
,
**
_MULTIMODAL_MODELS
,
**
_SPECULATIVE_DECODING_MODELS
,
**
_FALLBACK_MODEL
,
}
...
...
@@ -377,7 +384,12 @@ class _ModelRegistry:
if
not
architectures
:
logger
.
warning
(
"No model architectures are specified"
)
return
architectures
normalized_arch
=
[]
for
model
in
architectures
:
if
model
not
in
self
.
models
:
model
=
"TransformersModel"
normalized_arch
.
append
(
model
)
return
normalized_arch
def
inspect_model_cls
(
self
,
...
...
vllm/model_executor/models/roberta.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
from
typing
import
Iterable
,
List
,
Optional
,
Tuple
...
...
vllm/model_executor/models/siglip.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Implementation of SiglipVisionModel intended to be only used
within a vision language model."""
...
...
vllm/model_executor/models/solar.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
...
...
vllm/model_executor/models/stablelm.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
# All rights reserved.
#
...
...
vllm/model_executor/models/starcoder2.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
...
...
vllm/model_executor/models/telechat2.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
...
...
vllm/model_executor/models/transformers.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrapper around `transformers` models"""
import
re
from
typing
import
Iterable
,
Literal
,
Optional
,
Union
import
torch
from
torch
import
nn
from
transformers
import
AutoModel
,
PreTrainedModel
from
transformers.modeling_utils
import
ALL_ATTENTION_FUNCTIONS
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed.utils
import
divide
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.utils
import
maybe_prefix
logger
=
init_logger
(
__name__
)
def vllm_flash_attention_forward(
        # Transformers args
        module: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: torch.Tensor,
        # Transformers kwargs
        scaling: float = None,
        # vLLM kwargs
        attn_metadata: AttentionMetadata = None,
        attention_instances: list[Attention] = None,
        **kwargs):
    """Attention function that forwards a Transformers attention call to vLLM.

    The vLLM attention layer is picked from `attention_instances` using
    `module.layer_idx`. Each of `query`, `key` and `value` has dimensions 1
    and 2 swapped and is then reshaped to two dimensions before being handed
    to that layer.

    Args:
        module: The calling attention module; only `layer_idx` is read.
        query: Query tensor (assumes a 4-D layout with the token dimension
            at index -2 -- TODO confirm against the Transformers caller).
        key: Key tensor, processed the same way as `query`.
        value: Value tensor, processed the same way as `query`.
        attention_mask: Accepted for signature compatibility; not used here.
        scaling: If not None, written to `impl.scale` of the selected layer.
        attn_metadata: Passed through unchanged to the vLLM layer.
        attention_instances: Sequence of vLLM attention layers indexed by
            `module.layer_idx`.
        **kwargs: Ignored.

    Returns:
        A 2-tuple `(attention_output, None)`.
    """
    layer_attn = attention_instances[module.layer_idx]
    if scaling is not None:
        layer_attn.impl.scale = float(scaling)

    # Captured before any transpose: size of the second-to-last dimension of
    # the incoming query, which becomes the leading dimension after reshape.
    num_rows = query.shape[-2]

    def _flatten(tensor):
        # Swap dims 1 and 2, then collapse everything else into one axis.
        return tensor.transpose(1, 2).reshape(num_rows, -1)

    flat_query = _flatten(query)
    flat_key = _flatten(key)
    flat_value = _flatten(value)

    attn_output = layer_attn.forward(
        flat_query,
        flat_key,
        flat_value,
        kv_cache=None,  # argument not used
        attn_metadata=attn_metadata)
    return attn_output, None
# Register the vLLM-backed attention function under the "vllm" key; this is
# the key passed as `attn_implementation="vllm"` when the model is built.
ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
    """Log, at debug level, that `old_module` was replaced by `new_module`.

    Args:
        name: Label identifying the replaced module in the log line.
        old_module: The module that was replaced.
        new_module: The module that took its place.
    """
    template = "%s: %s -> %s"
    logger.debug(template, name, old_module, new_module)
def replace_linear_class(
        linear: nn.Linear,
        style: Literal["colwise", "rowwise"],
        quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]:
    """
    Build a vLLM tensor parallel linear layer that stands in for `linear`.

    `quant_config` is accepted but not yet supported (it is not used).

    Args:
        linear (nn.Linear): `nn.Linear` to be replaced.
        style (str): Tensor parallel style of the new linear, e.g. "colwise".
        quant_config (QuantConfig): Quantization config for the new linear.

    Returns:
        Union[ColumnParallelLinear, RowParallelLinear]: The new linear. If
        `style` is a string other than "colwise" or "rowwise", a warning is
        logged and the original `linear` is returned unchanged.

    Raises:
        ValueError: If `style` is not a string.
    """
    if not isinstance(style, str):
        raise ValueError(
            f"Unsupported parallel style type {type(style)}, expected str")

    if style == "colwise":
        parallel_base = ColumnParallelLinear
    elif style == "rowwise":
        parallel_base = RowParallelLinear
    else:
        logger.warning(
            "Unsupported parallel style value: %s. "
            "This layer will not be tensor parallelized.", style)
        return linear

    class HFCompatibleLinear(parallel_base):
        """
        Wrapper class that removes `output_bias` from returned output.
        """

        def forward(self, input: torch.Tensor) -> torch.Tensor:
            # The parent forward returns a sequence; only its first element
            # is handed back to the caller.
            outputs = super().forward(input)
            return outputs[0]

    has_bias = linear.bias is not None
    return HFCompatibleLinear(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=has_bias,
    )
class TransformersModel(nn.Module):
    """vLLM model wrapper around a model built by `transformers.AutoModel`.

    The wrapped model is created from the Hugging Face config with
    `attn_implementation="vllm"`. Linear layers matched by the config's
    `base_model_tp_plan` are swapped for vLLM tensor parallel linears, the
    input embedding is swapped for a `VocabParallelEmbedding`, and a
    `ParallelLMHead`, `LogitsProcessor` and sampler are attached.
    """
    embedding_padding_modules = ["lm_head"]
    # TODO transformers will have a util to get it
    embedding_modules = ["embed_tokens"]

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Build the wrapped model and replace its layers with vLLM ones.

        Args:
            vllm_config: vLLM configuration; the HF config, cache config,
                quant config and `trust_remote_code` flag are read from it.
            prefix: Accepted for interface compatibility. It is overwritten
                with the wrapped model's `base_model_prefix` before use.
        """
        super().__init__()
        logger.info("Using Transformers backend.")

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.config = config
        self.quant_config = quant_config
        self.vocab_size = config.vocab_size
        self.unpadded_vocab_size = config.vocab_size

        self.model: PreTrainedModel = AutoModel.from_config(
            self.config,
            attn_implementation="vllm",
            trust_remote_code=vllm_config.model_config.trust_remote_code,
        )
        prefix = self.model.base_model_prefix

        # MLP modifications
        self.apply_base_model_tp_plan(self.model)

        # Attention modifications (assumes 1 attention op per hidden layer)
        tp_size = get_tensor_model_parallel_world_size()
        # Not every config defines `head_dim`; fall back to the conventional
        # hidden_size / num_attention_heads instead of raising AttributeError.
        head_dim = getattr(config, "head_dim", None)
        if head_dim is None:
            head_dim = config.hidden_size // config.num_attention_heads
        self.attention_instances = [
            Attention(
                num_heads=divide(config.num_attention_heads, tp_size),
                head_size=head_dim,
                # NOTE: We use Llama scale as default, if it's set by
                # Transformers, it's updated in vllm_flash_attention_forward
                scale=head_dim**-0.5,
                num_kv_heads=divide(config.num_key_value_heads, tp_size),
                cache_config=cache_config,
                quant_config=None,
                prefix=f"{i}.attn") for i in range(config.num_hidden_layers)
        ]

        # Model modifications
        self.replace_vocab_embed_class(self.model)

        # ForCausalLM modifications
        self.lm_head = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=None,
                                      prefix=maybe_prefix(prefix, "lm_head"))
        if config.tie_word_embeddings:
            self.lm_head.weight = self.model.get_input_embeddings().weight

        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size, logit_scale)
        self.sampler = get_sampler()

    def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""):
        """
        Apply the base model tensor parallelization plan to a module.
        Currently only supports linear layers.

        Walks `module` recursively. Every `nn.Linear` child whose qualified
        name matches a pattern of `config.base_model_tp_plan` is replaced via
        `replace_linear_class` using the style mapped to that pattern.

        Args:
            module: Module whose children are inspected.
            prefix: Qualified name of `module`, used to build child names.

        Raises:
            ValueError: If the config has no tensor parallel plan while the
                tensor parallel world size is greater than 1.
        """
        tp_plan = getattr(self.config, "base_model_tp_plan", None)
        if tp_plan is None:
            if get_tensor_model_parallel_world_size() > 1:
                raise ValueError(
                    "Trying to run tensor parallelization but the model does not "
                    "support it yet!")
            # No plan and no tensor parallelism: nothing to replace. Without
            # this early exit the loop below would call `.items()` on None.
            return

        for child_name, child_module in module.named_children():
            qual_name = maybe_prefix(prefix, child_name)
            for pattern, style in tp_plan.items():
                if re.match(pattern, qual_name) and isinstance(
                        child_module, nn.Linear):
                    new_module = replace_linear_class(child_module, style,
                                                      self.quant_config)
                    setattr(module, child_name, new_module)
                    log_replacement(qual_name, child_module, new_module)
            # Always descend into the original child (the pattern loop never
            # breaks, so this ran unconditionally before as well).
            self.apply_base_model_tp_plan(child_module, prefix=qual_name)

    def replace_vocab_embed_class(self, module: nn.Module):
        """Swap the wrapped model's input embedding for a vLLM embedding.

        Args:
            module: Not used; the replacement is always done on `self.model`.
        """
        # Use native set input embeddings
        new_module = VocabParallelEmbedding(
            self.vocab_size,
            self.config.hidden_size,
            org_num_embeddings=self.config.vocab_size,
            quant_config=None,
        )
        log_replacement("input embedding", self.model.get_input_embeddings(),
                        new_module)
        self.model.set_input_embeddings(new_module)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: list[torch.Tensor],  # argument not used
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the wrapped model and return its first output.

        A leading dimension of size 1 is added to `input_ids` and `positions`
        before the call and stripped from the result afterwards.

        Args:
            input_ids: Token ids.
            positions: Position ids, passed as `position_ids`.
            kv_caches: Not used.
            attn_metadata: Forwarded to the wrapped model as a keyword.
            intermediate_tensors: Forwarded to the wrapped model as a keyword.
            inputs_embeds: Not used by this implementation.

        Returns:
            The first element of the wrapped model's output with the leading
            dimension removed.
        """
        model_output = self.model(
            input_ids[None, ...],
            use_cache=False,
            position_ids=positions[None, ...],
            attn_metadata=attn_metadata,
            intermediate_tensors=intermediate_tensors,
            attention_instances=self.attention_instances,
            return_dict=False)[0][0, ...]  # we remove batch dimension for now
        return model_output

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Return the result of the logits processor applied with `lm_head`."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(self, logits: torch.Tensor,
               sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
        """Return the sampler's output for `logits`."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load checkpoint weights into matching parameters.

        A weight name that is not a parameter name is retried with the
        wrapped model's `base_model_prefix` prepended. Weights that still do
        not match any parameter are skipped.

        Args:
            weights: Iterable of `(name, tensor)` pairs.

        Returns:
            The set of parameter names that were loaded.
        """
        params_dict = dict(self.named_parameters())
        loaded_params = set[str]()
        for name, loaded_weight in weights:
            if name not in params_dict:
                name = f"{self.model.base_model_prefix}.{name}"
            if name in params_dict:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
                loaded_params.add(name)
        return loaded_params
vllm/model_executor/models/ultravox.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
"""PyTorch Ultravox model."""
import
math
...
...
@@ -20,6 +22,7 @@ from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.loader
import
DefaultModelLoader
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
...
...
@@ -31,7 +34,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
,
...
...
@@ -90,7 +93,11 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"audio"
:
None
}
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
)
->
Mapping
[
str
,
int
]:
def
get_mm_max_tokens_per_item
(
self
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
int
]:
feature_extractor
=
self
.
get_feature_extractor
()
max_audio_tokens
=
math
.
ceil
(
feature_extractor
.
chunk_length
*
_AUDIO_TOKENS_PER_SECOND
)
...
...
@@ -337,7 +344,20 @@ class ModifiedWhisperEncoder(WhisperEncoder):
UltravoxMultiModalProcessor
,
info
=
UltravoxProcessingInfo
,
dummy_inputs
=
UltravoxDummyInputsBuilder
)
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
,
SupportsLoRA
):
packed_modules_mapping
=
{
"qkv_proj"
:
[
"q_proj"
,
"k_proj"
,
"v_proj"
],
"gate_up_proj"
:
[
"gate_proj"
,
"up_proj"
]
}
# LoRA specific attributes
# TODO : Add LoRA to the audio tower and projector.
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
...
...
@@ -385,6 +405,16 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
return
get_sampler
()
def get_mm_mapping(self) -> MultiModelKeys:
    """
    Return the module prefixes of this multimodal model's components.
    """
    component_prefixes = dict(
        language_model="language_model.",
        connector="multi_modal_projector.",
        tower_model="audio_tower.",
    )
    return MultiModelKeys.from_string_field(**component_prefixes)
def
_audio_features_to_embeddings
(
self
,
input_features
:
torch
.
Tensor
)
->
torch
.
Tensor
:
audio_input
=
input_features
.
to
(
self
.
audio_tower
.
dtype
)
...
...
vllm/model_executor/models/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
from
dataclasses
import
dataclass
,
field
from
typing
import
(
Callable
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
...
...
vllm/model_executor/models/vision.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
abc
import
ABC
,
abstractmethod
from
typing
import
Final
,
Generic
,
Optional
,
Protocol
,
TypeVar
,
Union
...
...
vllm/model_executor/models/whisper.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
math
from
typing
import
(
Iterable
,
List
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
...
...
@@ -636,6 +638,19 @@ def input_mapper_for_whisper(
@
MULTIMODAL_REGISTRY
.
register_max_multimodal_tokens
(
"audio"
,
get_max_whisper_audio_tokens
)
class
WhisperForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
):
packed_modules_mapping
=
{
"self_attn.qkv_proj"
:
[
"self_attn.q_proj"
,
"self_attn.k_proj"
,
"self_attn.v_proj"
,
],
"encoder_attn.kv_proj"
:
[
"encoder_attn.k_proj"
,
"encoder_attn.v_proj"
],
}
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_substr
=
{
".fc1."
:
".mlp.fc1."
,
".fc2."
:
".mlp.fc2."
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
...
...
@@ -729,10 +744,10 @@ class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
loader
=
AutoWeightsLoader
(
self
,
skip_prefixes
=
[
"proj_out."
])
mapper
=
WeightsMapper
({
".fc1."
:
".mlp.fc1."
,
".fc2."
:
".mlp.fc2."
})
# add fake zeros bias for k_proj to state_dict
weights
=
_create_fake_bias_for_k_proj
(
weights
)
return
loader
.
load_weights
(
weights
,
mapper
=
mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_
mapper
)
def
_create_fake_bias_for_k_proj
(
...
...
vllm/model_executor/parameter.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
fractions
import
Fraction
from
typing
import
Callable
,
Optional
,
Union
...
...
vllm/model_executor/pooling_metadata.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
from
typing
import
Any
,
Dict
,
List
,
Tuple
...
...
vllm/model_executor/sampling_metadata.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
array
import
array
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
...
...
vllm/model_executor/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Utils for model executor."""
from
typing
import
Any
,
Dict
,
Optional
...
...
Prev
1
…
43
44
45
46
47
48
49
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment