Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
601bd326
Unverified
Commit
601bd326
authored
Mar 14, 2025
by
Cyrus Leung
Committed by
GitHub
Mar 14, 2025
Browse files
[Misc] Clean up type annotation for `SupportsMultiModal` (#14794)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
09269b31
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
89 additions
and
104 deletions
+89
-104
docs/source/contributing/model/multimodal.md
docs/source/contributing/model/multimodal.md
+3
-2
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+2
-2
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+4
-6
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+4
-6
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+4
-6
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+3
-4
vllm/model_executor/models/florence2.py
vllm/model_executor/models/florence2.py
+5
-5
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+4
-5
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+5
-5
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+5
-5
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+3
-4
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces.py
+13
-11
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+3
-4
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+4
-3
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+4
-5
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+4
-6
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+6
-8
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+4
-5
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+4
-6
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+5
-6
No files found.
docs/source/contributing/model/multimodal.md
View file @
601bd326
...
...
@@ -34,7 +34,8 @@ Further update the model as follows:
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
...
...
@@ -61,7 +62,7 @@ Further update the model as follows:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[
NestedTensor
s] = None,
multimodal_embeddings: Optional[
MultiModalEmbedding
s] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language
...
...
tests/distributed/test_pipeline_parallel.py
View file @
601bd326
...
...
@@ -214,7 +214,7 @@ MULTIMODAL_MODELS = {
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(),
"allenai/Molmo-7B-D-0924"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-vision-
128k-
instruct"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3
.5
-vision-instruct"
:
PPTestSettings
.
fast
(),
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2-Audio-7B-Instruct"
:
PPTestSettings
.
fast
(),
...
...
@@ -237,7 +237,7 @@ TEST_MODELS = [
"BAAI/bge-multilingual-gemma2"
,
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B"
,
"microsoft/Phi-3-vision-
128k-
instruct"
,
"microsoft/Phi-3
.5
-vision-instruct"
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# [LANGUAGE GENERATION - HYBRID ARCH]
"ai21labs/Jamba-tiny-dev"
,
...
...
vllm/model_executor/models/aria.py
View file @
601bd326
...
...
@@ -21,8 +21,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
,
...
...
@@ -35,7 +34,7 @@ from .idefics2_vision_model import Idefics2VisionConfig
from
.idefics2_vision_model
import
(
Idefics2VisionTransformer
as
Idefics3VisionTransformer
)
# yapf: enable
from
.interfaces
import
SupportsMultiModal
,
SupportsQuant
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsQuant
from
.llama
import
LlamaDecoderLayer
,
LlamaMLP
,
LlamaModel
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
is_pp_missing_parameter
,
maybe_prefix
,
...
...
@@ -607,8 +606,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
return
self
.
multi_modal_projector
(
image_outputs
,
image_attn_mask
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -618,7 +616,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/blip2.py
View file @
601bd326
...
...
@@ -15,8 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptIndexTargets
,
...
...
@@ -25,7 +24,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.blip
import
BlipVisionModel
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -629,8 +628,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
return
self
.
language_projection
(
query_output
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -640,7 +638,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/chameleon.py
View file @
601bd326
...
...
@@ -30,8 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import (
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
,
...
...
@@ -39,7 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -986,8 +985,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -1000,7 +998,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
601bd326
...
...
@@ -36,7 +36,7 @@ from vllm.transformers_utils.processors.deepseek_vl2 import (
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -605,8 +605,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
pixel_values
=
pixel_values
,
images_spatial_crop
=
images_spatial_crop
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -616,7 +615,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/florence2.py
View file @
601bd326
...
...
@@ -20,7 +20,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
BartParallelLMHead
,
BartScaledWordEmbedding
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
NestedTensors
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
MultiModalDataDict
,
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseProcessingInfo
,
...
...
@@ -30,7 +30,8 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsV0Only
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsV0Only
)
from
.utils
import
AutoWeightsLoader
,
flatten_bn
,
merge_multimodal_embeddings
...
...
@@ -1037,8 +1038,7 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
return
self
.
_encode_image
(
pixel_values
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -1048,7 +1048,7 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/fuyu.py
View file @
601bd326
...
...
@@ -18,7 +18,7 @@
""" PyTorch Fuyu model."""
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
List
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
from
typing
import
List
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
import
torch
import
torch.nn
as
nn
...
...
@@ -41,7 +41,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -327,8 +327,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
vision_embeddings_flat
.
split
(
patches_per_image
,
dim
=
0
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -338,7 +337,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
601bd326
...
...
@@ -14,8 +14,7 @@ from vllm.model_executor.layers.layernorm import GemmaRMSNorm
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
...
...
@@ -24,7 +23,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -481,7 +480,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
)
return
self
.
multi_modal_projector
(
vision_outputs
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -491,7 +491,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
if
multimodal_embeddings
is
None
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/glm4v.py
View file @
601bd326
...
...
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
NestedTensors
from
vllm.multimodal.inputs
import
MultiModalKwargs
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
BatchFeature
,
...
...
@@ -39,7 +39,8 @@ from vllm.sequence import IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
.chatglm
import
ChatGLMBaseModel
,
ChatGLMModel
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
)
from
.utils
import
flatten_bn
,
merge_multimodal_embeddings
...
...
@@ -596,8 +597,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
return
self
.
transformer
.
vision
(
pixel_values
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -608,7 +608,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
transformer
.
get_input_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/idefics3.py
View file @
601bd326
...
...
@@ -49,7 +49,7 @@ from vllm.sequence import IntermediateTensors
from
.idefics2_vision_model
import
(
Idefics2VisionTransformer
as
Idefics3VisionTransformer
)
# yapf: enable
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
from
.interfaces
import
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
from
.llama
import
LlamaModel
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -617,8 +617,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
sampler
=
get_sampler
()
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
model
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -628,7 +627,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/interfaces.py
View file @
601bd326
...
...
@@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
import
torch
from
torch
import
Tensor
from
typing_extensions
import
TypeIs
,
TypeVar
from
typing_extensions
import
TypeIs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.base_config
import
(
...
...
@@ -20,7 +20,14 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
T
=
TypeVar
(
"T"
,
default
=
Union
[
list
[
Tensor
],
Tensor
,
tuple
[
Tensor
,
...]])
MultiModalEmbeddings
=
Union
[
list
[
Tensor
],
Tensor
,
tuple
[
Tensor
,
...]]
"""
The output embeddings must be one of the following formats:
- A list or tuple of 2D tensors, where each tensor corresponds to
each input multimodal data item (e.g, image).
- A single 3D tensor, with the batch dimension grouping the 2D tensors.
"""
@
runtime_checkable
...
...
@@ -36,17 +43,12 @@ class SupportsMultiModal(Protocol):
MRO of your model class.
"""
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
T
:
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
"""
Returns multimodal embeddings generated from multimodal kwargs
to be merged with text embeddings.
The output embeddings must be one of the following formats:
- A list or tuple of 2D tensors, where each tensor corresponds to
each input multimodal data item (e.g, image).
- A single 3D tensor, with the batch dimension grouping the 2D tensors.
Note:
The returned multimodal embeddings must be in the same order as
the appearances of their corresponding multimodal data item in the
...
...
@@ -60,7 +62,7 @@ class SupportsMultiModal(Protocol):
def
get_input_embeddings
(
self
,
input_ids
:
Tensor
,
multimodal_embeddings
:
Optional
[
T
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
attn_metadata
:
Optional
[
"AttentionMetadata"
]
=
None
,
)
->
Tensor
:
...
...
...
@@ -69,7 +71,7 @@ class SupportsMultiModal(Protocol):
def
get_input_embeddings
(
self
,
input_ids
:
Tensor
,
multimodal_embeddings
:
Optional
[
T
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
Tensor
:
"""
Returns the input embeddings merged from the text embeddings from
...
...
vllm/model_executor/models/internvl.py
View file @
601bd326
...
...
@@ -37,7 +37,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -905,8 +905,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
self
.
visual_token_mask
=
None
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -916,7 +915,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/llava.py
View file @
601bd326
...
...
@@ -38,7 +38,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.utils
import
JSONTree
,
flatten_2d_lists
,
json_map_leaves
from
.clip
import
CLIPVisionModel
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.pixtral
import
(
PixtralHFVisionModel
,
get_pixtral_hf_image_feature_grid_size
)
from
.siglip
import
SiglipVisionModel
...
...
@@ -778,7 +778,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
return
embeds_in_batch
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
def
get_multimodal_embeddings
(
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -800,7 +801,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/llava_next.py
View file @
601bd326
...
...
@@ -16,12 +16,12 @@ from vllm.config import VllmConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
NestedTensors
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
from
vllm.multimodal.parse
import
ImageSize
from
vllm.sequence
import
IntermediateTensors
from
.clip
import
CLIPVisionModel
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.llava
import
(
BaseLlavaMultiModalProcessor
,
BaseLlavaProcessingInfo
,
LlavaDummyInputsBuilder
,
LlavaLikeConfig
,
LlavaMultiModalProjector
,
init_vision_tower_for_llava
)
...
...
@@ -480,8 +480,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
]
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -491,7 +490,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
if
multimodal_embeddings
is
None
:
...
...
vllm/model_executor/models/llava_next_video.py
View file @
601bd326
...
...
@@ -16,8 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
(
ImageSize
,
MultiModalDataItems
,
VideoEmbeddingItems
,
VideoProcessorItems
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
...
...
@@ -27,7 +26,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.llava
import
init_vision_tower_for_llava
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
...
...
@@ -421,8 +420,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
f
"Unsupported type of video input
{
type
(
video_pixels
)
}
"
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
video_input
=
self
.
_parse_and_validate_video_input
(
**
kwargs
)
if
video_input
is
None
:
return
None
...
...
@@ -432,7 +430,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/llava_onevision.py
View file @
601bd326
...
...
@@ -19,8 +19,7 @@ from vllm.model_executor.layers.activation import get_act_fn
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
(
ImageSize
,
MultiModalDataItems
,
VideoEmbeddingItems
,
VideoProcessorItems
)
from
vllm.multimodal.processing
import
PromptReplacement
,
PromptUpdate
...
...
@@ -29,7 +28,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.utils
import
is_list_of
from
.clip
import
CLIPVisionModel
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.llava
import
LlavaDummyInputsBuilder
,
init_vision_tower_for_llava
from
.llava_next
import
(
BaseLlavaNextMultiModalProcessor
,
LlavaNextLikeConfig
,
LlavaNextProcessingInfo
)
...
...
@@ -856,7 +855,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
return
image_feature
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
tuple
[
torch
.
Tensor
,
...]
]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
if
not
modalities
:
return
None
...
...
@@ -882,7 +881,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
tuple
[
torch
.
Tensor
,
...]
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbeddings
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
@@ -894,10 +893,9 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings_v0
(
self
,
input_ids
:
torch
.
Tensor
,
image_input
:
Optional
[
NestedTensor
s
]
=
None
,
video_input
:
Optional
[
NestedTensor
s
]
=
None
,
image_input
:
Optional
[
LlavaOnevisionImagePixelInput
s
]
=
None
,
video_input
:
Optional
[
LlavaOnevisionVideoPixelInput
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
)
if
image_input
is
not
None
:
image_embeds
=
self
.
_process_image_input
(
image_input
)
...
...
vllm/model_executor/models/molmo.py
View file @
601bd326
...
...
@@ -52,8 +52,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
JSONTree
,
flatten_2d_lists
,
json_map_leaves
from
.interfaces
import
(
SupportsLoRA
,
Supports
MultiModal
,
Supports
PP
,
SupportsQuant
)
from
.interfaces
import
(
MultiModal
Embeddings
,
Supports
LoRA
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
)
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
...
...
@@ -1577,8 +1577,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
return
embeds_in_batch
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -1598,7 +1597,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/paligemma.py
View file @
601bd326
...
...
@@ -13,8 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
MultiModalInputs
,
MultiModalKwargs
)
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptIndexTargets
,
...
...
@@ -23,7 +22,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -328,8 +327,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
return
self
.
multi_modal_projector
(
image_features
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -341,7 +339,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
vllm/model_executor/models/phi3v.py
View file @
601bd326
...
...
@@ -31,8 +31,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargs
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
# yapf conflicts with isort for this block
...
...
@@ -48,7 +47,8 @@ from vllm.sequence import IntermediateTensors
from
vllm.utils
import
is_list_of
from
.clip
import
CLIPVisionModel
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
)
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
...
...
@@ -649,8 +649,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
return
image_embeds
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]]:
self
,
**
kwargs
:
object
)
->
Optional
[
MultiModalEmbeddings
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
...
...
@@ -660,7 +659,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensor
s
]
=
None
,
multimodal_embeddings
:
Optional
[
MultiModalEmbedding
s
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment