Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2f0a0a17
Unverified
Commit
2f0a0a17
authored
Nov 26, 2024
by
Roger Wang
Committed by
GitHub
Nov 26, 2024
Browse files
[V1] Refactor model executable interface for multimodal models (#10570)
Signed-off-by:
Roger Wang
<
ywang@roblox.com
>
parent
7576cd38
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
581 additions
and
306 deletions
+581
-306
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+37
-24
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+41
-17
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+35
-19
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+29
-14
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces.py
+35
-1
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+39
-15
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+8
-7
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+33
-18
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+29
-15
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+53
-21
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+46
-42
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+31
-21
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+9
-7
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+37
-22
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+68
-34
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+48
-24
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+1
-4
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-1
No files found.
vllm/model_executor/models/blip2.py
View file @
2f0a0a17
...
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
...
@@ -609,6 +610,25 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -609,6 +610,25 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
return
self
.
language_projection
(
query_output
)
return
self
.
language_projection
(
query_output
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
BLIP2_IMAGE_TOKEN_ID
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -616,6 +636,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -616,6 +636,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
"""Run forward pass for BLIP-2.
"""Run forward pass for BLIP-2.
...
@@ -648,32 +669,24 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -648,32 +669,24 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
See also:
See also:
:class:`Blip2ImageInputs`
:class:`Blip2ImageInputs`
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if
image_input
is
not
None
:
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
input_ids
)
vision_embeddings
)
input_ids
=
None
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision_embeddings
,
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
BLIP2_IMAGE_TOKEN_ID
)
positions
,
kv_caches
,
input_ids
=
None
attn_metadata
,
else
:
intermediate_tensors
,
inputs_embeds
=
None
inputs_embeds
=
inputs_embeds
)
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
=
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
)
return
hidden_states
return
hidden_states
...
...
vllm/model_executor/models/chameleon.py
View file @
2f0a0a17
...
@@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import (
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
,
consecutive_placeholder_ranges
,
repeat_and_pad_placeholder_tokens
)
repeat_and_pad_placeholder_tokens
)
...
@@ -38,7 +39,7 @@ from vllm.utils import print_warning_once
...
@@ -38,7 +39,7 @@ from vllm.utils import print_warning_once
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
is_pp_missing_parameter
,
from
.utils
import
(
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
maybe_prefix
,
merge_multimodal_embeddings
)
# These configs are not part of the model config but the preprocessor
# These configs are not part of the model config but the preprocessor
# and processor files, so we hardcode them in the model file for now.
# and processor files, so we hardcode them in the model file for now.
...
@@ -987,6 +988,29 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -987,6 +988,29 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
data
=
self
.
_validate_pixel_values
(
pixel_values
),
data
=
self
.
_validate_pixel_values
(
pixel_values
),
)
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
assert
self
.
model
.
vqmodel
is
not
None
image_tokens
=
self
.
model
.
get_image_tokens
(
image_input
[
"data"
].
to
(
self
.
config
.
torch_dtype
))
vision_embeddings
=
self
.
model
.
get_input_embeddings
(
image_tokens
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
model
.
vocabulary_mapping
.
image_token_id
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -994,27 +1018,27 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -994,27 +1018,27 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
input_ids
=
None
input_ids
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
hidden_states
=
self
.
model
(
input_ids
,
positions
,
if
image_input
is
not
None
:
kv_caches
,
assert
self
.
model
.
vqmodel
is
not
None
attn_metadata
,
image_tokens
=
self
.
model
.
get_image_tokens
(
intermediate_tensors
,
image_input
[
"data"
].
to
(
self
.
config
.
torch_dtype
))
inputs_embeds
=
inputs_embeds
)
image_token_id
=
self
.
model
.
vocabulary_mapping
.
image_token_id
special_image_mask
=
input_ids
==
image_token_id
image_tokens
=
image_tokens
.
to
(
input_ids
.
device
,
input_ids
.
dtype
)
input_ids
=
input_ids
.
masked_scatter
(
special_image_mask
,
image_tokens
)
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
)
return
hidden_states
return
hidden_states
def
compute_logits
(
def
compute_logits
(
...
...
vllm/model_executor/models/chatglm.py
View file @
2f0a0a17
...
@@ -33,7 +33,8 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
...
@@ -33,7 +33,8 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalData
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
(
MultiModalData
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SequenceData
)
SequenceData
)
...
@@ -545,6 +546,30 @@ class ChatGLMModel(nn.Module):
...
@@ -545,6 +546,30 @@ class ChatGLMModel(nn.Module):
"""
)
"""
)
return
GLMImagePixelInputs
(
pixel_values
=
pixel_values
)
return
GLMImagePixelInputs
(
pixel_values
=
pixel_values
)
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
[
"pixel_values"
]
is
None
:
return
None
pixel_values
=
image_input
[
"pixel_values"
].
to
(
dtype
=
self
.
config
.
torch_dtype
)
vision_embeddings
=
self
.
vision
(
pixel_values
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
embedding
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_glm_vision_embeddings
(
input_ids
=
input_ids
,
inputs_embeds
=
inputs_embeds
,
vision_embeddings
=
multimodal_embeddings
,
boi_token_id
=
self
.
config
.
boi_token_id
,
eoi_token_id
=
self
.
config
.
eoi_token_id
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -552,26 +577,17 @@ class ChatGLMModel(nn.Module):
...
@@ -552,26 +577,17 @@ class ChatGLMModel(nn.Module):
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
if
intermediate_tensors
is
None
:
inputs_embeds
=
self
.
embedding
(
input_ids
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
# condition is for v0 compatibility.
if
intermediate_tensors
is
None
and
inputs_embeds
is
None
:
if
image_input
[
"pixel_values"
]
is
not
None
:
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
pixel_values
=
image_input
[
"pixel_values"
].
to
(
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
dtype
=
inputs_embeds
.
dtype
)
vision_embeddings
)
image_embeds
=
self
.
vision
(
pixel_values
)
input_ids
=
None
boi_token_id
=
self
.
config
.
boi_token_id
eoi_token_id
=
self
.
config
.
eoi_token_id
inputs_embeds
=
merge_glm_vision_embeddings
(
input_ids
=
input_ids
,
inputs_embeds
=
inputs_embeds
,
vision_embeddings
=
image_embeds
,
boi_token_id
=
boi_token_id
,
eoi_token_id
=
eoi_token_id
)
else
:
else
:
inputs_embeds
=
intermediate_tensors
[
"hidden_states"
]
inputs_embeds
=
intermediate_tensors
[
"hidden_states"
]
...
...
vllm/model_executor/models/fuyu.py
View file @
2f0a0a17
...
@@ -35,6 +35,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
...
@@ -35,6 +35,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
)
consecutive_placeholder_ranges
)
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
...
@@ -302,6 +303,25 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -302,6 +303,25 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
vision_embeddings
,
_
=
self
.
vision_embed_tokens
(
image_input
[
"data"
])
vision_embeddings
,
_
=
self
.
vision_embed_tokens
(
image_input
[
"data"
])
return
vision_embeddings
return
vision_embeddings
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
_IMAGE_TOKEN_ID
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -309,24 +329,19 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -309,24 +329,19 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
):
):
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if
image_input
is
not
None
:
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
language_model
.
model
.
embed_tokens
(
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
input_ids
)
vision_embeddings
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
None
input_ids
,
inputs_embeds
,
vision_embeddings
,
self
.
image_token_id
)
else
:
inputs_embeds
=
None
hidden_states
=
self
.
language_model
(
hidden_states
=
self
.
language_model
(
input_ids
=
input_ids
,
input_ids
=
input_ids
,
...
...
vllm/model_executor/models/interfaces.py
View file @
2f0a0a17
...
@@ -2,7 +2,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
...
@@ -2,7 +2,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
Protocol
,
Type
,
Union
,
overload
,
runtime_checkable
)
Protocol
,
Type
,
Union
,
overload
,
runtime_checkable
)
import
torch
import
torch
from
typing_extensions
import
TypeIs
from
typing_extensions
import
TypeIs
,
TypeVar
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
supports_kw
from
vllm.utils
import
supports_kw
...
@@ -10,10 +10,14 @@ from vllm.utils import supports_kw
...
@@ -10,10 +10,14 @@ from vllm.utils import supports_kw
from
.interfaces_base
import
is_embedding_model
from
.interfaces_base
import
is_embedding_model
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.attention
import
AttentionMetadata
from
vllm.multimodal.inputs
import
NestedTensors
# noqa: F401
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
T
=
TypeVar
(
"T"
,
default
=
"NestedTensors"
)
@
runtime_checkable
@
runtime_checkable
class
SupportsMultiModal
(
Protocol
):
class
SupportsMultiModal
(
Protocol
):
...
@@ -28,6 +32,36 @@ class SupportsMultiModal(Protocol):
...
@@ -28,6 +32,36 @@ class SupportsMultiModal(Protocol):
MRO of your model class.
MRO of your model class.
"""
"""
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
T
]:
"""
Returns multimodal embeddings generated from multimodal kwargs
to be merged with text embeddings.
"""
...
# Only for models that support v0 chunked prefill
# TODO(ywang96): Remove this overload once v0 is deprecated
@
overload
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
T
]
=
None
,
attn_metadata
:
Optional
[
"AttentionMetadata"
]
=
None
,
)
->
torch
.
Tensor
:
...
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
T
]
=
None
,
)
->
torch
.
Tensor
:
"""
Returns the input embeddings merged from the text embeddings from
input_ids and the multimodal embeddings generated from multimodal
kwargs.
"""
...
# We can't use runtime_checkable with ClassVar for issubclass checks
# We can't use runtime_checkable with ClassVar for issubclass checks
# so we need to treat the class as an instance and use isinstance instead
# so we need to treat the class as an instance and use isinstance instead
...
...
vllm/model_executor/models/internvl.py
View file @
2f0a0a17
...
@@ -26,6 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
...
@@ -26,6 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel
)
InternVisionPatchModel
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
vllm.utils
import
is_list_of
...
@@ -641,6 +642,26 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -641,6 +642,26 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
visual_token_mask
=
None
visual_token_mask
=
None
return
visual_token_mask
return
visual_token_mask
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
assert
self
.
img_context_token_id
is
not
None
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
img_context_token_id
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -648,26 +669,22 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -648,26 +669,22 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
visual_token_mask
=
None
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
visual_token_mask
=
None
else
:
# NOTE: In v1, inputs_embeds is always generated at model runner, this
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
# condition is for v0 compatibility.
if
image_input
is
not
None
:
elif
inputs_embeds
is
None
:
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
input_ids
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
vision_embeddings
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
None
input_ids
,
inputs_embeds
,
vision_embeddings
,
self
.
img_context_token_id
)
visual_token_mask
=
self
.
_get_visual_token_mask
(
input_ids
)
input_ids
=
None
else
:
inputs_embeds
=
None
visual_token_mask
=
None
forward_kwargs
=
{
forward_kwargs
=
{
"input_ids"
:
input_ids
,
"input_ids"
:
input_ids
,
...
@@ -677,6 +694,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -677,6 +694,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
"intermediate_tensors"
:
intermediate_tensors
,
"intermediate_tensors"
:
intermediate_tensors
,
"inputs_embeds"
:
inputs_embeds
,
"inputs_embeds"
:
inputs_embeds
,
}
}
if
self
.
img_context_token_id
is
not
None
:
visual_token_mask
=
self
.
_get_visual_token_mask
(
input_ids
)
# We always overwrite it back to None after computing visual token
# mask so that this doesn't need to depend on encoder output
self
.
img_context_token_id
=
None
if
self
.
is_mono
:
if
self
.
is_mono
:
forward_kwargs
.
update
({
"visual_token_mask"
:
visual_token_mask
})
forward_kwargs
.
update
({
"visual_token_mask"
:
visual_token_mask
})
...
...
vllm/model_executor/models/llava.py
View file @
2f0a0a17
...
@@ -478,7 +478,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -478,7 +478,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
image_features
=
self
.
_process_image_pixels
(
image_input
)
image_features
=
self
.
_process_image_pixels
(
image_input
)
return
self
.
multi_modal_projector
(
image_features
)
return
self
.
multi_modal_projector
(
image_features
)
def
process_mm_inputs
(
self
,
**
kwargs
)
:
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
None
...
@@ -488,12 +488,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -488,12 +488,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
def
get_input_embeddings
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
vision
_embeddings
:
Optional
[
NestedTensors
]
=
None
,
multimodal
_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
vision
_embeddings
is
not
None
:
if
multimodal
_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision
_embeddings
,
input_ids
,
inputs_embeds
,
multimodal
_embeddings
,
self
.
config
.
image_token_index
)
self
.
config
.
image_token_index
)
return
inputs_embeds
return
inputs_embeds
...
@@ -544,10 +544,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -544,10 +544,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
inputs_embeds
=
None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif
inputs_embeds
is
None
:
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
process_mm_inputs
(
**
kwargs
)
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
vision_embeddings
)
input_ids
=
None
input_ids
=
None
...
...
vllm/model_executor/models/llava_next.py
View file @
2f0a0a17
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
from
vllm.utils
import
is_list_of
from
vllm.utils
import
is_list_of
...
@@ -565,6 +566,30 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -565,6 +566,30 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
for
i
,
patch_features_batch
in
enumerate
(
patch_embeddings
)
for
i
,
patch_features_batch
in
enumerate
(
patch_embeddings
)
]
]
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
if
multimodal_embeddings
is
None
:
return
self
.
language_model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
embed_multimodal
(
input_ids
,
self
.
config
.
image_token_index
,
self
.
language_model
.
model
.
get_input_embeddings
,
multimodal_embeddings
,
)
return
inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -572,6 +597,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -572,6 +597,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
"""Run forward pass for LlaVA-NeXT.
"""Run forward pass for LlaVA-NeXT.
...
@@ -620,24 +646,14 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -620,24 +646,14 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
inputs_embeds
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
not
None
:
inputs_embeds
=
embed_multimodal
(
input_ids
,
self
.
config
.
image_token_index
,
self
.
language_model
.
model
.
get_input_embeddings
,
lambda
_
:
self
.
_process_image_input
(
image_input
),
)
else
:
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
input_ids
)
# always pass the input via `inputs_embeds`
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# to make sure the computation graph is consistent
# condition is for v0 compatibility.
# for `torch.compile` integration
elif
inputs_embeds
is
None
:
input_ids
=
None
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
input_ids
=
None
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
positions
,
positions
,
...
@@ -645,7 +661,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -645,7 +661,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
attn_metadata
,
attn_metadata
,
intermediate_tensors
,
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
)
inputs_embeds
=
inputs_embeds
)
return
hidden_states
return
hidden_states
def
compute_logits
(
def
compute_logits
(
...
...
vllm/model_executor/models/llava_next_video.py
View file @
2f0a0a17
...
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
...
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
repeat_and_pad_placeholder_tokens
)
repeat_and_pad_placeholder_tokens
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -388,6 +389,25 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -388,6 +389,25 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
raise
ValueError
(
raise
ValueError
(
f
"Unsupported type of video input
{
type
(
video_pixels
)
}
"
)
f
"Unsupported type of video input
{
type
(
video_pixels
)
}
"
)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute video embeddings for the multimodal inputs in ``kwargs``.

    Args:
        **kwargs: Keyword arguments forwarded unchanged to
            ``self._parse_and_validate_video_input``.

    Returns:
        ``None`` when the parser reports no video input; otherwise the
        result of ``self._process_video_pixels`` on the parsed input.
    """
    video_input = self._parse_and_validate_video_input(**kwargs)
    if video_input is None:
        return None
    return self._process_video_pixels(video_input)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids``, merging in video embeddings when given.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: Video embeddings to merge in, or ``None``
            for text-only input.

    Returns:
        The language model's embeddings for ``input_ids``. When
        ``multimodal_embeddings`` is given, they are first passed through
        ``merge_multimodal_embeddings`` with the placeholder id
        ``self.config.video_token_index``.
    """
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    return merge_multimodal_embeddings(
        input_ids,
        inputs_embeds,
        multimodal_embeddings,
        self.config.video_token_index,
    )
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -395,6 +415,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -395,6 +415,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
"""Run forward pass for LlaVA-NeXT-Video.
"""Run forward pass for LlaVA-NeXT-Video.
...
@@ -404,22 +425,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -404,22 +425,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values_videos: Pixels in each frame for each input video.
pixel_values_videos: Pixels in each frame for each input video.
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
video_input
=
self
.
_parse_and_validate_video_input
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
if
video_input
is
not
None
:
# condition is for v0 compatibility.
video_embeddings
=
self
.
_process_video_pixels
(
video_input
)
elif
inputs_embeds
is
None
:
inputs_embeds
=
self
.
language_model
\
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
.
model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
=
None
input_ids
,
inputs_embeds
,
video_embeddings
,
self
.
config
.
video_token_index
)
input_ids
=
None
else
:
inputs_embeds
=
None
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
positions
,
positions
,
...
...
vllm/model_executor/models/llava_onevision.py
View file @
2f0a0a17
...
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.activation import get_act_fn
...
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.activation import get_act_fn
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
repeat_and_pad_placeholder_tokens
)
repeat_and_pad_placeholder_tokens
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -824,6 +825,49 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -824,6 +825,49 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
image_feature
=
image_feature
.
view
(
batch_frames
,
-
1
,
dim
)
image_feature
=
image_feature
.
view
(
batch_frames
,
-
1
,
dim
)
return
image_feature
return
image_feature
def get_multimodal_embeddings(
        self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
    """Compute embeddings for every modality present in ``kwargs``.

    Args:
        **kwargs: Keyword arguments forwarded unchanged to
            ``self._parse_and_validate_multimodal_inputs``.

    Returns:
        ``None`` when the parser returns nothing truthy. Otherwise a list
        of ``(embeddings, modality)`` tuples, with the ``"image"`` entry
        (if any) before the ``"video"`` entry (if any).
    """
    modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
    if not modalities:
        return None

    # We make a tuple of each embedding with its modality string. This is a
    # temporary workaround for models to handle mixed modalities when
    # get_multimodal_embeddings and get_input_embeddings are called
    # separately.
    # TODO(ywang96): Add support for mixed-modality inference for v1.
    multimodal_embeddings: List[Tuple[NestedTensors, str]] = []

    if "images" in modalities:
        vision_embeddings = self._process_image_input(modalities["images"])
        multimodal_embeddings.append((vision_embeddings, "image"))

    if "videos" in modalities:
        video_embeddings = self._process_video_pixels(modalities["videos"])
        multimodal_embeddings.append((video_embeddings, "video"))

    return multimodal_embeddings
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[List[Tuple[NestedTensors,
                                               str]]] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in per-modality embeddings.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: ``(embeddings, modality)`` tuples, where
            ``modality`` is ``"image"`` or ``"video"``; ``None`` for
            text-only input. Entries with any other modality string are
            ignored.

    Returns:
        The language model's embeddings for ``input_ids``, with each
        entry merged in via ``merge_multimodal_embeddings`` using the
        placeholder id for its modality.
    """
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    for embeddings, modality in multimodal_embeddings:
        # The modality strings are mutually exclusive, so at most one
        # branch can apply per entry.
        if modality == "image":
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, embeddings,
                self.config.image_token_index)
        elif modality == "video":
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, embeddings,
                self.config.video_token_index)
    return inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -831,6 +875,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -831,6 +875,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
"""Run forward pass for LlaVA-Onevision.
"""Run forward pass for LlaVA-Onevision.
...
@@ -840,28 +885,15 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -840,28 +885,15 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values_videos: Pixels in each frame for each input video.
pixel_values_videos: Pixels in each frame for each input video.
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
modalities
=
self
.
_parse_and_validate_multimodal_inputs
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
if
modalities
:
# condition is for v0 compatibility.
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
elif
inputs_embeds
is
None
:
input_ids
)
multimodal_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
if
"images"
in
modalities
:
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
image_input
=
modalities
[
"images"
]
multimodal_embeddings
)
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
input_ids
=
None
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision_embeddings
,
self
.
config
.
image_token_index
)
if
"videos"
in
modalities
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_video_pixels
(
video_input
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
video_embeddings
,
self
.
config
.
video_token_index
)
input_ids
=
None
else
:
inputs_embeds
=
None
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
positions
,
positions
,
...
...
vllm/model_executor/models/molmo.py
View file @
2f0a0a17
...
@@ -3,7 +3,7 @@ import re
...
@@ -3,7 +3,7 @@ import re
from
array
import
array
from
array
import
array
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
functools
import
lru_cache
,
partial
from
functools
import
lru_cache
,
partial
from
typing
import
Iterable
,
List
,
Mapping
,
Optional
,
Tuple
,
TypedDict
,
Union
from
typing
import
Iterable
,
List
,
Mapping
,
Optional
,
Tuple
,
TypedDict
import
torch
import
torch
from
einops
import
rearrange
from
einops
import
rearrange
...
@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.platforms
import
_Backend
from
vllm.platforms
import
_Backend
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
...
@@ -756,6 +757,12 @@ class MolmoModel(nn.Module):
...
@@ -756,6 +757,12 @@ class MolmoModel(nn.Module):
make_empty_intermediate_tensors_factory
(
make_empty_intermediate_tensors_factory
(
[
"hidden_states"
,
"residual"
],
config
.
hidden_size
))
[
"hidden_states"
,
"residual"
],
config
.
hidden_size
))
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
) -> torch.Tensor:
    """Return ``self.embed_tokens(input_ids)``.

    Args:
        input_ids: Token ids to embed.

    Returns:
        Whatever ``self.embed_tokens`` returns for ``input_ids``.
    """
    return self.embed_tokens(input_ids)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -1098,19 +1105,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1098,19 +1105,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
image_features
return
image_features
def
_mer
ge_multimodal_embeddings
(
def
ge
t
_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
self
,
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
i
nputs_embeds
:
torch
.
Tensor
,
i
f
image_input
is
None
:
image_features
:
torch
.
Tensor
,
return
None
image_
input_idx
:
torch
.
Tensor
,
image_
features
=
self
.
_process_image_input
(
image_input
)
seq_len
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]],
image_input_idx
=
image_input
[
"image_input_idx"
]
)
->
torch
.
Tensor
:
seq_len
=
image_input
[
"seq_len"
]
batch_size
,
num_image
,
num_patch
=
image_features
.
shape
[:
3
]
batch_size
,
num_image
,
num_patch
=
image_features
.
shape
[:
3
]
assert
image_input_idx
.
shape
==
(
batch_size
,
num_image
,
num_patch
)
assert
image_input_idx
.
shape
==
(
batch_size
,
num_image
,
num_patch
)
image_features
=
image_features
.
to
(
inputs_embeds
.
device
)
seq_len
=
seq_len
.
to
(
inputs_embeds
.
device
)
# insert the image feature into the embedding.
# insert the image feature into the embedding.
image_features
=
image_features
.
view
(
batch_size
,
num_image
*
num_patch
,
image_features
=
image_features
.
view
(
batch_size
,
num_image
*
num_patch
,
-
1
)
-
1
)
...
@@ -1130,12 +1134,24 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1130,12 +1134,24 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
image_input_idx
=
image_input_idx
+
offset
.
to
(
image_input_idx
.
dtype
)
image_input_idx
=
image_input_idx
+
offset
.
to
(
image_input_idx
.
dtype
)
image_input_idx
=
image_input_idx
.
flatten
()[:,
None
]
image_input_idx
=
image_input_idx
.
flatten
()[:,
None
]
mat
=
image_input_idx
==
torch
.
arange
(
mat
=
image_input_idx
==
torch
.
arange
(
seq_len
.
sum
().
item
(),
device
=
i
nputs_embed
s
.
device
)[
None
,
:]
seq_len
.
sum
().
item
(),
device
=
i
mage_feature
s
.
device
)[
None
,
:]
mat
=
mat
.
to
(
image_features
.
dtype
)
mat
=
mat
.
to
(
image_features
.
dtype
)
inputs_embeds
=
inputs_embeds
+
torch
.
einsum
(
'nd,nm->md'
,
# Note: In this original implementation from AI2, the final
image_features
,
mat
)
# vision_embeddings will always be the same length
# of input embeddings, which is not very efficient.
# TODO(ywang96): see if this can be optimized.
vision_embeddings
=
torch
.
einsum
(
'nd,nm->md'
,
image_features
,
mat
)
return
vision_embeddings
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and add the multimodal embeddings when given.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: Value added (with ``+``) to the text
            embeddings, or ``None`` for text-only input.

    Returns:
        ``self.model.get_input_embeddings(input_ids)``, plus
        ``multimodal_embeddings`` when it is not ``None``.
    """
    inputs_embeds = self.model.get_input_embeddings(input_ids)
    if multimodal_embeddings is not None:
        # Unlike placeholder-based merging, the two are summed here.
        inputs_embeds = inputs_embeds + multimodal_embeddings
    return inputs_embeds
return
inputs_embeds
def
forward
(
def
forward
(
...
@@ -1145,39 +1161,27 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1145,39 +1161,27 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
SamplerOutput
:
)
->
SamplerOutput
:
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
inputs_embeds
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
not
None
:
inputs_embeds
=
self
.
model
.
embed_tokens
(
input_ids
)
image_features
=
self
.
_process_image_input
(
image_input
)
inputs_embeds
=
self
.
_merge_multimodal_embeddings
(
inputs_embeds
,
image_features
,
image_input
[
"image_input_idx"
],
image_input
[
"seq_len"
],
)
else
:
inputs_embeds
=
self
.
model
.
embed_tokens
(
input_ids
)
# always pass the input via `inputs_embeds`
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# to make sure the computation graph is consistent
# condition is for v0 compatibility.
# for `torch.compile` integration
elif
inputs_embeds
is
None
:
input_ids
=
None
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
hidden_states
=
self
.
model
(
vision_embeddings
)
input_ids
=
input_ids
,
input_ids
=
None
positions
=
positions
,
kv_caches
=
kv_caches
,
hidden_states
=
self
.
model
(
input_ids
,
attn_metadata
=
attn_metadata
,
positions
,
intermediate_tensors
=
intermediate_tensors
,
kv_caches
,
inputs_embeds
=
inputs_embeds
,
attn_metadata
,
)
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
)
return
hidden_states
return
hidden_states
...
...
vllm/model_executor/models/paligemma.py
View file @
2f0a0a17
...
@@ -13,6 +13,7 @@ from vllm.logger import init_logger
...
@@ -13,6 +13,7 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -240,36 +241,45 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -240,36 +241,45 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
return
self
.
multi_modal_projector
(
image_features
)
return
self
.
multi_modal_projector
(
image_features
)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute scaled image embeddings for the inputs in ``kwargs``.

    Args:
        **kwargs: Keyword arguments forwarded unchanged to
            ``self._parse_and_validate_image_input``.

    Returns:
        ``None`` when the parser reports no image input; otherwise the
        result of ``self._process_image_input`` multiplied by
        ``self.config.hidden_size ** -0.5``.
    """
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is None:
        return None

    vision_embeddings = self._process_image_input(image_input)
    # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
    return vision_embeddings * (self.config.hidden_size**-0.5)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids``, merging in image embeddings when given.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: Image embeddings to merge in, or ``None``
            for text-only input.

    Returns:
        The language model's embeddings for ``input_ids``. When
        ``multimodal_embeddings`` is given, they are first passed through
        ``merge_multimodal_embeddings`` with the placeholder id
        ``self.config.image_token_index``.
    """
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    return merge_multimodal_embeddings(
        input_ids,
        inputs_embeds,
        multimodal_embeddings,
        self.config.image_token_index,
    )
def
forward
(
self
,
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
**
kwargs
:
object
)
->
Union
[
SamplerOutput
,
IntermediateTensors
]:
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
parsed_image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if
parsed_image_input
is
not
None
:
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
_process_image_input
(
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
parsed_image_input
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
vision_embeddings
)
vision_embeddings
=
vision_embeddings
*
(
input_ids
=
None
self
.
config
.
hidden_size
**-
0.5
)
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision_embeddings
,
self
.
config
.
image_token_index
)
input_ids
=
None
else
:
inputs_embeds
=
None
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
positions
,
positions
,
...
...
vllm/model_executor/models/phi3v.py
View file @
2f0a0a17
...
@@ -676,7 +676,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -676,7 +676,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
image_embeds
return
image_embeds
def
process_mm_inputs
(
self
,
**
kwargs
)
:
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
if
image_input
is
None
:
return
None
return
None
...
@@ -686,12 +686,12 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -686,12 +686,12 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
get_input_embeddings
(
def
get_input_embeddings
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
vision
_embeddings
:
Optional
[
NestedTensors
]
=
None
,
multimodal
_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
if
vision
_embeddings
is
not
None
:
if
multimodal
_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision
_embeddings
,
input_ids
,
inputs_embeds
,
multimodal
_embeddings
,
self
.
image_token_id
)
self
.
image_token_id
)
return
inputs_embeds
return
inputs_embeds
...
@@ -703,12 +703,14 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -703,12 +703,14 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
):
**
kwargs
:
object
):
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
inputs_embeds
=
None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility
elif
inputs_embeds
is
None
:
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
process_mm_inputs
(
**
kwargs
)
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
vision_embeddings
)
input_ids
=
None
input_ids
=
None
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
2f0a0a17
...
@@ -42,10 +42,12 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -42,10 +42,12 @@ from vllm.model_executor.model_loader.weight_utils import (
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
merge_multimodal_embeddings
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -371,6 +373,25 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -371,6 +373,25 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return
masked_audio_features
return
masked_audio_features
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute audio embeddings for the multimodal inputs in ``kwargs``.

    Args:
        **kwargs: Keyword arguments forwarded unchanged to
            ``self._parse_and_validate_audio_input``.

    Returns:
        ``None`` when the parser reports no audio input; otherwise the
        result of ``self._process_audio_input`` on the parsed input.
    """
    audio_input = self._parse_and_validate_audio_input(**kwargs)
    if audio_input is None:
        return None
    return self._process_audio_input(audio_input)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids``, merging in audio embeddings when given.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: Audio embeddings to merge in, or ``None``
            for text-only input.

    Returns:
        The language model's embeddings for ``input_ids``. When
        ``multimodal_embeddings`` is given, they are first passed through
        ``merge_multimodal_embeddings`` with the placeholder id
        ``self.config.audio_token_index``.
    """
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    return merge_multimodal_embeddings(
        input_ids,
        inputs_embeds,
        multimodal_embeddings,
        self.config.audio_token_index,
    )
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -378,33 +399,27 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -378,33 +399,27 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
if
audio_input
is
None
:
# NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds
=
None
# condition is for v0 compatibility.
else
:
elif
inputs_embeds
is
None
:
inputs_embeds
=
self
.
language_model
.
embed_tokens
(
input_ids
)
multimodal_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
masked_audio_features
=
self
.
_process_audio_input
(
audio_input
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
# merge llm embeddings and audio features
multimodal_embeddings
)
mask
=
(
input_ids
==
self
.
config
.
audio_token_index
)
input_ids
=
None
inputs_embeds
[
mask
,
:]
=
masked_audio_features
hidden_states
=
self
.
language_model
(
input_ids
,
input_ids
=
None
positions
,
kv_caches
,
hidden_states
=
self
.
language_model
(
attn_metadata
,
input_ids
=
input_ids
,
intermediate_tensors
,
positions
=
positions
,
inputs_embeds
=
inputs_embeds
)
kv_caches
=
kv_caches
,
attn_metadata
=
attn_metadata
,
intermediate_tensors
=
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
,
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
2f0a0a17
...
@@ -63,7 +63,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata
...
@@ -63,7 +63,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.inputs
import
(
MultiModalData
,
MultiModalDataDict
,
from
vllm.multimodal.inputs
import
(
MultiModalData
,
MultiModalDataDict
,
MultiModalKwargs
)
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.platforms
import
_Backend
from
vllm.platforms
import
_Backend
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
,
SequenceData
...
@@ -1238,6 +1238,55 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1238,6 +1238,55 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
inputs_embeds
[
mask
,
:]
=
multimodal_embeddings
inputs_embeds
[
mask
,
:]
=
multimodal_embeddings
return
inputs_embeds
return
inputs_embeds
def get_multimodal_embeddings(
        self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
    """Compute image and/or video embeddings for the inputs in ``kwargs``.

    Args:
        **kwargs: Keyword arguments forwarded unchanged to both
            ``self._parse_and_validate_image_input`` and
            ``self._parse_and_validate_video_input``.

    Returns:
        ``None`` when neither parser finds an input. Otherwise a list of
        ``(embeddings, modality)`` tuples, with the ``"image"`` entry (if
        any) before the ``"video"`` entry (if any).
    """
    image_input = self._parse_and_validate_image_input(**kwargs)
    video_input = self._parse_and_validate_video_input(**kwargs)
    if image_input is None and video_input is None:
        return None

    # We make a tuple of each embedding with its modality string. This is a
    # temporary workaround for models to handle mixed modalities when
    # get_multimodal_embeddings and get_input_embeddings are called
    # separately.
    # TODO(ywang96): Add support for mixed-modality inference for v1.
    multimodal_embeddings: List[Tuple[NestedTensors, str]] = []

    if image_input is not None:
        image_embeds = self._process_image_input(image_input)
        multimodal_embeddings.append((image_embeds, "image"))

    if video_input is not None:
        video_embeds = self._process_video_input(video_input)
        multimodal_embeddings.append((video_embeds, "video"))

    return multimodal_embeddings
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[List[Tuple[NestedTensors,
                                               str]]] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in per-modality embeddings.

    Args:
        input_ids: Token ids to embed.
        multimodal_embeddings: ``(embeddings, modality)`` tuples, where
            ``modality`` is ``"image"`` or ``"video"``; ``None`` for
            text-only input. Entries with any other modality string are
            ignored.

    Returns:
        ``self.model.get_input_embeddings(input_ids)``, with each entry
        merged in via ``self._merge_multimodal_embeddings`` using
        ``self.config.image_token_id`` or ``self.config.video_token_id``
        as the placeholder id.
    """
    inputs_embeds = self.model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    for embeddings, modality in multimodal_embeddings:
        # The modality strings are mutually exclusive, so at most one
        # branch can apply per entry.
        if modality == "image":
            inputs_embeds = self._merge_multimodal_embeddings(
                input_ids,
                inputs_embeds,
                embeddings,
                placeholder_token_id=self.config.image_token_id,
            )
        elif modality == "video":
            inputs_embeds = self._merge_multimodal_embeddings(
                input_ids,
                inputs_embeds,
                embeddings,
                placeholder_token_id=self.config.video_token_id,
            )
    return inputs_embeds
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -1245,6 +1294,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1245,6 +1294,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
"""Run forward pass for Qwen2-VL.
"""Run forward pass for Qwen2-VL.
...
@@ -1266,42 +1316,26 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1266,42 +1316,26 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed.
`None` if no videos are passed.
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
video_input
=
self
.
_parse_and_validate_video_input
(
**
kwargs
)
if
image_input
is
None
and
video_input
is
None
:
inputs_embeds
=
None
else
:
if
uses_mrope
(
self
.
config
):
assert
positions
.
ndim
==
2
and
positions
.
size
(
0
)
==
3
,
(
"multimodal section rotary embedding requires "
f
"(3, seq_len) positions, but got
{
positions
.
size
()
}
"
)
inputs_embeds
=
self
.
model
.
embed_tokens
(
input_ids
)
if
image_input
is
not
None
:
image_embeds
=
self
.
_process_image_input
(
image_input
)
inputs_embeds
=
self
.
_merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
image_embeds
,
placeholder_token_id
=
self
.
config
.
image_token_id
,
)
if
video_input
is
not
None
:
video_embeds
=
self
.
_process_video_input
(
video_input
)
inputs_embeds
=
self
.
_merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
video_embeds
,
placeholder_token_id
=
self
.
config
.
video_token_id
,
)
input_ids
=
None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif
inputs_embeds
is
None
:
multimodal_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
# We need to check for usage of mrope here in case there is
# multimodal data.
# TODO (ywang96): move this to model runner in V1.
if
multimodal_embeddings
is
not
None
and
uses_mrope
(
self
.
config
):
assert
positions
.
ndim
==
2
and
positions
.
size
(
0
)
==
3
,
(
"multimodal section rotary embedding requires "
f
"(3, seq_len) positions, but got
{
positions
.
size
()
}
"
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
multimodal_embeddings
)
input_ids
=
None
hidden_states
=
self
.
model
(
hidden_states
=
self
.
model
(
input_ids
=
input_ids
,
input_ids
=
input_ids
,
...
...
vllm/model_executor/models/ultravox.py
View file @
2f0a0a17
...
@@ -449,10 +449,36 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -449,10 +449,36 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
return
result
return
result
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
if
audio_input
is
None
:
return
None
audio_embeddings
=
self
.
_process_audio_input
(
audio_input
)
return
audio_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
attn_metadata
:
Optional
[
AttentionMetadata
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
language_model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
# TODO(ywang96): use merge_multimodal_embeddings after
# v0 is deprecated
merge_multimodal_embeddings_from_map
(
inputs_embeds
,
multimodal_embeddings
,
attn_metadata
.
multi_modal_placeholder_index_maps
[
"audio"
])
return
inputs_embeds
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
torch
.
Tensor
],
intermediate_tensors
:
Optional
[
torch
.
Tensor
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
**
kwargs
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
"""Run forward pass for Ultravox
"""Run forward pass for Ultravox
...
@@ -466,30 +492,28 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -466,30 +492,28 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
Args:
Args:
audio_features: A batch of audio inputs [B, N, 80, M].
audio_features: A batch of audio inputs [B, N, 80, M].
"""
"""
if
intermediate_tensors
is
not
None
:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
inputs_embeds
=
None
else
:
audio_input
=
self
.
_parse_and_validate_audio_input
(
**
kwargs
)
# NOTE: In v1, inputs_embeds is always generated at model runner, this
if
audio_input
is
not
None
:
# condition is for v0 compatibility.
audio_embeddings
=
self
.
_process_audio_input
(
audio_input
)
elif
inputs_embeds
is
None
:
inputs_embeds
=
self
.
language_model
.
model
.
get_input_embeddings
(
multimodal_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
input_ids
)
# TODO(ywang96): remove attn_metadata from get_input_embeddings
merge_multimodal_embeddings_from_map
(
# after v0 is deprecated
inputs_embeds
,
audio_embeddings
,
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
attn_metadata
.
multi_modal_placeholder_index_maps
[
"audio"
])
multimodal_embeddings
,
input_ids
=
None
attn_metadata
)
else
:
input_ids
=
None
inputs_embeds
=
None
hidden_states
=
self
.
language_model
.
model
(
input_ids
,
hidden_states
=
self
.
language_model
.
model
(
positions
,
input_ids
=
input_ids
,
kv_caches
,
positions
=
positions
,
attn_metadata
,
kv_caches
=
kv_caches
,
intermediate_tensors
,
attn_metadata
=
attn_metadata
,
inputs_embeds
=
inputs_embeds
)
intermediate_tensors
=
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
...
...
vllm/model_executor/models/utils.py
View file @
2f0a0a17
...
@@ -356,8 +356,7 @@ def embed_multimodal(
...
@@ -356,8 +356,7 @@ def embed_multimodal(
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
multimodal_token_id
:
int
,
multimodal_token_id
:
int
,
get_text_embeds
:
Callable
[[
torch
.
Tensor
],
torch
.
Tensor
],
get_text_embeds
:
Callable
[[
torch
.
Tensor
],
torch
.
Tensor
],
get_multimodal_embeds
:
Callable
[[
torch
.
Tensor
],
Union
[
torch
.
Tensor
,
multimodal_embeds
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]],
List
[
torch
.
Tensor
]]],
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""
"""
Embed token IDs and multimodal inputs and combine their embeddings.
Embed token IDs and multimodal inputs and combine their embeddings.
...
@@ -374,8 +373,6 @@ def embed_multimodal(
...
@@ -374,8 +373,6 @@ def embed_multimodal(
is_text
=
~
is_multimodal
is_text
=
~
is_multimodal
text_embeds
=
get_text_embeds
(
input_ids
[
is_text
])
text_embeds
=
get_text_embeds
(
input_ids
[
is_text
])
multimodal_embeds
=
get_multimodal_embeds
(
input_ids
[
is_multimodal
])
merged_embeds
=
torch
.
empty
(
merged_embeds
=
torch
.
empty
(
(
input_ids
.
shape
[
0
],
text_embeds
.
shape
[
1
]),
(
input_ids
.
shape
[
0
],
text_embeds
.
shape
[
1
]),
dtype
=
text_embeds
.
dtype
,
dtype
=
text_embeds
.
dtype
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
2f0a0a17
...
@@ -363,7 +363,8 @@ class GPUModelRunner:
...
@@ -363,7 +363,8 @@ class GPUModelRunner:
# 2. A list (length: num_images) of tensors, each of shape
# 2. A list (length: num_images) of tensors, each of shape
# [feature_size, hidden_size] in case when the feature size is
# [feature_size, hidden_size] in case when the feature size is
# dynamic depending on input images.
# dynamic depending on input images.
encoder_outputs
=
self
.
model
.
process_mm_inputs
(
**
batched_mm_inputs
)
encoder_outputs
=
self
.
model
.
get_multimodal_embeddings
(
**
batched_mm_inputs
)
# Cache the encoder outputs.
# Cache the encoder outputs.
for
(
req_id
,
input_id
),
output
in
zip
(
req_input_ids
,
encoder_outputs
):
for
(
req_id
,
input_id
),
output
in
zip
(
req_input_ids
,
encoder_outputs
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment