Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ab656f2c
Unverified
Commit ab656f2c authored Mar 18, 2025 by Cyrus Leung; committed by GitHub Mar 18, 2025
Browse files
[Bugfix] Loosen type check to avoid errors in V1 (#15021)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent
64fc2193
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing 9 changed files with 28 additions and 37 deletions
+28
-37
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+5
-7
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+3
-4
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+1
-1
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+1
-1
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+4
-2
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+6
-9
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+1
-4
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+4
-6
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+3
-3
No files found.
vllm/model_executor/models/blip2.py
View file @
ab656f2c
...
@@ -25,7 +25,7 @@ from vllm.sequence import IntermediateTensors
...
@@ -25,7 +25,7 @@ from vllm.sequence import IntermediateTensors
from
.blip
import
BlipVisionModel
from
.blip
import
BlipVisionModel
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
maybe_prefix
,
merge_multimodal_embeddings
)
# We use this internally as placeholders since there is no image token
# We use this internally as placeholders since there is no image token
...
@@ -565,12 +565,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -565,12 +565,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
return
None
return
None
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
:
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
# Remove the N dimension until multiple images are supported.
pixel_values
=
flatten_bn
(
pixel_values
,
concat
=
True
)
pixel_values
=
pixel_values
.
squeeze
(
1
)
return
Blip2ImagePixelInputs
(
return
Blip2ImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
...
@@ -578,12 +577,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -578,12 +577,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
)
)
if
image_embeds
is
not
None
:
if
image_embeds
is
not
None
:
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
if
not
isinstance
(
image_embeds
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image embeddings. "
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
f
"Got type:
{
type
(
image_embeds
)
}
"
)
# Remove the N dimension until multiple images are supported.
image_embeds
=
flatten_bn
(
image_embeds
,
concat
=
True
)
image_embeds
=
image_embeds
.
squeeze
(
1
)
return
Blip2ImageEmbeddingInputs
(
return
Blip2ImageEmbeddingInputs
(
type
=
"image_embeds"
,
type
=
"image_embeds"
,
...
...
vllm/model_executor/models/chameleon.py
View file @
ab656f2c
...
@@ -39,7 +39,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
...
@@ -39,7 +39,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
is_pp_missing_parameter
,
from
.utils
import
(
flatten_bn
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
,
merge_multimodal_embeddings
)
maybe_prefix
,
merge_multimodal_embeddings
)
...
@@ -972,12 +972,11 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -972,12 +972,11 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
if
pixel_values
is
None
:
if
pixel_values
is
None
:
return
None
return
None
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
# Remove the N dimension until multiple images are supported.
pixel_values
=
flatten_bn
(
pixel_values
,
concat
=
True
)
pixel_values
=
pixel_values
.
squeeze
(
1
)
return
ChameleonImagePixelInputs
(
return
ChameleonImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
ab656f2c
...
@@ -478,7 +478,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -478,7 +478,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
flatten_bn
(
images_spatial_crop
,
concat
=
True
)))
flatten_bn
(
images_spatial_crop
,
concat
=
True
)))
if
image_embeds
is
not
None
:
if
image_embeds
is
not
None
:
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
if
not
isinstance
(
image_embeds
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image embeddings. "
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
f
"Got type:
{
type
(
image_embeds
)
}
"
)
...
...
vllm/model_executor/models/glm4v.py
View file @
ab656f2c
...
@@ -578,7 +578,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
...
@@ -578,7 +578,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
:
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
...
...
vllm/model_executor/models/internvl.py
View file @
ab656f2c
...
@@ -838,7 +838,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -838,7 +838,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
return
None
return
None
if
image_embeds
is
not
None
:
if
image_embeds
is
not
None
:
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
if
not
isinstance
(
image_embeds
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image embeddings. "
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
f
"Got type:
{
type
(
image_embeds
)
}
"
)
...
@@ -856,7 +856,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -856,7 +856,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values_flat
)
}
"
)
f
"Got type:
{
type
(
pixel_values_flat
)
}
"
)
assert
isinstance
(
image_num_patches
,
(
torch
.
Tensor
,
list
))
if
not
isinstance
(
image_num_patches
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of image_num_patches. "
f
"Got type:
{
type
(
pixel_values_flat
)
}
"
)
return
InternVLImagePixelInputs
(
return
InternVLImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
...
...
vllm/model_executor/models/llava_next_video.py
View file @
ab656f2c
...
@@ -349,21 +349,18 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -349,21 +349,18 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
List[b, Tensor(nb_frames, nb_channels, height, width)]
List[b, Tensor(nb_frames, nb_channels, height, width)]
}
}
"""
"""
pixel_values
=
kwargs
.
pop
(
"pixel_values_videos"
,
None
)
pixel_values
_videos
=
kwargs
.
pop
(
"pixel_values_videos"
,
None
)
if
pixel_values
is
None
:
if
pixel_values
_videos
is
None
:
return
None
return
None
if
not
(
is_list_of
(
pixel_values
,
if
not
isinstance
(
pixel_values_videos
,
(
torch
.
Tensor
,
list
)):
(
torch
.
Tensor
))
# different shape videos
raise
ValueError
(
"Incorrect type of pixel_values_videos. "
or
isinstance
(
pixel_values
,
f
"Got type:
{
type
(
pixel_values_videos
)
}
"
)
torch
.
Tensor
)):
# same shape videos
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
return
LlavaNextVideoPixelInputs
(
return
LlavaNextVideoPixelInputs
(
type
=
"pixel_values_videos"
,
type
=
"pixel_values_videos"
,
data
=
pixel_values
,
data
=
pixel_values
_videos
,
)
)
def
_select_image_features
(
self
,
image_features
:
torch
.
Tensor
,
*
,
def
_select_image_features
(
self
,
image_features
:
torch
.
Tensor
,
*
,
...
...
vllm/model_executor/models/llava_onevision.py
View file @
ab656f2c
...
@@ -574,10 +574,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -574,10 +574,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
if
pixel_values_videos
is
None
:
if
pixel_values_videos
is
None
:
return
None
return
None
if
not
(
is_list_of
(
pixel_values_videos
,
if
not
isinstance
(
pixel_values_videos
,
(
torch
.
Tensor
,
list
)):
torch
.
Tensor
)
# different shape videos
or
isinstance
(
pixel_values_videos
,
torch
.
Tensor
)):
# same shape videos
raise
ValueError
(
"Incorrect type of pixel_values_videos. "
raise
ValueError
(
"Incorrect type of pixel_values_videos. "
f
"Got type:
{
type
(
pixel_values_videos
)
}
"
)
f
"Got type:
{
type
(
pixel_values_videos
)
}
"
)
...
...
vllm/model_executor/models/paligemma.py
View file @
ab656f2c
...
@@ -23,7 +23,7 @@ from vllm.sequence import IntermediateTensors
...
@@ -23,7 +23,7 @@ from vllm.sequence import IntermediateTensors
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.siglip
import
SiglipVisionModel
from
.siglip
import
SiglipVisionModel
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
maybe_prefix
,
merge_multimodal_embeddings
)
from
.vision
import
get_vision_encoder_info
from
.vision
import
get_vision_encoder_info
...
@@ -270,12 +270,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -270,12 +270,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
return
None
return
None
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
:
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
# Remove the N dimension until multiple images are supported.
pixel_values
=
flatten_bn
(
pixel_values
,
concat
=
True
)
pixel_values
=
pixel_values
.
squeeze
(
1
)
return
PaliGemmaImagePixelInputs
(
return
PaliGemmaImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
...
@@ -287,8 +286,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -287,8 +286,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
raise
ValueError
(
"Incorrect type of image embeddings. "
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
f
"Got type:
{
type
(
image_embeds
)
}
"
)
# Remove the N dimension until multiple images are supported.
image_embeds
=
flatten_bn
(
image_embeds
,
concat
=
True
)
image_embeds
=
image_embeds
.
squeeze
(
1
)
return
PaliGemmaImageEmbeddingInputs
(
return
PaliGemmaImageEmbeddingInputs
(
type
=
"image_embeds"
,
type
=
"image_embeds"
,
...
...
vllm/model_executor/models/qwen_vl.py
View file @
ab656f2c
...
@@ -711,7 +711,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
...
@@ -711,7 +711,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
image_embeds
=
kwargs
.
pop
(
"image_embeds"
,
None
)
image_embeds
=
kwargs
.
pop
(
"image_embeds"
,
None
)
if
pixel_values
is
not
None
:
if
pixel_values
is
not
None
:
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of pixel values. "
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
f
"Got type:
{
type
(
pixel_values
)
}
"
)
...
@@ -722,13 +722,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
...
@@ -722,13 +722,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
)
)
if
image_embeds
is
not
None
:
if
image_embeds
is
not
None
:
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
if
not
isinstance
(
image_embeds
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image embeddings. "
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
f
"Got type:
{
type
(
image_embeds
)
}
"
)
return
QwenImageEmbeddingInputs
(
return
QwenImageEmbeddingInputs
(
type
=
"image_embeds"
,
type
=
"image_embeds"
,
data
=
flatten_bn
(
image_embeds
),
data
=
flatten_bn
(
image_embeds
,
concat
=
True
),
)
)
return
None
return
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment