Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2c0d9335
Unverified
Commit
2c0d9335
authored
Jun 10, 2024
by
Cyrus Leung
Committed by
GitHub
Jun 10, 2024
Browse files
[Bugfix] Fix LLaVA-NeXT (#5380)
parent
774d1035
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
1 deletion
+25
-1
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+24
-0
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+1
-1
No files found.
vllm/model_executor/models/llava_next.py
View file @
2c0d9335
...
@@ -216,6 +216,30 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
...
@@ -216,6 +216,30 @@ class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
return
None
return
None
def
_select_image_features
(
self
,
image_features
:
torch
.
Tensor
,
*
,
strategy
:
str
)
->
torch
.
Tensor
:
# Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa
if
strategy
==
"default"
:
return
image_features
[:,
1
:]
elif
strategy
==
"full"
:
return
image_features
raise
ValueError
(
f
"Unexpected select feature strategy:
{
strategy
}
"
)
def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
                              pixel_values: torch.Tensor) -> torch.Tensor:
    """Run the vision tower on ``pixel_values`` and select its features.

    Args:
        vision_tower: Vision model invoked with
            ``output_hidden_states=True``; its ``device`` attribute decides
            where ``pixel_values`` is moved before the forward pass.
        pixel_values: Input image tensor (layout not shown here --
            presumably whatever the vision tower expects; confirm with
            the caller).

    Returns:
        The hidden state at index ``self.config.vision_feature_layer``,
        filtered by ``self._select_image_features`` using
        ``self.config.vision_feature_select_strategy``.
    """
    # TODO(xwjiang): Maybe port minimal CLIPVisionModel over.
    # Move the input onto the tower's device and request all hidden states
    # so that an intermediate layer can be picked below.
    image_outputs = vision_tower(pixel_values.to(vision_tower.device),
                                 output_hidden_states=True)

    image_features = image_outputs.hidden_states[
        self.config.vision_feature_layer]

    return self._select_image_features(
        image_features,
        strategy=self.config.vision_feature_select_strategy,
    )
def
_merge_image_patch_embeddings
(
self
,
image_size
:
torch
.
Tensor
,
def
_merge_image_patch_embeddings
(
self
,
image_size
:
torch
.
Tensor
,
patch_embeddings
:
torch
.
Tensor
,
*
,
patch_embeddings
:
torch
.
Tensor
,
*
,
strategy
:
str
)
->
torch
.
Tensor
:
strategy
:
str
)
->
torch
.
Tensor
:
...
...
vllm/multimodal/utils.py
View file @
2c0d9335
...
@@ -77,7 +77,7 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
...
@@ -77,7 +77,7 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
"""Combine image and text prompts for vision language model depending on
"""Combine image and text prompts for vision language model depending on
the model architecture."""
the model architecture."""
if
config
.
hf_config
.
model_type
==
"llava"
:
if
config
.
hf_config
.
model_type
in
(
"llava"
,
"llava_next"
)
:
full_prompt
=
f
"
{
image_prompt
}
\n
{
text_prompt
}
"
full_prompt
=
f
"
{
image_prompt
}
\n
{
text_prompt
}
"
else
:
else
:
raise
ValueError
(
raise
ValueError
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment