Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
53b3a330
Unverified
Commit
53b3a330
authored
Oct 04, 2024
by
hhzhang16
Committed by
GitHub
Oct 04, 2024
Browse files
[Bugfix] Fixes Phi3v & Ultravox Multimodal EmbeddingInputs (#8979)
parent
dac914b0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
43 additions
and
25 deletions
+43
-25
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+14
-6
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+29
-19
No files found.
vllm/model_executor/models/phi3v.py
View file @
53b3a330
...
@@ -467,9 +467,10 @@ def input_processor_for_phi3v(ctx: InputContext,
...
@@ -467,9 +467,10 @@ def input_processor_for_phi3v(ctx: InputContext,
input_height
=
h
,
input_height
=
h
,
num_crops
=
num_crops
))
num_crops
=
num_crops
))
elif
isinstance
(
image_data
,
torch
.
Tensor
):
elif
isinstance
(
image_data
,
torch
.
Tensor
):
num_images
,
image_feature_size
,
hidden_size
=
image_data
.
shape
image_feature_size
=
[
image_data
.
shape
[
0
]]
image_data
=
[
image_data
]
elif
is_list_of
(
image_data
,
torch
.
Tensor
):
elif
is_list_of
(
image_data
,
torch
.
Tensor
):
image_feature_size
=
[
item
.
shape
[
1
]
for
item
in
image_data
]
image_feature_size
=
[
item
.
shape
[
0
]
for
item
in
image_data
]
else
:
else
:
raise
TypeError
(
f
"Invalid image type:
{
type
(
image_data
)
}
"
)
raise
TypeError
(
f
"Invalid image type:
{
type
(
image_data
)
}
"
)
...
@@ -611,9 +612,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -611,9 +612,6 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
image_sizes
=
kwargs
.
pop
(
"image_sizes"
,
None
)
image_sizes
=
kwargs
.
pop
(
"image_sizes"
,
None
)
image_embeds
=
kwargs
.
pop
(
"image_embeds"
,
None
)
image_embeds
=
kwargs
.
pop
(
"image_embeds"
,
None
)
if
pixel_values
is
None
:
return
None
if
pixel_values
is
None
and
image_embeds
is
None
:
if
pixel_values
is
None
and
image_embeds
is
None
:
return
None
return
None
...
@@ -650,7 +648,17 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -650,7 +648,17 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
if
image_input
[
"type"
]
==
"image_embeds"
:
if
image_input
[
"type"
]
==
"image_embeds"
:
return
image_input
[
"data"
]
image_data
=
image_input
[
"data"
]
if
is_list_of
(
image_data
,
torch
.
Tensor
):
# it's already a list of tensors
return
image_data
if
len
(
image_data
.
shape
)
==
3
:
# 3D tensor
return
list
(
torch
.
unbind
(
image_data
,
dim
=
0
))
raise
ValueError
(
"We expect batched 2D tensors;"
"this can be either a list of 2D tensors or a single 3D tensor."
)
assert
self
.
vision_embed_tokens
is
not
None
assert
self
.
vision_embed_tokens
is
not
None
image_embeds
=
self
.
vision_embed_tokens
(
image_input
[
"data"
],
image_embeds
=
self
.
vision_embed_tokens
(
image_input
[
"data"
],
...
...
vllm/model_executor/models/ultravox.py
View file @
53b3a330
...
@@ -38,6 +38,7 @@ from vllm.multimodal.utils import (cached_get_tokenizer,
...
@@ -38,6 +38,7 @@ from vllm.multimodal.utils import (cached_get_tokenizer,
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SequenceData
)
SequenceData
)
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
vllm.transformers_utils.configs.ultravox
import
UltravoxConfig
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
...
@@ -119,6 +120,10 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
...
@@ -119,6 +120,10 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if
not
isinstance
(
data
,
list
):
if
not
isinstance
(
data
,
list
):
data
=
[
data
]
data
=
[
data
]
# If the audio inputs are embeddings, no need for preprocessing
if
is_list_of
(
data
,
torch
.
Tensor
,
check
=
"all"
):
return
MultiModalInputs
({
"audio_embeds"
:
data
})
audio_features
=
[]
audio_features
=
[]
for
audio_input
in
data
:
for
audio_input
in
data
:
if
not
isinstance
(
audio_input
,
tuple
):
if
not
isinstance
(
audio_input
,
tuple
):
...
@@ -165,25 +170,30 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs):
...
@@ -165,25 +170,30 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs):
audios
=
[
audios
]
audios
=
[
audios
]
audio_token_counts
=
[]
audio_token_counts
=
[]
for
audio_data
,
sample_rate
in
audios
:
for
audio
in
audios
:
audio_length
=
audio_data
.
shape
[
0
]
if
isinstance
(
audio
,
torch
.
Tensor
):
if
sample_rate
!=
feature_extractor
.
sampling_rate
:
audio_num_tokens
=
audio
.
shape
[
1
]
# Account for resampling.
audio_token_counts
.
append
(
audio_num_tokens
)
adjustment
=
feature_extractor
.
sampling_rate
/
sample_rate
else
:
audio_length
=
math
.
ceil
(
adjustment
*
audio_length
)
audio_data
,
sample_rate
=
audio
audio_length
=
audio_data
.
shape
[
0
]
feature_extractor_output_length
=
math
.
ceil
(
if
sample_rate
!=
feature_extractor
.
sampling_rate
:
(
audio_length
-
(
feature_extractor
.
hop_length
-
1
))
/
# Account for resampling.
feature_extractor
.
hop_length
)
adjustment
=
feature_extractor
.
sampling_rate
/
sample_rate
audio_length
=
math
.
ceil
(
adjustment
*
audio_length
)
uv_config
=
ctx
.
get_hf_config
(
UltravoxConfig
)
audio_num_tokens
=
min
(
feature_extractor_output_length
=
math
.
ceil
(
max
(
(
audio_length
-
(
feature_extractor
.
hop_length
-
1
))
/
1
,
feature_extractor
.
hop_length
)
math
.
ceil
(
feature_extractor_output_length
/
(
uv_config
.
stack_factor
*
2
))),
uv_config
=
ctx
.
get_hf_config
(
UltravoxConfig
)
get_ultravox_max_audio_tokens
(
ctx
))
audio_num_tokens
=
min
(
audio_token_counts
.
append
(
audio_num_tokens
)
max
(
1
,
math
.
ceil
(
feature_extractor_output_length
/
(
uv_config
.
stack_factor
*
2
))),
get_ultravox_max_audio_tokens
(
ctx
))
audio_token_counts
.
append
(
audio_num_tokens
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment