Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0a4d9685
Unverified
Commit
0a4d9685
authored
Nov 26, 2024
by
Roger Wang
Committed by
GitHub
Nov 27, 2024
Browse files
[V1] Update interface for idefics3 (#10680)
Signed-off-by:
Roger Wang
<
ywang@roblox.com
>
parent
0a71900b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
27 deletions
+46
-27
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+46
-27
No files found.
vllm/model_executor/models/idefics3.py
View file @
0a4d9685
...
...
@@ -39,6 +39,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.transformers_utils.processor
import
cached_get_processor
from
vllm.utils
import
is_list_of
...
...
@@ -597,6 +598,12 @@ class Idefics3Model(nn.Module):
image_features
=
self
.
_process_image_pixels
(
image_input
)
return
self
.
connector
(
image_features
)
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
return
self
.
text_model
.
get_input_embeddings
(
input_ids
)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
...
...
@@ -604,26 +611,8 @@ class Idefics3Model(nn.Module):
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
**
kwargs
:
object
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
if
intermediate_tensors
is
not
None
:
input_ids
=
None
inputs_embeds
=
None
else
:
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
not
None
:
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
inputs_embeds
=
self
.
text_model
.
get_input_embeddings
(
input_ids
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
vision_embeddings
,
self
.
image_token_id
)
else
:
inputs_embeds
=
self
.
text_model
.
get_input_embeddings
(
input_ids
)
input_ids
=
None
hidden_states
=
self
.
text_model
(
input_ids
,
...
...
@@ -718,6 +707,25 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
logits_processor
=
LogitsProcessor
(
config
.
text_config
.
vocab_size
)
self
.
sampler
=
Sampler
()
def
get_multimodal_embeddings
(
self
,
**
kwargs
)
->
Optional
[
NestedTensors
]:
image_input
=
self
.
model
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
model
.
_process_image_input
(
image_input
)
return
vision_embeddings
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
,
multimodal_embeddings
:
Optional
[
NestedTensors
]
=
None
,
)
->
torch
.
Tensor
:
inputs_embeds
=
self
.
model
.
get_input_embeddings
(
input_ids
)
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
config
.
image_token_id
)
return
inputs_embeds
def
forward
(
self
,
input_ids
:
torch
.
Tensor
,
...
...
@@ -725,16 +733,27 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
**
kwargs
:
object
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
,
**
kwargs
,
)
if
intermediate_tensors
is
not
None
:
inputs_embeds
=
None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif
inputs_embeds
is
None
:
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
input_ids
=
None
hidden_states
=
self
.
model
.
text_model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
)
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment