Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
df76e5af
Unverified
Commit
df76e5af
authored
Jan 22, 2025
by
Cyrus Leung
Committed by
GitHub
Jan 21, 2025
Browse files
[VLM] Simplify post-processing of replacement info (#12269)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
09ccc9c8
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
175 additions
and
208 deletions
+175
-208
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+1
-1
tests/models/registry.py
tests/models/registry.py
+2
-1
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+23
-19
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-8
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+8
-25
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+13
-29
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+11
-27
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+17
-28
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+14
-29
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+84
-41
No files found.
tests/models/multimodal/processing/test_common.py
View file @
df76e5af
...
@@ -35,7 +35,7 @@ def _test_processing_correctness(
...
@@ -35,7 +35,7 @@ def _test_processing_correctness(
task
=
"auto"
,
task
=
"auto"
,
tokenizer
=
model_id
,
tokenizer
=
model_id
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
Tru
e
,
trust_remote_code
=
model_info
.
trust_remote_cod
e
,
seed
=
0
,
seed
=
0
,
dtype
=
"float16"
,
dtype
=
"float16"
,
revision
=
None
,
revision
=
None
,
...
...
tests/models/registry.py
View file @
df76e5af
...
@@ -261,7 +261,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -261,7 +261,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_3"
),
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_3"
,
trust_remote_code
=
True
),
# [Encoder-decoder]
# [Encoder-decoder]
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
# noqa: E501
...
...
tests/multimodal/test_processing.py
View file @
df76e5af
...
@@ -7,12 +7,16 @@ import pytest
...
@@ -7,12 +7,16 @@ import pytest
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
(
PlaceholderInfo
,
PromptReplacement
,
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.multimodal.processing
import
(
PlaceholderFeaturesInfo
,
PromptReplacement
,
find_mm_placeholders
,
find_mm_placeholders
,
find_text_matches
,
find_token_matches
,
find_text_matches
,
find_token_matches
,
iter_token_matches
,
iter_token_matches
,
replace_text_matches
,
replace_text_matches
,
replace_token_matches
)
replace_token_matches
)
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
...
@@ -433,19 +437,19 @@ def test_find_replace_tokens(
...
@@ -433,19 +437,19 @@ def test_find_replace_tokens(
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
{
{
"pattern_1"
:
[
"pattern_1"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
6
,
start_idx
=
6
,
replacem
en
t
=
[
32000
,
32000
],
tok
en
s
=
[
32000
,
32000
],
),
),
],
],
"pattern_4"
:
[
"pattern_4"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_4"
,
modality
=
"pattern_4"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
3
,
start_idx
=
3
,
replacem
en
t
=
[
32000
],
tok
en
s
=
[
32000
],
),
),
],
],
}
}
...
@@ -455,25 +459,25 @@ def test_find_replace_tokens(
...
@@ -455,25 +459,25 @@ def test_find_replace_tokens(
[
1
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
1550
,
918
,
1550
],
[
1
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
1550
,
918
,
1550
],
{
{
"pattern_1"
:
[
"pattern_1"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
1
,
start_idx
=
1
,
replacem
en
t
=
[
32000
,
32000
],
tok
en
s
=
[
32000
,
32000
],
),
),
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
1
,
item_idx
=
1
,
start_idx
=
5
,
start_idx
=
5
,
replacem
en
t
=
[
32000
,
32000
],
tok
en
s
=
[
32000
,
32000
],
),
),
],
],
"pattern_3"
:
[
"pattern_3"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_3"
,
modality
=
"pattern_3"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
7
,
start_idx
=
7
,
replacem
en
t
=
[
1550
,
918
,
1550
],
tok
en
s
=
[
1550
,
918
,
1550
],
),
),
],
],
# No match for pattern_4 as it has lower priority than pattern_1
# No match for pattern_4 as it has lower priority than pattern_1
...
@@ -483,33 +487,33 @@ def test_find_replace_tokens(
...
@@ -483,33 +487,33 @@ def test_find_replace_tokens(
[
1
,
32000
,
32000
,
32000
,
32000
,
32000
,
1550
,
918
,
1550
],
[
1
,
32000
,
32000
,
32000
,
32000
,
32000
,
1550
,
918
,
1550
],
{
{
"pattern_1"
:
[
"pattern_1"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
1
,
start_idx
=
1
,
replacem
en
t
=
[
32000
,
32000
],
tok
en
s
=
[
32000
,
32000
],
),
),
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_1"
,
modality
=
"pattern_1"
,
item_idx
=
1
,
item_idx
=
1
,
start_idx
=
3
,
start_idx
=
3
,
replacem
en
t
=
[
32000
,
32000
],
tok
en
s
=
[
32000
,
32000
],
),
),
],
],
"pattern_4"
:
[
"pattern_4"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_4"
,
modality
=
"pattern_4"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
5
,
start_idx
=
5
,
replacem
en
t
=
[
32000
],
tok
en
s
=
[
32000
],
),
),
],
],
"pattern_3"
:
[
"pattern_3"
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
"pattern_3"
,
modality
=
"pattern_3"
,
item_idx
=
0
,
item_idx
=
0
,
start_idx
=
6
,
start_idx
=
6
,
replacem
en
t
=
[
1550
,
918
,
1550
],
tok
en
s
=
[
1550
,
918
,
1550
],
),
),
],
],
}
}
...
...
vllm/model_executor/models/aria.py
View file @
df76e5af
...
@@ -342,13 +342,7 @@ class AriaProcessingInfo(BaseProcessingInfo):
...
@@ -342,13 +342,7 @@ class AriaProcessingInfo(BaseProcessingInfo):
return
self
.
get_hf_config
().
vision_config
return
self
.
get_hf_config
().
vision_config
def
get_hf_processor
(
self
):
def
get_hf_processor
(
self
):
processor
=
self
.
ctx
.
get_hf_processor
(
AriaProcessor
)
return
self
.
ctx
.
get_hf_processor
(
AriaProcessor
)
# Patch for https://github.com/huggingface/transformers/issues/35768
processor
.
tokenizer
.
image_token
=
"<|img|>"
processor
.
image_token
=
"<|img|>"
return
processor
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
return
{
"image"
:
None
}
...
@@ -381,7 +375,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
...
@@ -381,7 +375,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
}
}
hf_processor
=
self
.
info
.
get_hf_processor
()
hf_processor
=
self
.
info
.
get_hf_processor
()
image_token
:
str
=
hf_processor
.
image_token
# type: ignore
image_token
:
str
=
hf_processor
.
tokenizer
.
image_token
# type: ignore
return
ProcessorInputs
(
return
ProcessorInputs
(
prompt_text
=
image_token
*
num_images
,
prompt_text
=
image_token
*
num_images
,
...
...
vllm/model_executor/models/blip2.py
View file @
df76e5af
...
@@ -14,12 +14,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -14,12 +14,12 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
NestedTensors
,
PlaceholderRange
)
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacementDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -481,30 +481,13 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
...
@@ -481,30 +481,13 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
PromptReplacement
(
PromptReplacement
(
modality
=
"image"
,
modality
=
"image"
,
target
=
"</s>"
,
target
=
"</s>"
,
replacement
=
"<image>"
*
num_image_tokens
+
"</s>"
,
replacement
=
PromptReplacementDetails
(
full
=
"<image>"
*
num_image_tokens
+
"</s>"
,
features
=
"<image>"
*
num_image_tokens
,
),
)
)
]
]
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
MultiModalInputs
:
result
=
super
().
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Only <image> tokens should be considered as placeholders,
# so we ignore the trailing bos_token
result
[
"mm_placeholders"
]
=
{
modality
:
[
PlaceholderRange
(
offset
=
p
[
"offset"
],
length
=
p
[
"length"
]
-
1
)
for
p
in
ps
]
for
modality
,
ps
in
result
[
"mm_placeholders"
].
items
()
}
return
result
@
MULTIMODAL_REGISTRY
.
register_processor
(
Blip2MultiModalProcessor
,
@
MULTIMODAL_REGISTRY
.
register_processor
(
Blip2MultiModalProcessor
,
info
=
Blip2ProcessingInfo
,
info
=
Blip2ProcessingInfo
,
...
...
vllm/model_executor/models/chameleon.py
View file @
df76e5af
...
@@ -28,12 +28,12 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -28,12 +28,12 @@ from vllm.model_executor.model_loader.weight_utils import (
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
NestedTensors
,
PlaceholderRange
)
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.parse
import
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacementDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -141,39 +141,23 @@ class ChameleonMultiModalProcessor(
...
@@ -141,39 +141,23 @@ class ChameleonMultiModalProcessor(
out_mm_kwargs
:
MultiModalKwargs
,
out_mm_kwargs
:
MultiModalKwargs
,
)
->
list
[
PromptReplacement
]:
)
->
list
[
PromptReplacement
]:
processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
image_tokens
=
processor
.
image_token
*
self
.
info
.
get_num_image_tokens
()
return
[
return
[
PromptReplacement
(
PromptReplacement
(
modality
=
"image"
,
modality
=
"image"
,
target
=
"<image>"
,
target
=
"<image>"
,
replacement
=
""
.
join
([
replacement
=
PromptReplacementDetails
(
processor
.
image_start_token
,
full
=
""
.
join
([
processor
.
image_token
*
self
.
info
.
get_num_image_tokens
(),
processor
.
image_start_token
,
processor
.
image_end_token
,
image_tokens
,
]),
processor
.
image_end_token
,
]),
features
=
image_tokens
,
),
)
)
]
]
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
MultiModalInputs
:
result
=
super
().
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Only <image> tokens should be considered as placeholders,
# so we ignore the image_start_token and image_end_token
result
[
"mm_placeholders"
]
=
{
modality
:
[
PlaceholderRange
(
offset
=
p
[
"offset"
]
+
1
,
length
=
p
[
"length"
]
-
2
)
for
p
in
ps
]
for
modality
,
ps
in
result
[
"mm_placeholders"
].
items
()
}
return
result
class
ChameleonLayerNorm
(
nn
.
LayerNorm
):
class
ChameleonLayerNorm
(
nn
.
LayerNorm
):
...
...
vllm/model_executor/models/fuyu.py
View file @
df76e5af
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
""" PyTorch Fuyu model."""
""" PyTorch Fuyu model."""
import
math
import
math
from
typing
import
(
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
from
typing
import
(
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
Set
,
Tuple
,
TypedDict
,
Union
)
TypedDict
)
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
...
@@ -30,13 +30,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput
...
@@ -30,13 +30,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
NestedTensors
,
PlaceholderRange
)
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
MultiModalDataItems
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacementDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -215,9 +215,13 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
...
@@ -215,9 +215,13 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
image_width
=
image_size
.
width
,
image_width
=
image_size
.
width
,
image_height
=
image_size
.
height
,
image_height
=
image_size
.
height
,
)
)
image_tokens
=
([
_IMAGE_TOKEN_ID
]
*
ncols
+
[
_NEWLINE_TOKEN_ID
])
*
nrows
return
(([
_IMAGE_TOKEN_ID
]
*
ncols
+
[
_NEWLINE_TOKEN_ID
])
*
nrows
+
return
PromptReplacementDetails
(
[
bos_token_id
])
full
=
image_tokens
+
[
bos_token_id
],
features
=
image_tokens
,
)
return
[
return
[
PromptReplacement
(
PromptReplacement
(
...
@@ -227,26 +231,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
...
@@ -227,26 +231,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
)
)
]
]
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
MultiModalInputs
:
result
=
super
().
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Only |SPEAKER| (image) tokens should be considered as placeholders,
# so we ignore the trailing bos_token_id
result
[
"mm_placeholders"
]
=
{
modality
:
[
PlaceholderRange
(
offset
=
p
[
"offset"
],
length
=
p
[
"length"
]
-
1
)
for
p
in
ps
]
for
modality
,
ps
in
result
[
"mm_placeholders"
].
items
()
}
return
result
@
MULTIMODAL_REGISTRY
.
register_processor
(
FuyuMultiModalProcessor
,
@
MULTIMODAL_REGISTRY
.
register_processor
(
FuyuMultiModalProcessor
,
info
=
FuyuProcessingInfo
,
info
=
FuyuProcessingInfo
,
...
...
vllm/model_executor/models/phi3v.py
View file @
df76e5af
...
@@ -30,15 +30,19 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -30,15 +30,19 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
NestedTensors
,
PlaceholderRange
)
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
ImageSize
,
MultiModalDataItems
)
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
BaseProcessingInfo
,
BoundPromptReplacement
,
BoundPromptReplacement
,
PlaceholderInfo
,
PromptReplacement
)
PlaceholderFeaturesInfo
,
PromptReplacement
,
PromptReplacementDetails
)
# yapf: enable
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
vllm.utils
import
is_list_of
...
@@ -437,7 +441,12 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
...
@@ -437,7 +441,12 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
processor
=
hf_processor
,
processor
=
hf_processor
,
)
)
return
[
_IMAGE_TOKEN_ID
]
*
num_image_tokens
+
[
bos_token_id
]
image_tokens
=
[
_IMAGE_TOKEN_ID
]
*
num_image_tokens
return
PromptReplacementDetails
(
full
=
image_tokens
+
[
bos_token_id
],
features
=
image_tokens
,
)
num_images
=
mm_items
.
get_count
(
"image"
,
strict
=
False
)
num_images
=
mm_items
.
get_count
(
"image"
,
strict
=
False
)
...
@@ -454,7 +463,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
...
@@ -454,7 +463,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
token_ids
:
list
[
int
],
token_ids
:
list
[
int
],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
PlaceholderInfo
]]]:
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
Placeholder
Features
Info
]]]:
token_ids
,
text
,
placeholders
=
super
().
_apply_prompt_replacements
(
token_ids
,
text
,
placeholders
=
super
().
_apply_prompt_replacements
(
token_ids
=
token_ids
,
token_ids
=
token_ids
,
mm_prompt_repls
=
mm_prompt_repls
,
mm_prompt_repls
=
mm_prompt_repls
,
...
@@ -467,11 +476,11 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
...
@@ -467,11 +476,11 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
token_ids
=
[
token_ids
[
0
],
*
token_ids
[
2
:]]
token_ids
=
[
token_ids
[
0
],
*
token_ids
[
2
:]]
placeholders
=
{
placeholders
=
{
modality
:
[
modality
:
[
PlaceholderInfo
(
Placeholder
Features
Info
(
modality
=
p
.
modality
,
modality
=
p
.
modality
,
item_idx
=
p
.
item_idx
,
item_idx
=
p
.
item_idx
,
start_idx
=
p
.
start_idx
-
1
,
start_idx
=
p
.
start_idx
-
1
,
replacem
en
t
=
p
.
replacem
en
t
,
tok
en
s
=
p
.
tok
en
s
,
)
for
p
in
ps
)
for
p
in
ps
]
]
for
modality
,
ps
in
placeholders
.
items
()
for
modality
,
ps
in
placeholders
.
items
()
...
@@ -479,26 +488,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
...
@@ -479,26 +488,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
return
token_ids
,
text
,
placeholders
return
token_ids
,
text
,
placeholders
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
MultiModalInputs
:
result
=
super
().
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Only <|image|> tokens should be considered as placeholders,
# so we ignore the trailing bos_token_id
result
[
"mm_placeholders"
]
=
{
modality
:
[
PlaceholderRange
(
offset
=
p
[
"offset"
],
length
=
p
[
"length"
]
-
1
)
for
p
in
ps
]
for
modality
,
ps
in
result
[
"mm_placeholders"
].
items
()
}
return
result
@
MULTIMODAL_REGISTRY
.
register_processor
(
Phi3VMultiModalProcessor
,
@
MULTIMODAL_REGISTRY
.
register_processor
(
Phi3VMultiModalProcessor
,
info
=
Phi3VProcessingInfo
,
info
=
Phi3VProcessingInfo
,
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
df76e5af
...
@@ -36,13 +36,13 @@ from vllm.config import VllmConfig
...
@@ -36,13 +36,13 @@ from vllm.config import VllmConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalInputs
,
MultiModalKwargs
,
NestedTensors
)
NestedTensors
,
PlaceholderRange
)
from
vllm.multimodal.parse
import
(
AudioProcessorItems
,
MultiModalDataItems
,
from
vllm.multimodal.parse
import
(
AudioProcessorItems
,
MultiModalDataItems
,
MultiModalDataParser
)
MultiModalDataParser
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacementDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -216,11 +216,16 @@ class Qwen2AudioMultiModalProcessor(
...
@@ -216,11 +216,16 @@ class Qwen2AudioMultiModalProcessor(
f
"The audio
{
audio
}
(len=
{
len
(
audio
)
}
) is too short "
f
"The audio
{
audio
}
(len=
{
len
(
audio
)
}
) is too short "
"to be represented inside the model"
)
"to be represented inside the model"
)
return
""
.
join
([
audio_tokens
=
audio_token
*
num_placeholders
audio_bos_token
,
audio_token
*
num_placeholders
,
return
PromptReplacementDetails
(
audio_eos_token
,
full
=
""
.
join
([
])
audio_bos_token
,
audio_tokens
,
audio_eos_token
,
]),
features
=
audio_tokens
,
)
return
[
return
[
PromptReplacement
(
PromptReplacement
(
...
@@ -240,26 +245,6 @@ class Qwen2AudioMultiModalProcessor(
...
@@ -240,26 +245,6 @@ class Qwen2AudioMultiModalProcessor(
# tokens than the number of audio items)
# tokens than the number of audio items)
return
not
hasattr
(
self
.
info
.
get_hf_processor
(),
"audio_token"
)
return
not
hasattr
(
self
.
info
.
get_hf_processor
(),
"audio_token"
)
def
apply
(
self
,
prompt
:
Union
[
str
,
list
[
int
]],
mm_data
:
MultiModalDataDict
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
MultiModalInputs
:
result
=
super
().
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Only <|AUDIO|> tokens should be considered as placeholders,
# so we ignore the audio_bos_token and audio_eos_token
result
[
"mm_placeholders"
]
=
{
modality
:
[
PlaceholderRange
(
offset
=
p
[
"offset"
]
+
1
,
length
=
p
[
"length"
]
-
2
)
for
p
in
ps
]
for
modality
,
ps
in
result
[
"mm_placeholders"
].
items
()
}
return
result
@
MULTIMODAL_REGISTRY
.
register_processor
(
@
MULTIMODAL_REGISTRY
.
register_processor
(
Qwen2AudioMultiModalProcessor
,
Qwen2AudioMultiModalProcessor
,
...
...
vllm/multimodal/processing.py
View file @
df76e5af
import
re
import
re
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
collections
import
defaultdict
from
collections
import
defaultdict
from
collections.abc
import
Callable
,
ItemsView
,
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
(
Callable
,
Generator
,
ItemsView
,
Iterable
,
Mapping
,
Sequence
)
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
functools
import
lru_cache
from
functools
import
lru_cache
from
typing
import
(
TYPE_CHECKING
,
Generic
,
NamedTuple
,
Optional
,
Protocol
,
from
typing
import
(
TYPE_CHECKING
,
Generic
,
NamedTuple
,
Optional
,
Protocol
,
...
@@ -31,6 +32,24 @@ _S = TypeVar("_S", str, list[int])
...
@@ -31,6 +32,24 @@ _S = TypeVar("_S", str, list[int])
_PromptSeq
=
Union
[
str
,
list
[
int
]]
_PromptSeq
=
Union
[
str
,
list
[
int
]]
@
dataclass
class
PromptReplacementDetails
:
full
:
_PromptSeq
"""The full replacement."""
features
:
_PromptSeq
"""
The part of the replacement that corresponds to placeholder feature tokens.
"""
@
staticmethod
def
from_seq
(
seq
:
_PromptSeq
):
return
PromptReplacementDetails
(
full
=
seq
,
features
=
seq
)
_PromptRepl
=
Union
[
_PromptSeq
,
PromptReplacementDetails
]
@
dataclass
@
dataclass
class
PromptReplacement
:
class
PromptReplacement
:
"""
"""
...
@@ -43,8 +62,8 @@ class PromptReplacement:
...
@@ -43,8 +62,8 @@ class PromptReplacement:
target
:
_PromptSeq
target
:
_PromptSeq
"""The token sequence (or text) to find and replace."""
"""The token sequence (or text) to find and replace."""
replacement
:
Union
[
Callable
[[
int
],
_Prompt
Seq
],
replacement
:
Union
[
Callable
[[
int
],
_Prompt
Repl
],
_Prompt
Seq
]
=
field
(
repr
=
False
)
_Prompt
Repl
]
=
field
(
repr
=
False
)
"""
"""
Given the index of the processed item within :attr:`modality`,
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
output the replacement token sequence (or text).
...
@@ -112,6 +131,14 @@ class _BoundPromptSequence:
...
@@ -112,6 +131,14 @@ class _BoundPromptSequence:
_text
:
Optional
[
str
]
_text
:
Optional
[
str
]
_token_ids
:
Optional
[
list
[
int
]]
_token_ids
:
Optional
[
list
[
int
]]
@
staticmethod
def
from_seq
(
tokenizer
:
AnyTokenizer
,
seq
:
_PromptSeq
):
return
_BoundPromptSequence
(
tokenizer
=
tokenizer
,
_text
=
seq
if
isinstance
(
seq
,
str
)
else
None
,
_token_ids
=
seq
if
isinstance
(
seq
,
list
)
else
None
,
)
def
__post_init__
(
self
)
->
None
:
def
__post_init__
(
self
)
->
None
:
if
self
.
_text
is
None
and
self
.
_token_ids
is
None
:
if
self
.
_text
is
None
and
self
.
_token_ids
is
None
:
raise
ValueError
(
"At least one of 'text' and 'token_ids' must be "
raise
ValueError
(
"At least one of 'text' and 'token_ids' must be "
...
@@ -134,6 +161,12 @@ class _BoundPromptSequence:
...
@@ -134,6 +161,12 @@ class _BoundPromptSequence:
return
self
.
_token_ids
return
self
.
_token_ids
@
dataclass
class
_BoundPromptReplacementGroup
:
full
:
_BoundPromptSequence
features
:
_BoundPromptSequence
@
dataclass
@
dataclass
class
BoundPromptReplacement
:
class
BoundPromptReplacement
:
"""
"""
...
@@ -145,24 +178,18 @@ class BoundPromptReplacement:
...
@@ -145,24 +178,18 @@ class BoundPromptReplacement:
modality
:
str
modality
:
str
_target
:
_PromptSeq
_target
:
_PromptSeq
_replacement
:
Union
[
Callable
[[
int
],
_Prompt
Seq
],
_replacement
:
Union
[
Callable
[[
int
],
_Prompt
Repl
],
_Prompt
Seq
]
=
field
(
repr
=
False
)
_Prompt
Repl
]
=
field
(
repr
=
False
)
def
__post_init__
(
self
)
->
None
:
def
__post_init__
(
self
)
->
None
:
self
.
_replacement_cache
=
dict
[
int
,
_BoundPrompt
Sequence
]()
self
.
_replacement_cache
=
dict
[
int
,
_BoundPrompt
ReplacementGroup
]()
@
property
@
property
def
target
(
self
)
->
_BoundPromptSequence
:
def
target
(
self
)
->
_BoundPromptSequence
:
"""The token sequence (or text) to find and replace."""
"""The token sequence (or text) to find and replace."""
target
=
self
.
_target
return
_BoundPromptSequence
.
from_seq
(
self
.
tokenizer
,
self
.
_target
)
return
_BoundPromptSequence
(
def
get_replacement
(
self
,
item_idx
:
int
)
->
_BoundPromptReplacementGroup
:
tokenizer
=
self
.
tokenizer
,
_text
=
target
if
isinstance
(
target
,
str
)
else
None
,
_token_ids
=
target
if
isinstance
(
target
,
list
)
else
None
,
)
def
get_replacement
(
self
,
item_idx
:
int
)
->
_BoundPromptSequence
:
"""
"""
Given the index of the processed item within :attr:`modality`,
Given the index of the processed item within :attr:`modality`,
output the replacement token sequence (or text).
output the replacement token sequence (or text).
...
@@ -177,10 +204,16 @@ class BoundPromptReplacement:
...
@@ -177,10 +204,16 @@ class BoundPromptReplacement:
else
:
else
:
cache_key
=
None
cache_key
=
None
bound_replacement
=
_BoundPromptSequence
(
if
not
isinstance
(
replacement
,
PromptReplacementDetails
):
tokenizer
=
self
.
tokenizer
,
replacement
=
PromptReplacementDetails
.
from_seq
(
replacement
)
_text
=
replacement
if
isinstance
(
replacement
,
str
)
else
None
,
_token_ids
=
replacement
if
isinstance
(
replacement
,
list
)
else
None
,
bound_full
=
_BoundPromptSequence
.
from_seq
(
self
.
tokenizer
,
replacement
.
full
)
bound_features
=
_BoundPromptSequence
.
from_seq
(
self
.
tokenizer
,
replacement
.
features
)
bound_replacement
=
_BoundPromptReplacementGroup
(
full
=
bound_full
,
features
=
bound_features
,
)
)
if
cache_key
is
not
None
:
if
cache_key
is
not
None
:
...
@@ -197,7 +230,7 @@ class _TokenMatch(NamedTuple):
...
@@ -197,7 +230,7 @@ class _TokenMatch(NamedTuple):
def
iter_token_matches
(
def
iter_token_matches
(
token_ids
:
list
[
int
],
token_ids
:
list
[
int
],
match_ids
:
list
[
int
],
match_ids
:
list
[
int
],
)
->
Iterable
[
_TokenMatch
]:
)
->
Generator
[
_TokenMatch
]:
"""
"""
Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
...
@@ -272,15 +305,15 @@ class _PromptReplacementTextMatch(_PromptReplacementMatch):
...
@@ -272,15 +305,15 @@ class _PromptReplacementTextMatch(_PromptReplacementMatch):
@
dataclass
@
dataclass
class
PlaceholderInfo
:
class
Placeholder
Features
Info
:
modality
:
str
modality
:
str
item_idx
:
int
item_idx
:
int
start_idx
:
int
start_idx
:
int
replacem
en
t
:
list
[
int
]
tok
en
s
:
list
[
int
]
@
property
@
property
def
length
(
self
)
->
int
:
def
length
(
self
)
->
int
:
return
len
(
self
.
replacem
en
t
)
return
len
(
self
.
tok
en
s
)
def
to_range
(
self
)
->
PlaceholderRange
:
def
to_range
(
self
)
->
PlaceholderRange
:
return
PlaceholderRange
(
return
PlaceholderRange
(
...
@@ -362,10 +395,10 @@ def _replace_matches(
...
@@ -362,10 +395,10 @@ def _replace_matches(
replacement
=
repl_info
.
get_replacement
(
item_idx
)
replacement
=
repl_info
.
get_replacement
(
item_idx
)
if
isinstance
(
prompt
,
str
):
if
isinstance
(
prompt
,
str
):
repl_seq
=
replacement
.
text
repl_seq
=
replacement
.
full
.
text
out_seqs
.
append
(
prompt
[
prev_end_idx
:
start_idx
]
+
repl_seq
)
out_seqs
.
append
(
prompt
[
prev_end_idx
:
start_idx
]
+
repl_seq
)
else
:
else
:
repl_seq
=
replacement
.
token_ids
repl_seq
=
replacement
.
full
.
token_ids
out_seqs
.
append
(
prompt
[
prev_end_idx
:
start_idx
]
+
repl_seq
)
out_seqs
.
append
(
prompt
[
prev_end_idx
:
start_idx
]
+
repl_seq
)
prev_end_idx
=
end_idx
prev_end_idx
=
end_idx
...
@@ -408,7 +441,7 @@ def _iter_placeholders(
...
@@ -408,7 +441,7 @@ def _iter_placeholders(
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
prompt
:
list
[
int
],
prompt
:
list
[
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
)
->
Iterable
[
PlaceholderInfo
]:
)
->
Iterable
[
Placeholder
Features
Info
]:
"""
"""
Yield each set of placeholder tokens found in :code:`prompt`.
Yield each set of placeholder tokens found in :code:`prompt`.
...
@@ -432,23 +465,33 @@ def _iter_placeholders(
...
@@ -432,23 +465,33 @@ def _iter_placeholders(
for
repl_info
in
modality_repls
:
for
repl_info
in
modality_repls
:
replacement
=
repl_info
.
get_replacement
(
item_idx
)
replacement
=
repl_info
.
get_replacement
(
item_idx
)
repl_tokens
=
replacement
.
token_ids
repl_tokens
_full
=
replacement
.
full
.
token_ids
repl_len
=
len
(
repl_tokens
)
repl_len
_full
=
len
(
repl_tokens
_full
)
end_idx
=
start_idx
+
repl_len
end_idx
_full
=
start_idx
+
repl_len
_full
if
repl_len
==
0
or
end_idx
>
prompt_len
:
if
repl_len
_full
==
0
or
end_idx
_full
>
prompt_len
:
continue
continue
if
prompt
[
start_idx
:
end_idx
]
==
repl_tokens
:
if
prompt
[
start_idx
:
end_idx_full
]
==
repl_tokens_full
:
yield
PlaceholderInfo
(
repl_tokens_feat
=
replacement
.
features
.
token_ids
modality
=
modality
,
item_idx
=
item_idx
,
try
:
start_idx
=
start_idx
,
match
=
next
(
replacement
=
repl_tokens
,
iter_token_matches
(
repl_tokens_full
,
)
repl_tokens_feat
))
yield
PlaceholderFeaturesInfo
(
modality
=
modality
,
item_idx
=
item_idx
,
start_idx
=
start_idx
+
match
.
start_idx
,
tokens
=
repl_tokens_feat
,
)
except
StopIteration
:
raise
AssertionError
(
f
"
{
repl_tokens_feat
=
}
should be a "
f
"subsequence of
{
repl_tokens_full
=
}
"
)
from
None
# Exclude overlapping matches
# Exclude overlapping matches
start_idx
=
end_idx
start_idx
=
end_idx
_full
item_idx_by_modality
[
modality
]
+=
1
item_idx_by_modality
[
modality
]
+=
1
found
=
True
found
=
True
break
break
...
@@ -464,7 +507,7 @@ def find_mm_placeholders(
...
@@ -464,7 +507,7 @@ def find_mm_placeholders(
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
prompt
:
list
[
int
],
prompt
:
list
[
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
list
[
PlaceholderInfo
]]:
)
->
Mapping
[
str
,
list
[
Placeholder
Features
Info
]]:
it
=
_iter_placeholders
(
mm_prompt_repls
,
prompt
,
mm_item_counts
)
it
=
_iter_placeholders
(
mm_prompt_repls
,
prompt
,
mm_item_counts
)
return
dict
(
full_groupby_modality
(
it
))
return
dict
(
full_groupby_modality
(
it
))
...
@@ -679,7 +722,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
...
@@ -679,7 +722,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
new_token_ids
:
list
[
int
],
new_token_ids
:
list
[
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
list
[
PlaceholderInfo
]]:
)
->
Mapping
[
str
,
list
[
Placeholder
Features
Info
]]:
return
find_mm_placeholders
(
mm_prompt_repls
,
new_token_ids
,
return
find_mm_placeholders
(
mm_prompt_repls
,
new_token_ids
,
mm_item_counts
)
mm_item_counts
)
...
@@ -948,7 +991,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
...
@@ -948,7 +991,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
token_ids
:
list
[
int
],
token_ids
:
list
[
int
],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_prompt_repls
:
Mapping
[
str
,
Sequence
[
BoundPromptReplacement
]],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
PlaceholderInfo
]]]:
)
->
tuple
[
list
[
int
],
str
,
Mapping
[
str
,
list
[
Placeholder
Features
Info
]]]:
tokenizer
=
self
.
info
.
get_tokenizer
()
tokenizer
=
self
.
info
.
get_tokenizer
()
mm_token_matches
=
{
mm_token_matches
=
{
...
@@ -1037,7 +1080,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
...
@@ -1037,7 +1080,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
def
_validate_mm_placeholders
(
def
_validate_mm_placeholders
(
self
,
self
,
mm_placeholders
:
Mapping
[
str
,
list
[
PlaceholderInfo
]],
mm_placeholders
:
Mapping
[
str
,
list
[
Placeholder
Features
Info
]],
mm_item_counts
:
Mapping
[
str
,
int
],
mm_item_counts
:
Mapping
[
str
,
int
],
*
,
*
,
allow_missing
:
bool
=
False
,
allow_missing
:
bool
=
False
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment