Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
377d10bd
Unverified
Commit
377d10bd
authored
Feb 19, 2025
by
Cyrus Leung
Committed by
GitHub
Feb 19, 2025
Browse files
[VLM][Bugfix] Pass processor kwargs properly on init (#13516)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
52ce14d3
Changes
44
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
182 additions
and
148 deletions
+182
-148
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+35
-10
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+14
-13
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+2
-2
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+2
-2
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+2
-2
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+2
-5
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+2
-2
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+2
-2
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+14
-5
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+2
-2
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+3
-2
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+6
-14
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+12
-35
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+2
-1
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+64
-30
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+7
-2
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+2
-1
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper.py
+4
-4
vllm/multimodal/image.py
vllm/multimodal/image.py
+1
-4
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+4
-10
No files found.
vllm/model_executor/models/internvl.py
View file @
377d10bd
...
...
@@ -120,6 +120,7 @@ def resolve_internvl_min_max_num(
dynamic_image_size
:
bool
,
use_thumbnail
:
bool
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_dynamic_patch
=
max_dynamic_patch
if
dynamic_image_size
else
1
if
use_thumbnail
and
max_dynamic_patch
!=
1
:
...
...
@@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC):
config
:
PretrainedConfig
,
tokenizer
:
AnyTokenizer
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
)
->
None
:
...
...
@@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC):
image_size
:
int
=
config
.
vision_config
.
image_size
patch_size
:
int
=
config
.
vision_config
.
patch_size
if
dynamic_
image_size
is
None
:
dynamic_
image_size
=
config
.
dynamic_
image_size
assert
isinstance
(
dynamic_
image_size
,
bool
)
if
min_
dynamic_
patch
is
None
:
min_
dynamic_
patch
=
config
.
min_
dynamic_
patch
assert
isinstance
(
min_
dynamic_
patch
,
int
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
config
.
max_dynamic_patch
assert
isinstance
(
max_dynamic_patch
,
int
)
if
dynamic_image_size
is
None
:
dynamic_image_size
=
config
.
dynamic_image_size
assert
isinstance
(
dynamic_image_size
,
bool
)
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
config
.
downsample_ratio
**
2
))
self
.
image_size
=
image_size
self
.
min_dynamic_patch
:
int
=
config
.
min_dynamic_patch
self
.
min_dynamic_patch
=
min_dynamic_patch
self
.
max_dynamic_patch
=
max_dynamic_patch
self
.
dynamic_image_size
=
dynamic_image_size
self
.
use_thumbnail
:
bool
=
config
.
use_thumbnail
...
...
@@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC):
def
resolve_min_max_num
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
use_thumbnail
:
Optional
[
bool
]
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
self
.
min_dynamic_patch
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
...
...
@@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC):
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
use_thumbnail
:
Optional
[
bool
]
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
...
...
@@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC):
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
...
...
@@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC):
self
,
text
:
Optional
[
Union
[
str
,
list
[
str
]]]
=
None
,
images
:
Optional
[
Union
[
Image
.
Image
,
list
[
Image
.
Image
]]]
=
None
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
return_tensors
:
Optional
[
Union
[
str
,
TensorType
]]
=
None
,
...
...
@@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC):
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
...
...
@@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def
get_hf_processor
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
**
kwargs
:
object
,
)
->
BaseInternVLProcessor
:
raise
NotImplementedError
...
...
@@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
def
get_hf_processor
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
**
kwargs
:
object
,
)
->
InternVLProcessor
:
return
InternVLProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenizer
(),
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
if
min_dynamic_patch
is
not
None
:
kwargs
[
"min_dynamic_patch"
]
=
min_dynamic_patch
if
max_dynamic_patch
is
not
None
:
kwargs
[
"max_dynamic_patch"
]
=
max_dynamic_patch
if
dynamic_image_size
is
not
None
:
kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
return
self
.
ctx
.
init_processor
(
InternVLProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
...
...
vllm/model_executor/models/llava.py
View file @
377d10bd
...
...
@@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return
get_vision_encoder_info
(
self
.
get_hf_config
())
@
abstractmethod
def
get_hf_processor
(
self
)
->
LlavaLikeProcessor
:
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
LlavaLikeProcessor
:
raise
NotImplementedError
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
...
...
@@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class
LlavaProcessingInfo
(
BaseLlavaProcessingInfo
):
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
,
**
kwargs
)
class
BaseLlavaMultiModalProcessor
(
BaseMultiModalProcessor
[
_I
]):
...
...
@@ -272,8 +272,8 @@ class LlavaMultiModalProcessor(
class
PixtralHFProcessingInfo
(
BaseLlavaProcessingInfo
):
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
PixtralProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
PixtralProcessor
,
**
kwargs
)
class
PixtralHFMultiModalProcessor
(
...
...
@@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
class
MantisProcessingInfo
(
LlavaProcessingInfo
):
def
get_hf_processor
(
self
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
hf_config
=
self
.
get_hf_config
()
vision_info
=
self
.
get_vision_encoder_info
()
kwargs
.
setdefault
(
"patch_size"
,
vision_info
.
get_patch_size
())
if
Version
(
TRANSFORMERS_VERSION
)
<
Version
(
"4.48"
):
# BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this
vision_feature_select_strategy
=
None
kwargs
.
setdefault
(
"
vision_feature_select_strategy
"
,
None
)
else
:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
vision_feature_select_strategy
=
hf_config
.
vision_feature_select_strategy
# noqa: E501
kwargs
.
setdefault
(
"vision_feature_select_strategy"
,
hf_config
.
vision_feature_select_strategy
,
)
return
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
,
patch_size
=
vision_info
.
get_patch_size
(),
vision_feature_select_strategy
=
vision_feature_select_strategy
,
)
return
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
,
**
kwargs
)
class
MantisMultiModalProcessor
(
LlavaMultiModalProcessor
):
...
...
vllm/model_executor/models/llava_next.py
View file @
377d10bd
...
...
@@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
def
get_hf_config
(
self
)
->
LlavaNextLikeConfig
:
return
self
.
ctx
.
get_hf_config
(
LlavaNextConfig
)
def
get_hf_processor
(
self
):
hf_processor
=
self
.
ctx
.
get_hf_processor
(
LlavaNextProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
hf_processor
=
self
.
ctx
.
get_hf_processor
(
LlavaNextProcessor
,
**
kwargs
)
# In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
...
...
vllm/model_executor/models/llava_next_video.py
View file @
377d10bd
...
...
@@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
def
get_vision_encoder_info
(
self
):
return
get_vision_encoder_info
(
self
.
get_hf_config
())
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
LlavaNextVideoProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
LlavaNextVideoProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"video"
:
1
}
...
...
vllm/model_executor/models/llava_onevision.py
View file @
377d10bd
...
...
@@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
def
get_hf_config
(
self
)
->
LlavaOnevisionLikeConfig
:
return
self
.
ctx
.
get_hf_config
(
LlavaOnevisionConfig
)
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
LlavaOnevisionProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
LlavaOnevisionProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
,
"video"
:
None
}
...
...
vllm/model_executor/models/minicpmv.py
View file @
377d10bd
...
...
@@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
()
def
get_hf_processor
(
self
,
**
kwargs
:
object
,
):
hf_processor
=
self
.
ctx
.
get_hf_processor
()
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
hf_processor
=
self
.
ctx
.
get_hf_processor
(
**
kwargs
)
# NumPy arrays are considered as Iterable but not Sequence in
# https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428
...
...
vllm/model_executor/models/mllama.py
View file @
377d10bd
...
...
@@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo):
def
get_hf_config
(
self
)
->
MllamaConfig
:
return
self
.
ctx
.
get_hf_config
(
MllamaConfig
)
def
get_hf_processor
(
self
)
->
MllamaProcessor
:
return
self
.
ctx
.
get_hf_processor
(
MllamaProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
MllamaProcessor
:
return
self
.
ctx
.
get_hf_processor
(
MllamaProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
vllm/model_executor/models/molmo.py
View file @
377d10bd
...
...
@@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper:
class
MolmoProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
)
->
MolmoProcessorWrapper
:
processor
=
self
.
ctx
.
get_hf_processor
()
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
MolmoProcessorWrapper
:
processor
=
self
.
ctx
.
get_hf_processor
(
**
kwargs
)
return
MolmoProcessorWrapper
(
processor
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
...
...
vllm/model_executor/models/nvlm_d.py
View file @
377d10bd
...
...
@@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def
get_hf_processor
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
**
kwargs
:
object
,
)
->
NVLMProcessor
:
return
NVLMProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenizer
(),
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
if
min_dynamic_patch
is
not
None
:
kwargs
[
"min_dynamic_patch"
]
=
min_dynamic_patch
if
max_dynamic_patch
is
not
None
:
kwargs
[
"max_dynamic_patch"
]
=
max_dynamic_patch
if
dynamic_image_size
is
not
None
:
kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
return
self
.
ctx
.
init_processor
(
NVLMProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
def
get_max_image_tokens
(
self
)
->
int
:
...
...
vllm/model_executor/models/paligemma.py
View file @
377d10bd
...
...
@@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.siglip
import
(
SiglipVisionModel
,
dummy_image_for_siglip
,
...
...
@@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext,
model_config
=
ctx
.
model_config
hf_config
=
ctx
.
get_hf_config
(
PaliGemmaConfig
)
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
)
tokenizer
=
cached_tokenizer
_from_config
(
model_config
)
image_feature_size
=
hf_config
.
text_config
.
num_image_tokens
image_token_str
=
tokenizer
.
decode
(
hf_config
.
image_token_index
)
bos_token
=
tokenizer
.
decode
(
hf_config
.
bos_token_id
)
...
...
vllm/model_executor/models/phi3v.py
View file @
377d10bd
...
...
@@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
self
,
*
,
num_crops
:
Optional
[
int
]
=
None
,
**
kwargs
:
object
,
)
->
ProcessorMixin
:
if
num_crops
is
not
None
:
return
self
.
ctx
.
get_hf_processor
(
num_crops
=
num_crops
)
kwargs
[
"
num_crops
"
]
=
num_crops
return
self
.
ctx
.
get_hf_processor
()
return
self
.
ctx
.
get_hf_processor
(
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
vllm/model_executor/models/pixtral.py
View file @
377d10bd
...
...
@@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
NestedTensors
,
PlaceholderRange
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
)
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
init_vllm_registered_model
,
maybe_prefix
,
...
...
@@ -49,9 +49,7 @@ except ImportError:
def
get_max_pixtral_image_tokens
(
ctx
:
InputContext
):
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
tokenizer_mode
=
ctx
.
model_config
.
tokenizer_mode
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
mm_encoder
=
tokenizer
.
instruct
.
mm_encoder
image_config
=
mm_encoder
.
mm_config
if
hasattr
(
...
...
@@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext):
def
dummy_data_for_pixtral
(
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
]):
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
tokenizer_mode
=
ctx
.
model_config
.
tokenizer_mode
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
mm_encoder
=
tokenizer
.
mistral
.
instruct_tokenizer
.
mm_encoder
image_token_id
=
mm_encoder
.
special_ids
.
img
...
...
@@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
model_config
=
ctx
.
model_config
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
tokenizer_mode
=
model_config
.
tokenizer_mode
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
data_list
=
data
if
isinstance
(
data
,
list
)
else
[
data
]
...
...
@@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
prompt_token_ids
=
inputs
.
get
(
"prompt_token_ids"
)
prompt
=
inputs
.
get
(
"prompt"
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
tokenizer_mode
=
ctx
.
model_config
.
tokenizer_mode
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
mm_encoder
=
tokenizer
.
mistral
.
instruct_tokenizer
.
mm_encoder
image_token_id
=
mm_encoder
.
special_ids
.
img
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
377d10bd
...
...
@@ -36,8 +36,6 @@ from transformers import BatchFeature
from
transformers.models.qwen2_5_vl
import
Qwen2_5_VLProcessor
from
transformers.models.qwen2_5_vl.configuration_qwen2_5_vl
import
(
Qwen2_5_VLConfig
,
Qwen2_5_VLVisionConfig
)
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
Qwen2VLImageProcessorFast
)
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
...
...
@@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
fps
:
Optional
[
float
]
=
2.0
,
size
:
Optional
[
dict
[
str
,
int
]]
=
None
,
fps
:
Optional
[
float
]
=
None
,
**
kwargs
:
object
,
)
->
Qwen2_5_VLProcessor
:
hf_processor
=
self
.
ctx
.
get_hf_processor
(
Qwen2_5_VLProcessor
)
image_processor
=
hf_processor
.
image_processor
# type: ignore
assert
isinstance
(
image_processor
,
(
Qwen2VLImageProcessor
,
Qwen2VLImageProcessorFast
))
if
min_pixels
:
image_processor
.
min_pixels
=
min_pixels
if
max_pixels
:
image_processor
.
max_pixels
=
max_pixels
if
max_pixels
or
min_pixels
:
image_processor
.
size
=
{
"min_pixels"
:
image_processor
.
min_pixels
,
"max_pixels"
:
image_processor
.
max_pixels
,
}
return
hf_processor
def
get_image_processor
(
self
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
fps
:
Optional
[
float
]
=
2.0
,
)
->
Union
[
Qwen2VLImageProcessor
,
Qwen2VLImageProcessorFast
]:
hf_processor
=
self
.
get_hf_processor
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
fps
=
fps
,
if
fps
is
not
None
:
kwargs
[
"fps"
]
=
fps
return
self
.
ctx
.
get_hf_processor
(
Qwen2_5_VLProcessor
,
image_processor
=
self
.
get_image_processor
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
size
=
size
),
**
kwargs
,
)
image_processor
=
hf_processor
.
image_processor
# type: ignore
assert
isinstance
(
image_processor
,
(
Qwen2VLImageProcessor
,
Qwen2VLImageProcessorFast
))
return
image_processor
class
Qwen2_5_VLMultiModalProcessor
(
Qwen2VLMultiModalProcessor
):
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
377d10bd
...
...
@@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
*
,
# Ignored in initialization
sampling_rate
:
Optional
[
int
]
=
None
,
**
kwargs
:
object
,
)
->
Qwen2AudioProcessor
:
return
self
.
ctx
.
get_hf_processor
(
Qwen2AudioProcessor
)
return
self
.
ctx
.
get_hf_processor
(
Qwen2AudioProcessor
,
**
kwargs
)
def
get_feature_extractor
(
self
,
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
377d10bd
...
...
@@ -31,9 +31,7 @@ import torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
packaging.version
import
Version
from
transformers
import
BatchFeature
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
transformers.models.qwen2_vl
import
(
Qwen2VLImageProcessor
,
Qwen2VLProcessor
)
from
transformers.models.qwen2_vl.configuration_qwen2_vl
import
(
...
...
@@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from
vllm.platforms
import
_Backend
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.processor
import
(
cached_image_processor_from_config
)
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
...
...
@@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
size
:
Optional
[
dict
[
str
,
int
]]
=
None
,
**
kwargs
:
object
,
)
->
Qwen2VLProcessor
:
hf_processor
=
self
.
ctx
.
get_hf_processor
(
Qwen2VLProcessor
)
image_processor
=
hf_processor
.
image_processor
# type: ignore
assert
isinstance
(
image_processor
,
Qwen2VLImageProcessor
)
if
min_pixels
:
image_processor
.
min_pixels
=
min_pixels
if
max_pixels
:
image_processor
.
max_pixels
=
max_pixels
if
max_pixels
or
min_pixels
:
image_processor
.
size
=
{
"min_pixels"
:
image_processor
.
min_pixels
,
"max_pixels"
:
image_processor
.
max_pixels
,
}
return
hf_processor
return
self
.
ctx
.
get_hf_processor
(
Qwen2VLProcessor
,
image_processor
=
self
.
get_image_processor
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
size
=
size
),
**
kwargs
,
)
def
_get_image_processor_kwargs
(
self
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
size
:
Optional
[
dict
[
str
,
int
]]
=
None
,
**
kwargs
:
object
,
):
if
self
.
ctx
.
model_config
.
mm_processor_kwargs
:
kwargs
.
update
(
self
.
ctx
.
model_config
.
mm_processor_kwargs
)
if
min_pixels
is
not
None
:
kwargs
[
"min_pixels"
]
=
min_pixels
if
size
is
None
:
size
=
{
"shortest_edge"
:
min_pixels
}
else
:
size
[
"shortest_edge"
]
=
min_pixels
if
max_pixels
is
not
None
:
kwargs
[
"max_pixels"
]
=
max_pixels
if
size
is
None
:
size
=
{
"longest_edge"
:
max_pixels
}
else
:
size
[
"longest_edge"
]
=
max_pixels
if
size
is
not
None
:
kwargs
[
"size"
]
=
size
return
kwargs
def
get_image_processor
(
self
,
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
size
:
Optional
[
dict
[
str
,
int
]]
=
None
,
**
kwargs
:
object
,
):
hf_processor
=
self
.
get_hf_processor
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
)
image_processor
=
hf_processor
.
image_processor
# type: ignore
if
Version
(
TRANSFORMERS_VERSION
)
>=
Version
(
"4.49"
):
from
transformers.models.qwen2_vl
import
Qwen2VLImageProcessorFast
assert
isinstance
(
image_processor
,
(
Qwen2VLImageProcessor
,
Qwen2VLImageProcessorFast
))
else
:
assert
isinstance
(
image_processor
,
Qwen2VLImageProcessor
)
return
image_processor
return
cached_image_processor_from_config
(
self
.
ctx
.
model_config
,
**
self
.
_get_image_processor_kwargs
(
min_pixels
=
min_pixels
,
max_pixels
=
max_pixels
,
size
=
size
,
**
kwargs
),
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
,
"video"
:
None
}
...
...
@@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
def
_get_data_parser
(
self
)
->
MultiModalDataParser
:
return
Qwen2VLMultiModalDataParser
()
def
_call_hf_processor
(
self
,
prompt
:
str
,
mm_data
:
Mapping
[
str
,
object
],
mm_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
return
self
.
info
.
ctx
.
call_hf_processor
(
self
.
info
.
get_hf_processor
(
**
mm_kwargs
),
dict
(
text
=
prompt
,
**
mm_data
),
self
.
info
.
_get_image_processor_kwargs
(
**
mm_kwargs
),
)
def
_get_prompt_replacements
(
self
,
mm_items
:
MultiModalDataItems
,
...
...
@@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
tokenizer
=
self
.
info
.
get_tokenizer
()
vocab
=
tokenizer
.
get_vocab
()
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
placeholder
=
{
"image"
:
vocab
[
hf_processor
.
image_token
],
"video"
:
vocab
[
hf_processor
.
video_token
],
...
...
vllm/model_executor/models/qwen_vl.py
View file @
377d10bd
...
...
@@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
return
_get_tokenizer_without_image_pad
(
tokenizer
)
def
get_hf_processor
(
self
)
->
QwenVLProcessor
:
return
QwenVLProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenizer
())
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
QwenVLProcessor
:
return
self
.
ctx
.
init_processor
(
QwenVLProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
vllm/model_executor/models/ultravox.py
View file @
377d10bd
...
...
@@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
*
,
# Ignored in initialization
sampling_rate
:
Optional
[
int
]
=
None
,
**
kwargs
:
object
,
)
->
ProcessorMixin
:
hf_processor
=
self
.
ctx
.
get_hf_processor
()
hf_processor
=
self
.
ctx
.
get_hf_processor
(
**
kwargs
)
# NOTE: Ultravox processing definition uses '<|eot_id|>' as the
# placeholder that will cause confusion with the actual end of turn
...
...
vllm/model_executor/models/whisper.py
View file @
377d10bd
...
...
@@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors
)
from
vllm.multimodal.audio
import
resample_audio
from
vllm.sequence
import
SequenceData
from
vllm.transformers_utils.processor
import
cached_
get_
processor
from
vllm.transformers_utils.processor
import
cached_processor
_from_config
from
.interfaces
import
SupportsMultiModal
,
SupportsTranscription
from
.utils
import
AutoWeightsLoader
,
WeightsMapper
,
make_layers
...
...
@@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts
:
Mapping
[
str
,
int
]):
assert
mm_counts
[
"audio"
]
==
1
num_tokens
=
get_max_whisper_audio_tokens
(
ctx
)
processor
=
cached_
get_
processor
(
ctx
.
model_config
.
model
)
processor
=
cached_processor
_from_config
(
ctx
.
model_config
)
chunk_length
=
processor
.
feature_extractor
.
chunk_length
sampling_rate
=
processor
.
feature_extractor
.
sampling_rate
num_samples
=
chunk_length
*
sampling_rate
...
...
@@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data
[
"audio"
]
=
multi_modal_data
[
"audio"
][
0
]
# Resample and process audio
audio
,
orig_sr
=
multi_modal_data
[
"audio"
]
processor
=
cached_
get_
processor
(
ctx
.
model_config
.
model
)
processor
=
cached_processor
_from_config
(
ctx
.
model_config
)
target_sr
=
processor
.
feature_extractor
.
sampling_rate
audio
=
resample_audio
(
audio
,
orig_sr
=
orig_sr
,
target_sr
=
target_sr
)
multi_modal_data
[
"audio"
]
=
(
audio
,
target_sr
)
...
...
@@ -618,7 +618,7 @@ def input_mapper_for_whisper(
if
len
(
multi_modal_data
)
==
0
:
return
MultiModalKwargs
()
processor
=
cached_
get_
processor
(
ctx
.
model_config
.
model
)
processor
=
cached_processor
_from_config
(
ctx
.
model_config
)
sampling_rate
=
processor
.
feature_extractor
.
sampling_rate
audios
=
[
audio
for
audio
,
_
in
multi_modal_data
]
...
...
vllm/multimodal/image.py
View file @
377d10bd
# SPDX-License-Identifier: Apache-2.0
import
base64
from
functools
import
lru_cache
from
io
import
BytesIO
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
Optional
...
...
@@ -11,7 +10,7 @@ from PIL import Image
from
vllm.inputs.registry
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
get_image_processor
from
vllm.transformers_utils.processor
import
cached_
get_image_processor
from
vllm.utils
import
is_list_of
from
.base
import
MediaIO
,
MultiModalPlugin
...
...
@@ -22,8 +21,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
cached_get_image_processor
=
lru_cache
(
get_image_processor
)
class
ImagePlugin
(
MultiModalPlugin
):
"""Plugin for image data."""
...
...
vllm/multimodal/registry.py
View file @
377d10bd
...
...
@@ -11,7 +11,8 @@ import torch.nn as nn
from
vllm.envs
import
VLLM_MM_INPUT_CACHE_SIZE
from
vllm.inputs
import
InputProcessingContext
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
cached_tokenizer_from_config
)
from
vllm.utils
import
ClassRegistry
from
.audio
import
AudioPlugin
...
...
@@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
from
.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
ProcessingCache
)
from
.profiling
import
BaseDummyInputsBuilder
,
MultiModalProfiler
from
.utils
import
cached_get_tokenizer
from
.video
import
VideoPlugin
if
TYPE_CHECKING
:
...
...
@@ -256,10 +256,7 @@ class MultiModalRegistry:
on underlying model configuration.
"""
if
self
.
has_processor
(
model_config
):
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
processor
=
self
.
create_processor
(
model_config
,
tokenizer
)
seq_len
=
model_config
.
max_model_len
mm_limits
=
self
.
get_mm_limits_per_prompt
(
model_config
)
...
...
@@ -374,10 +371,7 @@ class MultiModalRegistry:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
if
self
.
has_processor
(
model_config
):
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
processor
=
self
.
create_processor
(
model_config
,
tokenizer
)
profiler
=
MultiModalProfiler
(
processor
)
return
profiler
.
get_mm_limits
()
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment