Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99267c23
Unverified
Commit
99267c23
authored
Mar 18, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 18, 2026
Browse files
[2/3] Refactor InternVL-based processors (#37324)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
525f2eeb
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
762 additions
and
1146 deletions
+762
-1146
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+17
-14
tests/models/registry.py
tests/models/registry.py
+2
-1
vllm/model_executor/models/eagle2_5_vl.py
vllm/model_executor/models/eagle2_5_vl.py
+32
-6
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+2
-2
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+28
-6
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+85
-46
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nemotron_vl.py
+75
-29
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+31
-7
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+2
-2
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+35
-10
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/__init__.py
+0
-4
vllm/transformers_utils/processors/eagle2_5_vl.py
vllm/transformers_utils/processors/eagle2_5_vl.py
+0
-85
vllm/transformers_utils/processors/h2ovl.py
vllm/transformers_utils/processors/h2ovl.py
+79
-82
vllm/transformers_utils/processors/internvl.py
vllm/transformers_utils/processors/internvl.py
+233
-273
vllm/transformers_utils/processors/nano_nemotron_vl.py
vllm/transformers_utils/processors/nano_nemotron_vl.py
+5
-5
vllm/transformers_utils/processors/nemotron_vl.py
vllm/transformers_utils/processors/nemotron_vl.py
+106
-172
vllm/transformers_utils/processors/nvlm_d.py
vllm/transformers_utils/processors/nvlm_d.py
+30
-13
vllm/transformers_utils/processors/skyworkr1v.py
vllm/transformers_utils/processors/skyworkr1v.py
+0
-389
No files found.
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
99267c23
...
...
@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.h2ovl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
from
vllm.transformers_utils.processors.h2ovl
import
(
image_to_pixel_values_h2ovl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
image_to_pixel_values_h2ovl
(
...
...
@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.skyworkr1v
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_skyworkr1v
,
from
vllm.transformers_utils.processors.internvl
import
(
image_to_pixel_values_internvl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
image_to_pixel_values_
skyworkr1v
(
image_to_pixel_values_
internvl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
self
.
min_num
,
...
...
@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos
:
npt
.
NDArray
|
list
[
npt
.
NDArray
]
=
None
,
**
kwargs
,
):
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
from
vllm.transformers_utils.processors.internvl
import
(
image_to_pixel_values_internvl
,
video_to_pixel_values_internvl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
if
images
is
not
None
:
...
...
tests/models/registry.py
View file @
99267c23
...
...
@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr"
,
trust_remote_code
=
True
),
"Eagle2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/Eagle2.5-8B"
,
trust_remote_code
=
True
,
is_available_online
=
False
"nvidia/Eagle2.5-8B"
,
trust_remote_code
=
True
,
),
"Emu3ForConditionalGeneration"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
...
...
vllm/model_executor/models/eagle2_5_vl.py
View file @
99267c23
...
...
@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.processors.eagle2_5_vl
import
Eagle2_5_VLProcessor
from
vllm.transformers_utils.processors.internvl
import
(
InternVLImageProcessor
,
InternVLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
...
...
@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
class
Eagle2_5_VLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Eagle2.5-VL model."""
def
get_hf_processor
(
self
,
**
kwargs
)
->
Eagle2_5_VLProcessor
:
return
self
.
ctx
.
init_processor
(
Eagle2_5_VLProcessor
,
config
=
self
.
ctx
.
get_hf_config
(),
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
config
.
force_image_size
or
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
return
InternVLImageProcessor
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
)
->
InternVLProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
InternVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
...
...
vllm/model_executor/models/glm4v.py
View file @
99267c23
...
...
@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
vision_config
=
config
.
vision_config
image_size
=
vision_config
[
"image_size"
]
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"size"
,
{
"width"
:
image_size
,
"height"
:
image_size
})
return
GLM4VImageProcessorFast
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
GLM4VProcessor
:
return
self
.
ctx
.
init_processor
(
GLM4VProcessor
,
return
GLM4VProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
self
.
get_image_processor
(
**
kwargs
),
)
...
...
vllm/model_executor/models/h2ovl.py
View file @
99267c23
...
...
@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdate
,
TimingContext
,
)
from
vllm.transformers_utils.processors.h2ovl
import
H2OVLProcessor
from
vllm.transformers_utils.processors.h2ovl
import
H2OVLImageProcessor
,
H2OVLProcessor
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
...
...
@@ -40,12 +40,34 @@ from .internvl import (
class
H2OVLProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
kwargs
.
setdefault
(
"use_msac"
,
config
.
use_msac
)
return
H2OVLImageProcessor
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
H2OVLProcessor
:
return
self
.
ctx
.
init_processor
(
H2OVLProcessor
,
config
=
self
.
get_hf_config
(),
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
H2OVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
def
get_num_image_tokens
(
...
...
@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
hf_processor
.
get_image_repl
(
num_patches
,
num_features
=
feature_size
)
return
[
PromptReplacement
(
...
...
vllm/model_executor/models/internvl.py
View file @
99267c23
...
...
@@ -9,6 +9,7 @@
# --------------------------------------------------------
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
typing
import
Annotated
,
Literal
,
TypeAlias
,
TypeVar
import
torch
...
...
@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.processors.internvl
import
(
Base
InternVLProcessor
,
InternVL
Image
Processor
,
InternVLProcessor
,
InternVLVideoProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
...
...
@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@
abstractmethod
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
Base
InternVLProcessor
:
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
InternVLProcessor
:
raise
NotImplementedError
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
...
...
@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
*
,
image_width
:
int
,
image_height
:
int
,
processor
:
Base
InternVLProcessor
,
processor
:
InternVLProcessor
,
)
->
int
:
return
processor
.
get_num_image_tokens
(
image_width
=
image_width
,
...
...
@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
processor
=
self
.
get_hf_processor
()
image_processor
=
processor
.
image_processor
base_size
=
processor
.
image_size
base_size
=
image_
processor
.
image_size
target_ratios
=
processor
.
resolve_target_ratios
()
largest_feature_size
,
largest_feature_pinpoint
=
0
,
None
...
...
@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
image_token_id
=
hf_processor
.
image_token_id
image_token_id
=
hf_processor
.
ctx_
image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
...
...
@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
hf_processor
.
get_image_repl
(
num_patches
,
num_features
=
feature_size
)
return
[
PromptReplacement
(
...
...
@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
class
InternVLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""InternVL ProcessingInfo extended for video processing"""
@
property
def
supports_video
(
self
):
return
self
.
get_hf_processor
().
supports_video
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
def
get_supported_mm_limits
(
self
):
video_limit
=
{
"video"
:
None
}
if
self
.
supports_video
else
{}
return
{
**
super
().
get_supported_mm_limits
(),
**
video_limit
}
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
return
InternVLImageProcessor
(
**
kwargs
)
def
get_video_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
def
get_video_token
(
self
)
->
str
|
None
:
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
return
InternVLVideoProcessor
(
**
kwargs
)
@
cached_property
def
ctx_video_token
(
self
):
text_model_type
=
self
.
get_hf_config
().
get_text_config
().
model_type
video_token_map
=
{
ctx_
video_token_map
=
{
"qwen2"
:
"<|video_pad|>"
,
"qwen3"
:
"<|video_pad|>"
,
"qwen3_moe"
:
"<|video_pad|>"
,
"gpt_oss"
:
"<|reserved_200000|>"
,
}
return
video_token_map
.
get
(
text_model_type
)
if
text_model_type
not
in
ctx_video_token_map
:
return
None
ctx_video_token
=
ctx_video_token_map
[
text_model_type
]
if
ctx_video_token
not
in
self
.
get_tokenizer
().
get_vocab
():
return
None
return
ctx_video_token
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
InternVLProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
ctx_video_token
=
self
.
ctx_video_token
video_processor
=
(
self
.
get_video_processor
(
**
kwargs
)
if
ctx_video_token
else
None
)
return
InternVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
image_processor
,
video_processor
=
video_processor
,
image_seq_length
=
image_seq_length
,
ctx_video_token
=
ctx_video_token
,
)
def
get_supported_mm_limits
(
self
):
video_limit
=
{
"video"
:
None
}
if
self
.
ctx_video_token
else
{}
return
{
**
super
().
get_supported_mm_limits
(),
**
video_limit
}
def
get_num_frames_with_most_features
(
self
,
...
...
@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
max_videos
=
mm_counts
.
get
(
"video"
,
0
)
processor
=
self
.
get_hf_processor
()
num_image_token
=
processor
.
image_seq_length
max_image_tokens
=
self
.
get_max_image_tokens
()
*
max_images
max_total_frames
=
(
seq_len
-
max_image_tokens
)
//
processor
.
num_image_token
max_total_frames
=
(
seq_len
-
max_image_tokens
)
//
num_image_token
max_frames_per_video
=
max_total_frames
//
max
(
max_videos
,
1
)
return
max
(
max_frames_per_video
,
1
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
InternVLProcessor
:
return
self
.
ctx
.
init_processor
(
InternVLProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
video_token
=
self
.
get_video_token
(),
**
kwargs
,
)
class
InternVLDummyInputsBuilder
(
BaseInternVLDummyInputsBuilder
[
InternVLProcessingInfo
]
...
...
@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
mm_options
:
Mapping
[
str
,
BaseDummyOptions
],
)
->
MultiModalDataDict
:
dummy_image
=
super
().
get_dummy_mm_data
(
seq_len
,
mm_counts
,
mm_options
)
if
self
.
info
.
supports_video
:
if
self
.
info
.
ctx_video_token
:
config
=
self
.
info
.
get_hf_config
()
image_size
:
int
=
config
.
vision_config
.
image_size
target_num_frames
=
self
.
info
.
get_num_frames_with_most_features
(
...
...
@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
if
(
self
.
info
.
supports_video
and
(
video_token_id
:
=
hf_processor
.
video_token_id
)
is
not
None
):
if
(
video_token_id
:
=
hf_processor
.
ctx_video_token_id
)
is
not
None
:
processed_outputs
[
"video_token_id"
]
=
torch
.
tensor
(
video_token_id
)
return
processed_outputs
def
_get_mm_fields_config
(
...
...
@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
image_fields
=
super
().
_get_mm_fields_config
(
hf_inputs
,
hf_processor_mm_kwargs
)
if
self
.
info
.
supports_video
:
if
self
.
info
.
ctx_video_token
:
video_num_patches
=
hf_inputs
.
get
(
"video_num_patches"
,
torch
.
empty
(
0
))
num_videos
=
len
(
video_num_patches
)
video_fields
=
dict
(
...
...
@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs
=
hf_processor_mm_kwargs
,
out_mm_kwargs
=
out_mm_kwargs
,
)
if
self
.
info
.
ctx_video_token
is
None
:
return
prompt_repl
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
...
...
@@ -456,26 +501,20 @@ class InternVLMultiModalProcessor(
video_num_patches
=
[]
def
get_video_replacement_internvl
(
item_idx
:
int
):
feature_size
=
hf_processor
.
num_image_token
num_patches
=
video_num_patches
[
item_idx
]
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
hf_processor
.
get_video_repl
(
feature_size
,
num_patches
,
video_context_token
=
hf_processor
.
video_token
)
if
self
.
info
.
supports_video
:
prompt_repl
=
[
*
prompt_repl
,
PromptReplacement
(
modality
=
"video"
,
target
=
"<video>"
,
replacement
=
get_video_replacement_internvl
,
),
]
return
hf_processor
.
get_video_repl
(
num_patches
)
return
prompt_repl
return
[
*
prompt_repl
,
PromptReplacement
(
modality
=
"video"
,
target
=
"<video>"
,
replacement
=
get_video_replacement_internvl
,
),
]
@
MULTIMODAL_REGISTRY
.
register_processor
(
...
...
vllm/model_executor/models/nemotron_vl.py
View file @
99267c23
...
...
@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.processor
import
cached_image_processor_from_config
from
vllm.transformers_utils.processors.nemotron_vl
import
(
LlamaNemotronNanoVLImageProcessor
,
LlamaNemotronNanoVLProcessor
,
LlamaNemotronVLEmbedImageProcessor
,
LlamaNemotronVLEmbedProcessor
,
NemotronVLProcessor
,
)
from
vllm.transformers_utils.repo_utils
import
get_hf_file_to_dict
...
...
@@ -50,19 +52,34 @@ from .utils import (
class
NemotronVLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Nemotron VL models."""
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
NemotronVLProcessor
:
return
self
.
ctx
.
init_processor
(
NemotronVLProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
self
.
get_image_processor
(),
**
kwargs
,
def
get_image_processor
(
self
,
**
kwargs
:
object
):
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
orig_processor
=
cached_image_processor_from_config
(
self
.
ctx
.
model_config
,
**
kwargs
)
def
get_image_processor
(
self
,
**
kwargs
:
object
):
return
cached_image_processor_from_config
(
self
.
ctx
.
model_config
,
**
kwargs
,
return
LlamaNemotronNanoVLImageProcessor
(
image_size
=
orig_processor
.
image_size
,
min_dynamic_patch
=
1
,
max_dynamic_patch
=
orig_processor
.
max_num_tiles
,
dynamic_image_size
=
True
,
use_thumbnail
=
orig_processor
.
use_thumbnail
,
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
LlamaNemotronNanoVLProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
LlamaNemotronNanoVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
...
...
@@ -386,29 +403,58 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# --------------------------------------------------------
class
LlamaNemotronVLEmbedProcessingInfo
(
Nemotro
nVLProcessingInfo
):
class
LlamaNemotronVLEmbedProcessingInfo
(
BaseInter
nVLProcessingInfo
):
"""Processing info for LlamaNemotronVL embedding model."""
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
LlamaNemotronVLEmbedProcessor
:
"""Override to create embedding-specific processor without image_processor."""
def
get_image_processor
(
self
,
**
kwargs
):
model_config
=
self
.
ctx
.
model_config
processor_config
=
{}
if
model_config
.
model
is
not
None
:
processor_config
=
(
get_hf_file_to_dict
(
"processor_config.json"
,
model_config
.
model
,
model_config
.
revision
,
)
or
{}
config
=
self
.
get_hf_config
()
processor_config
=
(
get_hf_file_to_dict
(
"processor_config.json"
,
model_config
.
model
,
model_config
.
revision
,
)
or
{}
)
min_dynamic_patch
=
processor_config
.
get
(
"min_input_tiles"
,
getattr
(
config
,
"min_dynamic_patch"
,
1
),
)
max_dynamic_patch
=
processor_config
.
get
(
"max_input_tiles"
,
getattr
(
config
,
"max_dynamic_patch"
,
1
),
)
dynamic_image_size
=
processor_config
.
get
(
"dynamic_image_size"
,
getattr
(
config
,
"dynamic_image_size"
,
True
),
)
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
config
.
force_image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
True
)
return
LlamaNemotronVLEmbedImageProcessor
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
LlamaNemotronVLEmbedProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
self
.
ctx
.
init_processor
(
LlamaNemotronVLEmbedProcessor
,
config
=
self
.
get_hf_config
(),
return
LlamaNemotronVLEmbedProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
processor
_config
=
processor
_config
,
**
kwargs
,
image_
processor
=
image_
processor
,
image_seq_length
=
image_seq_length
,
)
...
...
vllm/model_executor/models/nvlm_d.py
View file @
99267c23
...
...
@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
PromptUpdate
,
PromptUpdateDetails
,
)
from
vllm.transformers_utils.processors.nvlm_d
import
IMG_PAD
,
NVLMProcessor
from
vllm.transformers_utils.processors.internvl
import
InternVLImageProcessor
from
vllm.transformers_utils.processors.nvlm_d
import
NVLMProcessor
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
...
...
@@ -39,12 +40,33 @@ from .internvl import (
class
NVLMProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
return
InternVLImageProcessor
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
NVLMProcessor
:
return
self
.
ctx
.
init_processor
(
NVLMProcessor
,
config
=
self
.
get_hf_config
(),
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
NVLMProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
...
...
@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
repl
=
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
repl
=
hf_processor
.
get_image_repl
(
num_patches
,
num_features
=
feature_size
)
return
PromptUpdateDetails
.
select_text
(
repl
.
full
+
"
\n
"
,
IMG_PAD
)
return
PromptUpdateDetails
.
select_text
(
repl
.
full
+
"
\n
"
,
hf_processor
.
ctx_image_token
)
# See note in dummy data regarding why we have the extra newline
return
[
...
...
vllm/model_executor/models/qwen_vl.py
View file @
99267c23
...
...
@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
vision_config
=
config
.
visual
image_size
=
vision_config
[
"image_size"
]
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"size"
,
{
"width"
:
image_size
,
"height"
:
image_size
})
return
QwenVLImageProcessorFast
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
QwenVLProcessor
:
return
self
.
ctx
.
init_processor
(
QwenVLProcessor
,
return
QwenVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
self
.
get_image_processor
(
**
kwargs
),
)
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
99267c23
...
...
@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
PromptUpdate
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.processors.skyworkr1v
import
SkyworkR1VProcessor
from
vllm.transformers_utils.processors.internvl
import
(
InternVLImageProcessor
,
InternVLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
...
...
@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (
class
SkyworkR1VProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
SkyworkR1VProcessor
:
return
self
.
ctx
.
init_processor
(
SkyworkR1VProcessor
,
config
=
self
.
get_hf_config
(),
def
get_image_processor
(
self
,
**
kwargs
):
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
kwargs
=
self
.
ctx
.
get_merged_mm_kwargs
(
kwargs
)
kwargs
.
setdefault
(
"image_size"
,
vision_config
.
image_size
)
kwargs
.
setdefault
(
"min_dynamic_patch"
,
config
.
min_dynamic_patch
)
kwargs
.
setdefault
(
"max_dynamic_patch"
,
config
.
max_dynamic_patch
)
kwargs
.
setdefault
(
"dynamic_image_size"
,
config
.
dynamic_image_size
)
kwargs
.
setdefault
(
"use_thumbnail"
,
config
.
use_thumbnail
)
return
InternVLImageProcessor
(
**
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
InternVLProcessor
:
config
=
self
.
get_hf_config
()
vision_config
=
config
.
vision_config
image_processor
=
self
.
get_image_processor
(
**
kwargs
)
image_size
=
image_processor
.
image_size
patch_size
=
vision_config
.
patch_size
downsample_ratio
=
config
.
downsample_ratio
image_seq_length
=
int
((
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
))
return
InternVLProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
image_processor
=
image_processor
,
image_seq_length
=
image_seq_length
,
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
...
...
@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
*
,
image_width
:
int
,
image_height
:
int
,
processor
:
SkyworkR1V
Processor
,
processor
:
InternVL
Processor
,
)
->
int
:
return
processor
.
get_num_image_tokens
(
image_width
=
image_width
,
...
...
@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
processor
=
self
.
get_hf_processor
()
image_processor
=
processor
.
image_processor
base_size
=
processor
.
image_size
base_size
=
image_
processor
.
image_size
target_ratios
=
processor
.
resolve_target_ratios
()
largest_feature_size
,
largest_feature_pinpoint
=
0
,
None
...
...
@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
image_token_id
=
hf_processor
.
image_token_id
image_token_id
=
hf_processor
.
ctx_
image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
...
...
@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
hf_processor
.
get_image_repl
(
num_patches
,
num_features
=
feature_size
)
return
[
PromptReplacement
(
...
...
vllm/transformers_utils/processors/__init__.py
View file @
99267c23
...
...
@@ -14,7 +14,6 @@ __all__ = [
"BagelProcessor"
,
"CohereASRProcessor"
,
"DeepseekVLV2Processor"
,
"Eagle2_5_VLProcessor"
,
"FireRedASR2Processor"
,
"FunASRProcessor"
,
"GLM4VProcessor"
,
...
...
@@ -34,14 +33,12 @@ __all__ = [
"Ovis2_5Processor"
,
"QwenVLProcessor"
,
"Qwen3ASRProcessor"
,
"SkyworkR1VProcessor"
,
]
_CLASS_TO_MODULE
:
dict
[
str
,
str
]
=
{
"BagelProcessor"
:
"vllm.transformers_utils.processors.bagel"
,
"CohereASRProcessor"
:
"vllm.transformers_utils.processors.cohere_asr"
,
"DeepseekVLV2Processor"
:
"vllm.transformers_utils.processors.deepseek_vl2"
,
"Eagle2_5_VLProcessor"
:
"vllm.transformers_utils.processors.eagle2_5_vl"
,
"FireRedASR2Processor"
:
"vllm.transformers_utils.processors.fireredasr2"
,
"FunASRProcessor"
:
"vllm.transformers_utils.processors.funasr"
,
"GLM4VProcessor"
:
"vllm.transformers_utils.processors.glm4v"
,
...
...
@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor"
:
"vllm.transformers_utils.processors.ovis2_5"
,
"QwenVLProcessor"
:
"vllm.transformers_utils.processors.qwen_vl"
,
"Qwen3ASRProcessor"
:
"vllm.transformers_utils.processors.qwen3_asr"
,
"SkyworkR1VProcessor"
:
"vllm.transformers_utils.processors.skyworkr1v"
,
}
...
...
vllm/transformers_utils/processors/eagle2_5_vl.py
deleted
100644 → 0
View file @
525f2eeb
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model, built on BaseInternVLProcessor.

    The constructor does not delegate to the base-class initializer; it
    populates every attribute itself and falls back to defaults for config
    fields that are not present.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """
        Set up the processor from a model config and tokenizer.

        Args:
            config: Model config; `vision_config.image_size` and
                `vision_config.patch_size` are required, every other field
                read here is optional.
            tokenizer: Tokenizer used to look up the image token.
            min_dynamic_patch: Overrides `config.min_dynamic_patch`
                (default 1) when given.
            max_dynamic_patch: Overrides `config.max_dynamic_patch`
                (default 12) when given.
            dynamic_image_size: Overrides `config.dynamic_image_size`
                (default True) when given.
        """
        # super().__init__() is intentionally not called, to avoid the
        # config handling done there; all attributes are assigned below.
        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config

        # A truthy `force_image_size` takes precedence over the vision
        # config's image size.
        image_size: int = vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Tokens per image tile: patches per side squared, scaled by the
        # squared downsample ratio and truncated to an int.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Explicit arguments win; otherwise use the config value or default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """
        Return the image token ID.

        `config.image_token_index` is used when the config defines it;
        otherwise the ID of IMG_CONTEXT is looked up in the tokenizer vocab.

        Raises:
            ValueError: If IMG_CONTEXT is not in the tokenizer vocabulary.
        """
        config = self.config
        if hasattr(config, "image_token_index"):
            return config.image_token_index

        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """
        Build the prompt replacement text for one image.

        The result is IMG_CONTEXT repeated `feature_size` times, wrapped in
        IMG_START / IMG_END, with IMG_CONTEXT passed as the text to select.
        `num_patches` is accepted but not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            IMG_START + context_run + IMG_END,
            IMG_CONTEXT,
        )
vllm/transformers_utils/processors/h2ovl.py
View file @
99267c23
...
...
@@ -10,16 +10,12 @@
# --------------------------------------------------------
import
torch
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers.hf
import
HfTokenizer
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
,
InternVLImageProcessor
,
InternVLProcessor
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
...
...
@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
return
pixel_values
class
H2OVLProcessor
(
Base
InternVLProcessor
):
class
H2OVL
Image
Processor
(
InternVL
Image
Processor
):
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_msac
:
bool
|
None
=
None
,
image_size
:
int
,
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
bool
,
use_thumbnail
:
bool
,
use_msac
:
bool
,
)
->
None
:
super
().
__init__
(
config
,
tokenizer
,
image_size
=
image_size
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
if
use_msac
is
None
:
use_msac
=
config
.
use_msac
assert
isinstance
(
use_msac
,
bool
)
self
.
use_msac
=
use_msac
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
IMG_CONTEXT
)
def
resolve_min_max_num
(
self
,
*
,
...
...
@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
is
None
else
dynamic_image_size
)
use_thumbnail
=
self
.
use_thumbnail
if
use_thumbnail
is
None
else
use_thumbnail
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
self
.
min_dynamic_patch
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
self
.
max_dynamic_patch
if
dynamic_image_size
is
None
:
dynamic_image_size
=
self
.
dynamic_image_size
if
use_thumbnail
is
None
:
use_thumbnail
=
self
.
use_thumbnail
return
resolve_h2ovl_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
...
...
@@ -284,6 +257,57 @@ class H2OVLProcessor(BaseInternVLProcessor):
use_thumbnail
=
use_thumbnail
,
)
def _images_to_pixel_values_lst(
    self,
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
    """
    Convert each image into a pixel-value tensor via the H2OVL pipeline.

    Args:
        images: Images to convert.
        min_dynamic_patch: Optional override forwarded to
            `resolve_min_max_num`.
        max_dynamic_patch: Optional override forwarded to
            `resolve_min_max_num`.
        dynamic_image_size: Optional override forwarded to
            `resolve_min_max_num`.

    Returns:
        One tensor per input image, in input order.
    """
    # MSAC is only enabled when exactly one image is given.
    msac_enabled = len(images) == 1 and self.use_msac

    min_num, max_num = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=False,  # Applied in image_to_pixel_values
    )

    pixel_values_lst = []
    for image in images:
        pixel_values_lst.append(
            image_to_pixel_values_h2ovl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                use_msac=msac_enabled,
            )
        )
    return pixel_values_lst
class
H2OVLProcessor
(
InternVLProcessor
):
def
__init__
(
self
,
image_processor
:
H2OVLImageProcessor
,
tokenizer
:
HfTokenizer
,
*
,
image_seq_length
:
int
,
start_image_token
:
str
=
"<img>"
,
end_image_token
:
str
=
"</img>"
,
ctx_image_token
:
str
=
"<IMG_CONTEXT>"
,
)
->
None
:
super
().
__init__
(
image_processor
=
image_processor
,
tokenizer
=
tokenizer
,
image_seq_length
=
image_seq_length
,
start_image_token
=
start_image_token
,
end_image_token
=
end_image_token
,
ctx_image_token
=
ctx_image_token
,
)
self
.
image_processor
:
H2OVLImageProcessor
def
resolve_target_ratios
(
self
,
*
,
...
...
@@ -294,7 +318,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
prior_aspect_ratio
:
tuple
[
int
,
int
]
|
None
=
None
,
override_min_num
:
int
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_num
,
max_num
=
self
.
image_processor
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
...
...
@@ -316,9 +340,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
image_height
:
int
,
use_msac
:
bool
|
None
=
None
,
)
->
int
:
use_msac
=
self
.
use_msac
if
use_msac
is
None
else
use_msac
image_processor
=
self
.
image_processor
use_msac
=
image_processor
.
use_msac
if
use_msac
is
None
else
use_msac
use_thumbnail
=
self
.
use_thumbnail
use_thumbnail
=
image_processor
.
use_thumbnail
if
use_msac
:
target_ratios_1
=
self
.
resolve_target_ratios
(
...
...
@@ -328,7 +353,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_1
,
_
,
_
,
aspect_ratio_1
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios_1
,
use_thumbnail
=
True
,
)
...
...
@@ -341,7 +366,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches_2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios_2
,
use_thumbnail
=
True
,
)
...
...
@@ -354,37 +379,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
num_patches
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
use_msac
=
self
.
use_msac
if
len
(
images
)
==
1
else
False
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
return
[
image_to_pixel_values_h2ovl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
use_msac
=
use_msac
,
)
for
image
in
images
]
return
num_patches
*
self
.
image_seq_length
vllm/transformers_utils/processors/internvl.py
View file @
99267c23
...
...
@@ -7,24 +7,17 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
TypeVar
import
numpy.typing
as
npt
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
,
TensorType
from
transformers.processing_utils
import
ProcessorMixin
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
_T
=
TypeVar
(
"_T"
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
from
vllm.tokenizers.hf
import
HfTokenizer
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
...
...
@@ -33,7 +26,7 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def
build_transform
(
input_size
:
int
):
MEAN
,
STD
=
IMAGENET_MEAN
,
IMAGENET_STD
transform
=
T
.
Compose
(
return
T
.
Compose
(
[
T
.
Lambda
(
lambda
img
:
convert_image_mode
(
img
,
"RGB"
)),
T
.
Resize
(
...
...
@@ -43,7 +36,6 @@ def build_transform(input_size: int):
T
.
Normalize
(
mean
=
MEAN
,
std
=
STD
),
]
)
return
transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
...
...
@@ -223,65 +215,20 @@ def video_to_pixel_values_internvl(
return
pixel_values
class
BaseInternVLProcessor
(
ABC
):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
class
InternVLImageProcessor
:
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
image_size
:
int
,
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
bool
,
use_thumbnail
:
bool
,
)
->
None
:
super
().
__init__
()
self
.
config
=
config
self
.
tokenizer
=
tokenizer
image_size
:
int
=
config
.
vision_config
.
image_size
patch_size
:
int
=
config
.
vision_config
.
patch_size
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
config
.
min_dynamic_patch
assert
isinstance
(
min_dynamic_patch
,
int
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
config
.
max_dynamic_patch
assert
isinstance
(
max_dynamic_patch
,
int
)
if
dynamic_image_size
is
None
:
dynamic_image_size
=
config
.
dynamic_image_size
assert
isinstance
(
dynamic_image_size
,
bool
)
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
config
.
downsample_ratio
**
2
)
)
self
.
image_size
=
image_size
self
.
min_dynamic_patch
=
min_dynamic_patch
self
.
max_dynamic_patch
=
max_dynamic_patch
self
.
dynamic_image_size
=
dynamic_image_size
self
.
use_thumbnail
:
bool
=
config
.
use_thumbnail
@
property
@
abstractmethod
def
image_token_id
(
self
)
->
int
:
raise
NotImplementedError
@
abstractmethod
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
raise
NotImplementedError
self
.
use_thumbnail
=
use_thumbnail
def
resolve_min_max_num
(
self
,
...
...
@@ -291,18 +238,14 @@ class BaseInternVLProcessor(ABC):
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
is
None
else
dynamic_image_size
)
use_thumbnail
=
self
.
use_thumbnail
if
use_thumbnail
is
None
else
use_thumbnail
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
self
.
min_dynamic_patch
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
self
.
max_dynamic_patch
if
dynamic_image_size
is
None
:
dynamic_image_size
=
self
.
dynamic_image_size
if
use_thumbnail
is
None
:
use_thumbnail
=
self
.
use_thumbnail
return
resolve_internvl_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
...
...
@@ -311,43 +254,6 @@ class BaseInternVLProcessor(ABC):
use_thumbnail
=
use_thumbnail
,
)
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
return
get_internvl_target_ratios
(
min_num
,
max_num
)
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
...
...
@@ -355,7 +261,14 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
self
.
min_dynamic_patch
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
self
.
max_dynamic_patch
if
dynamic_image_size
is
None
:
dynamic_image_size
=
self
.
dynamic_image_size
min_num
,
max_num
=
resolve_internvl_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
...
...
@@ -373,49 +286,9 @@ class BaseInternVLProcessor(ABC):
for
image
in
images
]
def
_preprocess_image
(
self
,
text
:
list
[
str
],
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
torch
.
Tensor
]]:
if
len
(
images
)
==
0
:
image_inputs
=
{}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst
]
),
}
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
text
=
[
t
.
replace
(
"<image>"
,
image_repl
.
full
,
1
)
for
t
in
text
]
return
text
,
image_inputs
def
_make_batch_input
(
self
,
input_item
:
_T
|
list
[
_T
]
|
None
=
None
)
->
list
[
_T
]:
if
input_item
is
None
:
input_item
=
[]
if
not
isinstance
(
input_item
,
list
):
input_item
=
[
input_item
]
return
input_item
def
__call__
(
self
,
text
:
str
|
list
[
str
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
],
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
...
...
@@ -423,120 +296,173 @@ class BaseInternVLProcessor(ABC):
return_tensors
:
str
|
TensorType
|
None
=
None
,
**
kwargs
,
)
->
BatchFeature
:
text
=
self
.
_make_batch_input
(
text
)
images
=
self
.
_make_batch_input
(
images
)
images_lst
=
[
images
]
if
not
isinstance
(
images
,
list
)
else
images
text
,
image_inputs
=
self
.
_preprocess_image
(
text
=
text
,
images
=
images
,
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images_lst
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
text_inputs
=
self
.
tokenizer
(
text
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
([
len
(
item
)
for
item
in
pixel_values_lst
]),
}
return
BatchFeature
(
image_inputs
,
tensor_type
=
return_tensors
)
combined_outputs
=
{
**
text_inputs
,
**
image_inputs
}
return
BatchFeature
(
combined_outputs
,
tensor_type
=
return_tensors
)
class InternVLVideoProcessor:
    """Turn videos into flattened pixel-value tensors plus per-video counts."""

    def __init__(
        self,
        image_size: int,
    ) -> None:
        """Store the input size passed to the video conversion helper."""
        self.image_size = image_size

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
    ) -> list[torch.Tensor]:
        """
        Convert each video with `video_to_pixel_values_internvl`.

        The helper is always called with `min_num=1`, `max_num=1` and no
        thumbnail. Returns one tensor per video, in input order.
        """
        converted = []
        for video in videos:
            converted.append(
                video_to_pixel_values_internvl(
                    video,
                    input_size=self.image_size,
                    min_num=1,
                    max_num=1,
                    use_thumbnail=False,
                )
            )
        return converted

    def __call__(
        self,
        videos: npt.NDArray | list[npt.NDArray],
        *,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Process one video or a list of videos.

        Args:
            videos: A single array or a list of arrays; a non-list value is
                wrapped into a one-element list.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
            **kwargs: Accepted but not used here.

        Returns:
            A `BatchFeature` with `pixel_values_flat_video` (all per-video
            tensors concatenated) and `video_num_patches` (the length of
            each per-video tensor).
        """
        if isinstance(videos, list):
            videos_lst = videos
        else:
            videos_lst = [videos]

        per_video = self._videos_to_pixel_values_lst(videos_lst)
        flat_pixel_values = torch.cat(per_video)
        patch_counts = torch.tensor(list(map(len, per_video)))

        video_inputs = {
            "pixel_values_flat_video": flat_pixel_values,
            "video_num_patches": patch_counts,
        }
        return BatchFeature(video_inputs, tensor_type=return_tensors)
class
InternVLProcessor
(
BaseInternVLProcessor
):
class
InternVLProcessor
(
ProcessorMixin
):
"""
HF Processor for InternVLChatModel with extended video processing logic.
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
attributes
=
[
"image_processor"
,
"tokenizer"
,
"video_processor"
]
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
image_processor
:
InternVLImageProcessor
,
tokenizer
:
HfTokenizer
,
video_processor
:
InternVLVideoProcessor
|
None
=
None
,
*
,
image_seq_length
:
int
,
start_image_token
:
str
=
"<img>"
,
end_image_token
:
str
=
"</img>"
,
ctx_image_token
:
str
=
"<IMG_CONTEXT>"
,
ctx_video_token
:
str
|
None
=
None
,
)
->
None
:
self
.
image_processor
=
image_processor
self
.
tokenizer
=
tokenizer
self
.
video_processor
=
video_processor
self
.
image_seq_length
=
image_seq_length
self
.
start_image_token
=
start_image_token
self
.
end_image_token
=
end_image_token
self
.
ctx_image_token
=
ctx_image_token
self
.
ctx_video_token
=
ctx_video_token
self
.
start_image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
start_image_token
)
self
.
end_image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
end_image_token
)
self
.
ctx_image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
ctx_image_token
)
self
.
ctx_video_token_id
=
(
None
if
ctx_video_token
is
None
else
tokenizer
.
convert_tokens_to_ids
(
ctx_video_token
)
)
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
video_token
:
str
|
None
=
None
,
)
->
None
:
super
().
__init__
(
config
=
config
,
tokenizer
=
tokenizer
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
image_processor
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
# add extra video token for video processing
self
.
video_token
=
video_token
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
@
property
def
video_token_id
(
self
)
->
int
|
None
:
if
self
.
video_token
is
None
:
return
None
return
self
.
tokenizer
.
get_vocab
().
get
(
self
.
video_token
,
None
)
@
property
def
supports_video
(
self
)
->
bool
:
return
self
.
video_token_id
is
not
None
return
get_internvl_target_ratios
(
min_num
,
max_num
)
def
_videos_to_pixel_values_lst
(
def
get_num_image_tokens
(
self
,
videos
:
list
[
npt
.
NDArray
],
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
1
,
max_dynamic_patch
=
1
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
image_processor
=
self
.
image_processor
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
return
[
video_to_pixel_values_internvl
(
video
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
False
,
)
for
video
in
videos
]
num_patches
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
image_processor
.
use_thumbnail
,
)
def
_preprocess_video
(
return
num_patches
*
self
.
image_seq_length
def
get_image_repl
(
self
,
text
:
list
[
str
],
videos
:
list
[
npt
.
NDArray
],
dynamic_image_size
:
bool
|
None
=
None
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
Any
]]:
if
len
(
videos
)
==
0
or
not
self
.
supports_video
:
return
text
,
{}
num_patches
:
int
|
None
,
num_features
:
int
|
None
=
None
,
)
->
PromptUpdateDetails
[
str
]:
if
num_patches
is
None
:
assert
num_features
is
not
None
else
:
num_features
=
num_patches
*
self
.
image_seq_length
video_token
=
self
.
video_token
assert
video_token
is
not
None
repl_features
=
self
.
ctx_image_token
*
num_features
repl_full
=
self
.
start_image_token
+
repl_features
+
self
.
end_image_token
pixel_values_lst_video
=
self
.
_videos_to_pixel_values_lst
(
videos
,
dynamic_image_size
=
dynamic_image_size
,
)
video_inputs
=
{
"pixel_values_flat_video"
:
torch
.
cat
(
pixel_values_lst_video
),
"video_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst_video
]
),
}
return
PromptUpdateDetails
.
select_text
(
repl_full
,
self
.
ctx_image_token
)
for
pixel_valu
es
in
pixel_values_lst_video
:
num_patches
=
pixel_values
.
shape
[
0
]
def
get_video_repl
(
self
,
num_patch
es
:
in
t
)
->
PromptUpdateDetails
[
str
]
:
assert
self
.
ctx_video_token
is
not
None
video_repl
=
self
.
get_video_repl
(
self
.
num_image_token
,
num_patches
,
video_token
)
text
=
[
t
.
replace
(
"<video>"
,
video_repl
.
full
,
1
)
for
t
in
text
]
return
text
,
video_inputs
repl_features
=
self
.
ctx_video_token
*
self
.
image_seq_length
repl_features_with_sep
=
(
self
.
start_image_token
+
repl_features
+
self
.
end_image_token
)
# num_patches is equal to num_frames
repl_full
=
""
.
join
(
[
f
"Frame
{
i
+
1
}
:
{
repl_features_with_sep
}
"
for
i
in
range
(
num_patches
)]
)
return
PromptUpdateDetails
.
select_text
(
repl_full
,
self
.
ctx_video_token
)
def
__call__
(
self
,
...
...
@@ -550,54 +476,88 @@ class InternVLProcessor(BaseInternVLProcessor):
return_tensors
:
str
|
TensorType
|
None
=
None
,
**
kwargs
,
)
->
BatchFeature
:
text
=
self
.
_make_batch_input
(
text
)
images
=
self
.
_make_batch_input
(
images
)
videos
=
self
.
_make_batch_input
(
videos
)
if
images
is
not
None
:
image_inputs
=
self
.
image_processor
(
images
=
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
return_tensors
=
return_tensors
,
)
image_num_patches
=
image_inputs
[
"image_num_patches"
]
else
:
image_inputs
=
{}
image_num_patches
=
[]
text
,
image_inputs
=
self
.
_preprocess_image
(
text
=
text
,
images
=
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
if
videos
is
not
None
:
if
self
.
video_processor
is
None
:
raise
ValueError
(
"This model does not support video inputs"
)
text
,
video_inputs
=
self
.
_preprocess_video
(
text
=
text
,
videos
=
videos
,
dynamic_image_size
=
dynamic_image_size
,
)
video_inputs
=
self
.
video_processor
(
videos
=
videos
,
return_tensors
=
return_tensors
,
)
video_num_patches
=
video_inputs
[
"video_num_patches"
]
else
:
video_inputs
=
{}
video_num_patches
=
[]
text_inputs
=
self
.
tokenizer
(
text
)
if
text
is
not
None
:
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
combined_outputs
=
{
**
text_inputs
,
**
image_inputs
,
**
video_inputs
}
if
image_inputs
:
image_token
=
"<image>"
image_index
=
0
processed_text
=
list
[
str
]()
replace_strings
=
list
[
str
]()
return
BatchFeature
(
combined_outputs
,
tensor_type
=
return_tensors
)
for
prompt
in
text
:
new_prompt
=
prompt
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
while
image_token
in
new_prompt
:
new_prompt
=
new_prompt
.
replace
(
image_token
,
"<placeholder>"
,
1
)
image_repl
=
self
.
get_image_repl
(
image_num_patches
[
image_index
])
replace_strings
.
append
(
image_repl
.
full
)
image_index
+=
1
return
PromptUpdateDetails
.
select_text
(
repl_full
,
IMG_CONTEXT
)
while
"<placeholder>"
in
new_prompt
:
replace_str
=
replace_strings
.
pop
(
0
)
new_prompt
=
new_prompt
.
replace
(
"<placeholder>"
,
replace_str
,
1
)
def
get_video_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
video_context_token
:
str
=
IMG_CONTEXT
,
)
->
PromptUpdateDetails
[
str
]:
if
num_patches
is
None
:
raise
NotImplementedError
(
"Embedding inputs are not supported"
)
processed_text
.
append
(
new_prompt
)
repl_features
=
video_context_token
*
self
.
num_image_token
repl_features_with_sep
=
IMG_START
+
repl_features
+
IMG_END
# num_patches is equal to num_frames
repl_full
=
""
.
join
(
[
f
"Frame
{
i
+
1
}
:
{
repl_features_with_sep
}
"
for
i
in
range
(
num_patches
)]
)
text
=
processed_text
if
video_inputs
:
video_token
=
"<video>"
video_index
=
0
processed_text
=
list
[
str
]()
replace_strings
=
list
[
str
]()
assert
video_token
is
not
None
for
prompt
in
text
:
new_prompt
=
prompt
while
video_token
in
new_prompt
:
new_prompt
=
new_prompt
.
replace
(
video_token
,
"<placeholder>"
,
1
)
video_repl
=
self
.
get_video_repl
(
video_num_patches
[
video_index
])
replace_strings
.
append
(
video_repl
.
full
)
video_index
+=
1
while
"<placeholder>"
in
new_prompt
:
replace_str
=
replace_strings
.
pop
(
0
)
new_prompt
=
new_prompt
.
replace
(
"<placeholder>"
,
replace_str
,
1
)
return
PromptUpdateDetails
.
select_text
(
repl_full
,
video_context_token
)
processed_text
.
append
(
new_prompt
)
text
=
processed_text
text_inputs
=
self
.
tokenizer
(
text
,
return_tensors
=
return_tensors
)
else
:
text_inputs
=
{}
combined_outputs
=
{
**
text_inputs
,
**
image_inputs
,
**
video_inputs
}
return
BatchFeature
(
combined_outputs
,
tensor_type
=
return_tensors
)
vllm/transformers_utils/processors/nano_nemotron_vl.py
View file @
99267c23
...
...
@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
from
vllm.multimodal.evs
import
compute_retained_tokens_count
from
vllm.multimodal.inputs
import
AudioItem
from
vllm.multimodal.processing.processor
import
PromptUpdateDetails
,
_seq2tokens
from
vllm.tokenizers
import
Tokenizer
Like
from
vllm.tokenizers
.hf
import
Hf
Tokenizer
from
.internvl
import
calculate_internvl_targets
,
get_internvl_target_ratios
...
...
@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
Tokenizer
Like
,
tokenizer
:
Hf
Tokenizer
,
*
args
,
max_model_len
:
int
,
max_num_tiles
:
int
|
None
=
None
,
...
...
@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
Tokenizer
Like
,
tokenizer
:
Hf
Tokenizer
,
*
,
max_model_len
:
int
,
max_num_tiles
:
int
|
None
=
None
,
...
...
@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame
:
list
[
int
],
frames_indices
:
list
[
int
],
frame_duration_ms
:
int
,
tokenizer
:
Tokenizer
Like
,
tokenizer
:
Hf
Tokenizer
,
img_start_token_ids
:
list
[
int
],
img_end_token_ids
:
list
[
int
],
img_context_token_ids
:
list
[
int
],
...
...
@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (Tokenizer
Like
): tokenizer to use for tokenizing frame separators
tokenizer (
Hf
Tokenizer): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
...
...
vllm/transformers_utils/processors/nemotron_vl.py
View file @
99267c23
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
transformers.image_processing_utils_fast
import
BaseImageProcessorFast
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers.hf
import
HfTokenizer
from
.internvl
import
InternVLProcessor
from
.internvl
import
InternVLImageProcessor
,
InternVLProcessor
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
...
...
@@ -172,59 +168,61 @@ def image_to_pixel_values_nemotron_vl(
return
pixel_values
class
NemotronVLProcessor
(
InternVLProcessor
):
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<image>"
def
__init__
(
class
LlamaNemotronNanoVLImageProcessor
(
InternVLImageProcessor
):
def
_images_to_pixel_values_lst
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
image_processor
:
BaseImageProcessorFast
,
*
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
None
:
ABC
.
__init__
(
self
)
self
.
config
=
config
self
.
tokenizer
=
tokenizer
self
.
image_processor
=
image_processor
image_size
:
int
=
config
.
force_image_size
patch_size
:
int
=
config
.
patch_size
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
1
assert
isinstance
(
min_dynamic_patch
,
int
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
self
.
image_processor
.
max_num_tiles
assert
isinstance
(
max_dynamic_patch
,
int
)
if
dynamic_image_size
is
None
:
dynamic_image_size
=
True
assert
isinstance
(
dynamic_image_size
,
bool
)
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
config
.
downsample_ratio
**
2
)
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
self
.
image_size
=
image_size
self
.
min_dynamic_patch
=
min_dynamic_patch
self
.
max_dynamic_patch
=
max_dynamic_patch
self
.
dynamic_image_size
=
dynamic_image_size
if
image_processor
is
not
None
:
self
.
use_thumbnail
=
image_processor
.
use_thumbnail
else
:
self
.
use_thumbnail
=
getattr
(
config
,
"use_thumbnail"
,
True
)
return
[
image_to_pixel_values_nemotron_vl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
transform
=
build_transform
(
self
.
image_size
),
)
for
image
in
images
]
class
LlamaNemotronNanoVLProcessor
(
InternVLProcessor
):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
self
.
IMG_CONTEXT
]
The image processor is given by:
https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
"""
def
_get_transform
(
self
)
->
T
.
Compose
:
return
build_transform
(
input_size
=
self
.
image_size
)
def
__init__
(
self
,
image_processor
:
LlamaNemotronNanoVLImageProcessor
,
tokenizer
:
HfTokenizer
,
*
,
image_seq_length
:
int
,
start_image_token
:
str
=
"<img>"
,
end_image_token
:
str
=
"</img>"
,
ctx_image_token
:
str
=
"<image>"
,
)
->
None
:
super
().
__init__
(
image_processor
=
image_processor
,
tokenizer
=
tokenizer
,
image_seq_length
=
image_seq_length
,
start_image_token
=
start_image_token
,
end_image_token
=
end_image_token
,
ctx_image_token
=
ctx_image_token
,
)
def
get_num_image_tokens
(
self
,
...
...
@@ -232,6 +230,7 @@ class NemotronVLProcessor(InternVLProcessor):
image_width
:
int
,
image_height
:
int
,
)
->
int
:
image_processor
=
self
.
image_processor
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
...
...
@@ -239,13 +238,33 @@ class NemotronVLProcessor(InternVLProcessor):
num_patches
,
_
,
_
=
calculate_nemotron_vl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
self
.
use_thumbnail
,
use_thumbnail
=
image_processor
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
return
num_patches
*
self
.
image_seq_length
# SigLIP normalization constants
SIGLIP_MEAN
=
(
0.5
,
0.5
,
0.5
)
SIGLIP_STD
=
(
0.5
,
0.5
,
0.5
)
def
build_siglip_transform
(
input_size
:
int
):
"""Build transform for SigLIP vision encoder with normalization.
Extends the base transform from nemotron_vl with SigLIP-specific normalization.
"""
return
T
.
Compose
(
[
build_transform
(
input_size
=
input_size
),
T
.
Normalize
(
mean
=
SIGLIP_MEAN
,
std
=
SIGLIP_STD
),
]
)
class
LlamaNemotronVLEmbedImageProcessor
(
InternVLImageProcessor
):
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
...
...
@@ -267,83 +286,13 @@ class NemotronVLProcessor(InternVLProcessor):
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
transform
=
self
.
_get_transform
(
),
transform
=
build_siglip_transform
(
self
.
image_size
),
)
for
image
in
images
]
def
_replace_image_tokens
(
self
,
text
:
list
[
str
],
pixel_values_lst
:
list
[
torch
.
Tensor
],
)
->
list
[
str
]:
"""Replace <image> placeholders with image tokens."""
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT
=
image_repl
.
full
.
replace
(
"<image>"
,
"<NVL_IMG_CONTEXT>"
)
text
=
[
t
.
replace
(
"<image>"
,
NVL_IMAGE_CONTEXT
,
1
)
for
t
in
text
]
return
[
t
.
replace
(
"<NVL_IMG_CONTEXT>"
,
self
.
IMG_CONTEXT
)
for
t
in
text
]
def
_preprocess_image
(
self
,
text
:
list
[
str
],
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
torch
.
Tensor
]]:
if
len
(
images
)
==
0
:
image_inputs
=
{}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst
]
),
}
text
=
self
.
_replace_image_tokens
(
text
,
pixel_values_lst
)
return
text
,
image_inputs
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
self
.
IMG_CONTEXT
*
feature_size
repl_full
=
self
.
IMG_START
+
repl_features
+
self
.
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
self
.
IMG_CONTEXT
)
# SigLIP normalization constants
SIGLIP_MEAN
=
(
0.5
,
0.5
,
0.5
)
SIGLIP_STD
=
(
0.5
,
0.5
,
0.5
)
def
build_siglip_transform
(
input_size
:
int
):
"""Build transform for SigLIP vision encoder with normalization.
Extends the base transform from nemotron_vl with SigLIP-specific normalization.
"""
return
T
.
Compose
(
[
build_transform
(
input_size
=
input_size
),
T
.
Normalize
(
mean
=
SIGLIP_MEAN
,
std
=
SIGLIP_STD
),
]
)
class
LlamaNemotronVLEmbedProcessor
(
NemotronVLProcessor
):
class
LlamaNemotronVLEmbedProcessor
(
InternVLProcessor
):
"""
Processor for LlamaNemotronVL embedding model.
...
...
@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
- Uses different image context token (<IMG_CONTEXT> vs <image>)
"""
IMG_CONTEXT
=
"<IMG_CONTEXT>"
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
processor_config
:
dict
,
image_processor
:
LlamaNemotronVLEmbedImageProcessor
,
tokenizer
:
HfTokenizer
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
image_seq_length
:
int
,
start_image_token
:
str
=
"<img>"
,
end_image_token
:
str
=
"</img>"
,
ctx_image_token
:
str
=
"<IMG_CONTEXT>"
,
)
->
None
:
if
min_dynamic_patch
is
None
:
min_dynamic_patch
=
processor_config
.
get
(
"min_input_tiles"
,
getattr
(
config
,
"min_dynamic_patch"
,
1
),
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
processor_config
.
get
(
"max_input_tiles"
,
getattr
(
config
,
"max_dynamic_patch"
,
1
),
)
if
dynamic_image_size
is
None
:
dynamic_image_size
=
processor_config
.
get
(
"dynamic_image_size"
,
getattr
(
config
,
"dynamic_image_size"
,
True
),
)
super
().
__init__
(
config
=
config
,
image_processor
=
image_processor
,
tokenizer
=
tokenizer
,
image_
processor
=
None
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic
_image_
size
,
image_
seq_length
=
image_seq_length
,
start_image_token
=
start_image_token
,
end_image_token
=
end_image_token
,
ctx_image_token
=
ctx
_image_
token
,
)
def
_get_transform
(
self
)
->
T
.
Compose
:
"""Override to add SigLIP normalization."""
return
build_siglip_transform
(
input_size
=
self
.
image_size
)
self
.
image_processor
:
LlamaNemotronVLEmbedImageProcessor
def
_replace
_image_tokens
(
def
get_num
_image_tokens
(
self
,
text
:
list
[
str
],
pixel_values_lst
:
list
[
torch
.
Tensor
],
)
->
list
[
str
]:
"""Override with simpler token replacement for embedding model.
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
not <image>, so there's no collision risk.
"""
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
text
=
[
t
.
replace
(
"<image>"
,
image_repl
.
full
,
1
)
for
t
in
text
]
return
text
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
image_processor
=
self
.
image_processor
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
=
calculate_nemotron_vl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
image_processor
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
image_processor
.
use_thumbnail
,
)
return
num_patches
*
self
.
image_seq_length
vllm/transformers_utils/processors/nvlm_d.py
View file @
99267c23
...
...
@@ -8,37 +8,54 @@
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers.hf
import
HfTokenizer
from
.internvl
import
Base
InternVLProcessor
from
.internvl
import
InternVLImageProcessor
,
InternVLProcessor
IMG_PAD
=
"<|vision_pad|>"
class
NVLMProcessor
(
BaseInternVLProcessor
):
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_PAD
]
class
NVLMProcessor
(
InternVLProcessor
):
def
__init__
(
self
,
image_processor
:
InternVLImageProcessor
,
tokenizer
:
HfTokenizer
,
*
,
image_seq_length
:
int
,
start_image_token
:
str
=
"<Image>"
,
end_image_token
:
str
=
"</Image>"
,
ctx_image_token
:
str
=
"<|vision_pad|>"
,
)
->
None
:
super
().
__init__
(
image_processor
=
image_processor
,
tokenizer
=
tokenizer
,
image_seq_length
=
image_seq_length
,
start_image_token
=
start_image_token
,
end_image_token
=
end_image_token
,
ctx_image_token
=
ctx_image_token
,
)
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
num_features
:
int
|
None
=
None
,
)
->
PromptUpdateDetails
[
str
]:
if
num_patches
is
None
:
raise
NotImplementedError
(
"Embedding inputs are not supported"
)
num_features
=
num_patches
*
self
.
image_seq_length
tile_pos_identifiers
=
[
f
"<tile_
{
i
}
>"
for
i
in
range
(
1
,
num_patches
)]
if
self
.
use_thumbnail
:
if
self
.
image_processor
.
use_thumbnail
:
tile_pos_identifiers
+=
[
"<tile_global_thumbnail>"
]
context_size
=
feature
_size
//
num_patches
context_size
=
num_
feature
s
//
num_patches
features
=
""
.
join
(
identifier
+
IMG_PAD
*
context_size
for
identifier
in
tile_pos_identifiers
(
identifier
+
self
.
ctx_image_token
*
context_size
)
for
identifier
in
tile_pos_identifiers
)
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
repl
=
"<Image>"
+
features
+
"</Image>"
repl
=
self
.
start_image_token
+
features
+
self
.
end_image_token
return
PromptUpdateDetails
.
select_text
(
repl
,
IMG_PAD
)
return
PromptUpdateDetails
.
select_text
(
repl
,
self
.
ctx_image_token
)
vllm/transformers_utils/processors/skyworkr1v.py
deleted
100644 → 0
View file @
525f2eeb
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
# Literal marker strings that wrap and fill the image placeholder text
# built by SkyworkR1VProcessor.get_image_repl.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Per-channel mean / std handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Build the per-tile preprocessing pipeline.

    The returned ``T.Compose`` converts the image to RGB, resizes it to a
    square of side ``input_size`` with bicubic interpolation, turns it into
    a tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    resize = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    normalize = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    return T.Compose([to_rgb, resize, T.ToTensor(), normalize])
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the grid from ``target_ratios`` closest to ``aspect_ratio``.

    Each candidate is read as ``(candidate[0], candidate[1])`` and compared
    via ``candidate[0] / candidate[1]``. A strictly smaller distance always
    wins. On an exact tie the later candidate replaces the current one only
    when the image area (``width * height``) exceeds half of the candidate
    grid's area measured in ``image_size``-sided tiles.

    Returns ``(1, 1)`` when ``target_ratios`` is empty.
    """
    pixel_count = width * height
    # Hoisted prefix of the tie-break threshold; multiplying by the two grid
    # factors afterwards keeps the original left-to-right evaluation order.
    half_tile_area = 0.5 * image_size * image_size

    chosen = (1, 1)
    smallest_gap = float("inf")
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
            continue
        if gap != smallest_gap:
            continue
        if pixel_count > half_tile_area * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) tile counts.

    With ``dynamic_image_size`` disabled both bounds collapse to 1.
    When ``use_thumbnail`` is set and the upper bound is not 1, the upper
    bound is increased by one.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower = upper = 1

    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate every ``(i, j)`` grid with ``min_num <= i * j <= max_num``.

    The unique grids are returned sorted by tile count (``i * j``).
    """
    grids = set()
    # Same nesting and insertion order as a set comprehension over
    # n / i / j, so the resulting set (and the stable sort below) match.
    for budget in range(min_num, max_num + 1):
        for cols in range(1, budget + 1):
            for rows in range(1, budget + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resized dimensions for an image.

    Returns ``(blocks, target_width, target_height)`` where the target
    dimensions are ``image_size`` times the grid chosen by
    ``find_closest_aspect_ratio``, and ``blocks`` is the grid's tile count,
    plus one when ``use_thumbnail`` is set and the grid has more than a
    single tile.
    """
    # Grid whose aspect ratio best matches the original image.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid[0], grid[1]

    blocks = cols * rows
    # A thumbnail tile is only counted for multi-tile grids.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * cols, image_size * rows
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize ``image`` to the best-matching grid and cut it into tiles.

    The image is resized to the target dimensions reported by
    ``calculate_skyworkr1v_targets`` and cropped into ``image_size``-sided
    squares in row-major order. When ``use_thumbnail`` is set and more
    than one tile was produced, the original image resized to a single
    tile is appended at the end.
    """
    orig_width, orig_height = image.size

    # Tile count here deliberately excludes the thumbnail; it is appended
    # separately below.
    blocks, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        crop_box = (left, top, left + image_size, top + image_size)
        processed_images.append(resized_img.crop(crop_box))

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile ``image`` and stack the transformed tiles into one tensor.

    Candidate grids come from ``get_skyworkr1v_target_ratios(min_num,
    max_num)``; every tile produced by ``dynamic_preprocess_skyworkr1v`` is
    passed through ``build_transform(input_size)`` before stacking.
    """
    candidate_grids = get_skyworkr1v_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store the config/tokenizer and resolve tiling defaults.

        Any of the three keyword overrides left as ``None`` falls back to
        the attribute of the same name on ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config
        image_size: int = vision_config.image_size
        patch_size: int = vision_config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Context tokens per tile: squared patches-per-side scaled by the
        # squared downsample ratio.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the ``IMG_CONTEXT`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image placeholder.

        The text is ``IMG_START``, then ``feature_size`` copies of
        ``IMG_CONTEXT``, then ``IMG_END``; ``IMG_CONTEXT`` is passed to
        ``PromptUpdateDetails.select_text`` as the selected text.
        """
        context_run = IMG_CONTEXT * feature_size
        full_text = IMG_START + context_run + IMG_END
        return PromptUpdateDetails.select_text(full_text, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve (min, max) tile counts, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate grids for the resolved (min, max) bounds."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower, upper = bounds
        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of context tokens an image of the given size expands to."""
        # The thumbnail is accounted for inside calculate_skyworkr1v_targets,
        # so it must not also widen the candidate grid range here.
        candidate_grids = self.resolve_target_ratios(use_thumbnail=False)

        tile_count, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=candidate_grids,
            use_thumbnail=self.use_thumbnail,
        )

        return tile_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its stacked tile tensor."""
        # The thumbnail is applied inside image_to_pixel_values_skyworkr1v,
        # so the bounds are resolved without it.
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        pixel_values_lst = []
        for image in images:
            pixel_values_lst.append(
                image_to_pixel_values_skyworkr1v(
                    image,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return pixel_values_lst

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one BatchFeature.

        For every image, the first remaining ``"<image>"`` occurrence in
        each text entry is replaced by that image's expanded placeholder
        before tokenization.
        """
        # Normalize both inputs to lists.
        if text is None:
            text = []
        elif not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        elif not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if len(images) != 0:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            patch_counts = [len(item) for item in pixel_values_lst]
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(patch_counts),
            }

            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                image_repl = self.get_image_repl(
                    num_patches * self.num_image_token,
                    num_patches,
                )
                expanded = []
                for entry in text:
                    expanded.append(entry.replace("<image>", image_repl.full, 1))
                text = expanded

        text_inputs = self.tokenizer(text)

        merged = {**text_inputs, **image_inputs}
        return BatchFeature(merged, tensor_type=return_tensors)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment