Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f3403243
Unverified
Commit
f3403243
authored
Mar 17, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 17, 2026
Browse files
[1/2] Move InternVL-based processors (#37260)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
2660b928
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3252 additions
and
3099 deletions
+3252
-3099
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+1
-1
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+1
-1
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_nemotron_vl.py
+1
-1
vllm/model_executor/models/eagle2_5_vl.py
vllm/model_executor/models/eagle2_5_vl.py
+1
-81
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+1
-374
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+7
-578
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+16
-1017
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_parse.py
+1
-232
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nemotron_vl.py
+4
-404
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+1
-33
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+2
-377
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/__init__.py
+18
-0
vllm/transformers_utils/processors/eagle2_5_vl.py
vllm/transformers_utils/processors/eagle2_5_vl.py
+85
-0
vllm/transformers_utils/processors/h2ovl.py
vllm/transformers_utils/processors/h2ovl.py
+390
-0
vllm/transformers_utils/processors/internvl.py
vllm/transformers_utils/processors/internvl.py
+603
-0
vllm/transformers_utils/processors/nano_nemotron_vl.py
vllm/transformers_utils/processors/nano_nemotron_vl.py
+1032
-0
vllm/transformers_utils/processors/nemotron_parse.py
vllm/transformers_utils/processors/nemotron_parse.py
+245
-0
vllm/transformers_utils/processors/nemotron_vl.py
vllm/transformers_utils/processors/nemotron_vl.py
+410
-0
vllm/transformers_utils/processors/nvlm_d.py
vllm/transformers_utils/processors/nvlm_d.py
+44
-0
vllm/transformers_utils/processors/skyworkr1v.py
vllm/transformers_utils/processors/skyworkr1v.py
+389
-0
No files found.
tests/models/multimodal/processing/test_h2ovl.py
View file @
f3403243
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.h2ovl
import
(
from
vllm.
transformers_utils.processor
s.h2ovl
import
(
calculate_h2ovl_targets
,
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
,
get_h2ovl_target_ratios
,
)
)
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
f3403243
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.internvl
import
(
from
vllm.
transformers_utils.processor
s.internvl
import
(
calculate_internvl_targets
,
calculate_internvl_targets
,
get_internvl_target_ratios
,
get_internvl_target_ratios
,
)
)
...
...
tests/models/multimodal/processing/test_nemotron_vl.py
View file @
f3403243
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.nemotron_vl
import
(
from
vllm.
transformers_utils.processor
s.nemotron_vl
import
(
calculate_nemotron_vl_targets
,
calculate_nemotron_vl_targets
,
get_nemotron_vl_target_ratios
,
get_nemotron_vl_target_ratios
,
)
)
...
...
vllm/model_executor/models/eagle2_5_vl.py
View file @
f3403243
...
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.eagle2_5_vl
import
Eagle2_5_VLProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
from
.interfaces
import
(
...
@@ -27,13 +26,9 @@ from .interfaces import (
...
@@ -27,13 +26,9 @@ from .interfaces import (
SupportsPP
,
SupportsPP
,
)
)
from
.internvl
import
(
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
)
)
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
...
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
...
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
)
)
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Builds on BaseInternVLProcessor, but resolves its settings itself:
    optional config fields fall back to defaults, and the image token is
    looked up from the config first and the tokenizer vocabulary second.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # super().__init__() is intentionally skipped to avoid config
        # manipulation; every required attribute is assigned right here.
        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config

        # A truthy ``force_image_size`` on the config replaces the vision
        # config's image size.
        image_size: int = vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Tokens per tile: squared patch grid scaled by the squared
        # downsample ratio.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Explicit arguments win; otherwise use the config value, and a
        # hard-coded default when the config does not define the field.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Return the image token ID, preferring the config over the tokenizer.

        Raises:
            ValueError: If the config has no ``image_token_index`` and the
                tokenizer vocabulary does not contain ``IMG_CONTEXT``.
        """
        config = self.config
        if hasattr(config, "image_token_index"):
            return config.image_token_index

        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for a single image.

        The text is ``IMG_CONTEXT`` repeated ``feature_size`` times, wrapped
        in ``IMG_START`` / ``IMG_END``. ``num_patches`` is part of the shared
        signature and is not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            IMG_START + context_run + IMG_END,
            IMG_CONTEXT,
        )
class
Eagle2_5_VLProcessingInfo
(
BaseInternVLProcessingInfo
):
class
Eagle2_5_VLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Eagle2.5-VL model."""
"""Processing info for Eagle2.5-VL model."""
...
...
vllm/model_executor/models/h2ovl.py
View file @
f3403243
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
from
collections.abc
import
Mapping
,
Sequence
from
collections.abc
import
Mapping
,
Sequence
import
torch
import
torch
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
...
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
ProcessorInputs
,
ProcessorInputs
,
PromptReplacement
,
PromptReplacement
,
PromptUpdate
,
PromptUpdate
,
PromptUpdateDetails
,
TimingContext
,
TimingContext
,
)
)
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.h2ovl
import
H2OVLProcessor
from
.intern_vit
import
InternVisionModel
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
InternVLChatModel
,
InternVLChatModel
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
)
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) number of patches per image.

    With ``dynamic_image_size`` disabled both bounds collapse to 1.
    Otherwise the given bounds are kept, and the upper bound grows by one
    when ``use_thumbnail`` is set and the upper bound is not 1.
    """
    if not dynamic_image_size:
        # Upper bound of 1 never receives the thumbnail increment.
        return 1, 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return the candidate tile grids for H2OVL preprocessing.

    Starts from the InternVL candidates for ``min_num``..``max_num``. When
    ``prior_aspect_ratio`` is given, only candidates are kept for which
    neither component of the prior ratio is divisible by the matching
    component of the candidate.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates

    def _differs_from_prior(ratio: tuple[int, int]) -> bool:
        # Both components must leave a non-zero remainder.
        return (
            prior_aspect_ratio[0] % ratio[0] != 0
            and prior_aspect_ratio[1] % ratio[1] != 0
        )

    return [ratio for ratio in candidates if _differs_from_prior(ratio)]
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the tile grid for an image and derive the resize target.

    Returns:
        ``(blocks, target_width, target_height, target_aspect_ratio)`` where
        ``target_aspect_ratio`` is the chosen entry of ``target_ratios``,
        the target size is ``image_size`` times each grid component, and
        ``blocks`` is the grid area, plus one when ``use_thumbnail`` is set
        and the grid area is not 1.
    """
    chosen_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    num_blocks = chosen_ratio[0] * chosen_ratio[1]
    # A thumbnail is only counted when the image is split into several tiles.
    if use_thumbnail and num_blocks != 1:
        num_blocks += 1

    return (
        num_blocks,
        image_size * chosen_ratio[0],
        image_size * chosen_ratio[1],
        chosen_ratio,
    )
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Resize an image to the chosen tile grid and cut it into square tiles.

    Returns:
        The list of ``image_size`` x ``image_size`` tiles in row-major
        order, followed by a thumbnail of the whole image when
        ``use_thumbnail`` is set and more than one tile was produced,
        together with the chosen aspect ratio.
    """
    src_width, src_height = image.size

    # The thumbnail is appended manually below, so request the tile count
    # without it.
    num_tiles, canvas_width, canvas_height, chosen_ratio = calculate_h2ovl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    canvas = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size

    tiles = []
    for index in range(num_tiles):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, chosen_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Run one tiling pass over an image.

    Returns:
        The stacked, transformed tiles and the aspect ratio chosen for
        this pass.
    """
    candidate_ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=candidate_ratios,
    )

    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, chosen_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert an image into stacked tile tensors, optionally using MSAC.

    Without MSAC a single pass is run with the caller's settings. With MSAC
    the image is processed twice, always with a thumbnail: once with a
    minimum of 1 tile and once with a minimum of 3 tiles, the second pass
    being filtered by the aspect ratio chosen in the first.
    """
    if not use_msac:
        pixel_values, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return pixel_values

    # First pass: unrestricted candidates, remember the chosen ratio.
    first_pass, first_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )

    # Second pass: candidates filtered against the first pass's ratio.
    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_ratio,
    )

    # Second-pass entries (minus the last), then first-pass entries (minus
    # the last), then the last second-pass entry.
    return torch.cat([second_pass[:-1], first_pass[:-1], second_pass[-1:]], 0)
class H2OVLProcessor(BaseInternVLProcessor):
    """InternVL-based processor for H2OVL with optional MSAC tiling.

    ``use_msac`` controls whether images are tiled in two passes (see
    ``image_to_pixel_values_h2ovl``); it defaults to ``config.use_msac``.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        # Fall back to the model config when no override is supplied.
        resolved_msac = config.use_msac if use_msac is None else use_msac
        assert isinstance(resolved_msac, bool)

        self.use_msac = resolved_msac

    @property
    def image_token_id(self) -> int:
        """Return the vocabulary ID of the ``IMG_CONTEXT`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for a single image.

        The text is ``IMG_CONTEXT`` repeated ``feature_size`` times, wrapped
        in ``IMG_START`` / ``IMG_END``. ``num_patches`` is not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            IMG_START + context_run + IMG_END,
            IMG_CONTEXT,
        )

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the (min, max) patch bounds.

        Any argument left as ``None`` is taken from the processor's own
        attribute of the same name.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tile grids for the resolved patch bounds.

        ``override_min_num``, when given, replaces the resolved lower bound
        before the candidates are generated.
        """
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower = lower if override_min_num is None else override_min_num

        return get_h2ovl_target_ratios(
            lower,
            upper,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        ``use_msac`` defaults to the processor's setting. With MSAC the
        patch counts of both passes are added and reduced by one.
        """
        if use_msac is None:
            use_msac = self.use_msac
        use_thumbnail = self.use_thumbnail

        def _targets(ratios: list[tuple[int, int]], with_thumbnail: bool):
            return calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=ratios,
                use_thumbnail=with_thumbnail,
            )

        # ``use_thumbnail=False`` in every resolve_target_ratios call below:
        # the thumbnail is applied in calculate_h2ovl_targets instead.
        if not use_msac:
            ratios = self.resolve_target_ratios(use_thumbnail=False)
            num_patches, _w, _h, _ratio = _targets(ratios, use_thumbnail)
            return num_patches * self.num_image_token

        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,
            override_min_num=1,
        )
        first_count, _w, _h, first_aspect = _targets(first_ratios, True)

        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,
            prior_aspect_ratio=first_aspect,
            override_min_num=3,
        )
        second_count, _w, _h, _ratio = _targets(second_ratios, True)

        num_patches = first_count + second_count - 1
        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its stacked tile tensor.

        MSAC is only applied when exactly one image is given; with any
        other number of images it is switched off.
        """
        use_msac = len(images) == 1 and self.use_msac

        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        pixel_values_lst = []
        for image in images:
            pixel_values_lst.append(
                image_to_pixel_values_h2ovl(
                    image,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=use_msac,
                )
            )
        return pixel_values_lst
class
H2OVLProcessingInfo
(
BaseInternVLProcessingInfo
):
class
H2OVLProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
H2OVLProcessor
:
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
H2OVLProcessor
:
return
self
.
ctx
.
init_processor
(
return
self
.
ctx
.
init_processor
(
...
...
vllm/model_executor/models/internvl.py
View file @
f3403243
...
@@ -7,16 +7,13 @@
...
@@ -7,16 +7,13 @@
# Copyright (c) 2023 OpenGVLab
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# --------------------------------------------------------
from
abc
import
ABC
,
abstractmethod
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
,
TypeVar
from
typing
import
Annotated
,
Literal
,
TypeAlias
,
TypeVar
import
numpy.typing
as
npt
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
transformers
import
BatchFeature
,
PretrainedConfig
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
...
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
...
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
)
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalFieldConfig
,
...
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
...
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
BaseProcessingInfo
,
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacement
,
PromptUpdate
,
PromptUpdate
,
PromptUpdateDetails
,
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processors.internvl
import
(
BaseInternVLProcessor
,
InternVLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
from
.interfaces
import
(
...
@@ -60,13 +58,6 @@ from .interfaces import (
...
@@ -60,13 +58,6 @@ from .interfaces import (
)
)
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
class
InternVLImagePixelInputs
(
TensorSchema
):
class
InternVLImagePixelInputs
(
TensorSchema
):
"""
"""
...
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
...
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
InternVLVideoInputs
:
TypeAlias
=
InternVLVideoPixelInputs
|
InternVLVideoEmbeddingInputs
InternVLVideoInputs
:
TypeAlias
=
InternVLVideoPixelInputs
|
InternVLVideoEmbeddingInputs
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the image-to-tensor pipeline for a square ``input_size`` input.

    The pipeline converts the image to RGB, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it
    to a tensor, and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def _to_rgb(img):
        return convert_image_mode(img, "RGB")

    steps = [
        T.Lambda(_to_rgb),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the candidate grid whose aspect ratio is closest to the image's.

    Candidates are scanned in order. On an exact tie with the current best,
    the later candidate wins only if the image area exceeds half the area
    that candidate's grid would cover at ``image_size`` per tile. With no
    candidates, ``(1, 1)`` is returned.
    """
    chosen = (1, 1)
    smallest_gap = float("inf")
    image_area = width * height

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue

        if gap != smallest_gap:
            continue

        # Tie: prefer this grid only when the image is large enough for it.
        grid_area = image_size * image_size * candidate[0] * candidate[1]
        if image_area > 0.5 * grid_area:
            chosen = candidate

    return chosen
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) number of patches per image.

    Both bounds become 1 when ``dynamic_image_size`` is off. Otherwise the
    given bounds are used, with the upper bound raised by one when
    ``use_thumbnail`` is set and the upper bound is not 1.
    """
    lower, upper = (
        (min_dynamic_patch, max_dynamic_patch) if dynamic_image_size else (1, 1)
    )

    # Count the thumbnail only when the image can be split at all.
    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid whose tile count lies in the given range.

    A grid qualifies when ``min_num <= i * j <= max_num``. The result is
    sorted by tile count in ascending order.
    """

    def _tile_count(ratio: tuple[int, int]) -> int:
        return ratio[0] * ratio[1]

    unique_ratios = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    unique_ratios.add((i, j))

    return sorted(unique_ratios, key=_tile_count)
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Pick the tile grid for an image and derive the resize target.

    Returns:
        ``(blocks, target_width, target_height)``. The target size is
        ``image_size`` times each component of the chosen grid, and
        ``blocks`` is the grid area, plus one when ``use_thumbnail`` is set
        and the grid area is not 1.
    """
    chosen_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    num_blocks = chosen_ratio[0] * chosen_ratio[1]
    # A thumbnail is only counted when the image is split into several tiles.
    if use_thumbnail and num_blocks != 1:
        num_blocks += 1

    return (
        num_blocks,
        image_size * chosen_ratio[0],
        image_size * chosen_ratio[1],
    )
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize an image to the chosen tile grid and cut it into square tiles.

    Returns:
        The ``image_size`` x ``image_size`` tiles in row-major order,
        followed by a thumbnail of the whole image when ``use_thumbnail``
        is set and more than one tile was produced.
    """
    src_width, src_height = image.size

    # The thumbnail is appended manually below, so request the tile count
    # without it.
    num_tiles, canvas_width, canvas_height = calculate_internvl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    canvas = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size

    tiles = []
    for index in range(num_tiles):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and return the transformed tiles as one stacked tensor."""
    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=candidate_ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert video frames into one stacked tensor of transformed frames.

    Every frame is read as an RGB image and passed through the same tiling
    step as still images; each frame must yield exactly one tile, which is
    enforced with an assertion.
    """
    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    frames: list[Image.Image] = []
    for raw_frame in video:
        frame_tiles = dynamic_preprocess_internvl(
            Image.fromarray(raw_frame, mode="RGB"),
            target_ratios=candidate_ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        assert len(frame_tiles) == 1
        frames.extend(frame_tiles)

    return torch.stack([to_tensor(frame) for frame in frames])
class BaseInternVLProcessor(ABC):
    """
    Hand-written replacement for the HF processor that InternVL does not ship.

    Subclasses provide the image token id and the replacement text for an
    ``<image>`` placeholder; this base class handles tiling configuration,
    pixel-value extraction and prompt expansion.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store the config/tokenizer and resolve the default tiling settings.

        Each tiling keyword overrides the value of the same name on
        ``config``; when left as ``None`` the config value is used.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Explicit arguments win; the config is only consulted as a fallback.
        min_dynamic_patch = (
            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Number of context tokens produced by one image tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Vocabulary id of the image context token (subclass-defined)."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the text that replaces one ``<image>`` placeholder."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the ``(min_num, max_num)`` tile bounds.

        Any argument left as ``None`` falls back to the value stored on the
        processor; the result is delegated to ``resolve_internvl_min_max_num``.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tiling ratios for the resolved tile bounds."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower, upper = bounds
        return get_internvl_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of context tokens an image of this size produces."""
        # The thumbnail is accounted for in calculate_internvl_targets,
        # so it must not also influence the ratio list.
        ratios = self.resolve_target_ratios(use_thumbnail=False)

        tile_count, _width, _height = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return tile_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert every image into its tensor of tiles, one tensor per image."""
        # The thumbnail is applied inside image_to_pixel_values_internvl,
        # so it is excluded when resolving the tile bounds.
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        pixel_values_lst = []
        for image in images:
            pixel_values_lst.append(
                image_to_pixel_values_internvl(
                    image,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return pixel_values_lst

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand ``<image>`` placeholders in the prompts.

        Returns the updated prompts and the image tensors to merge into the
        processor output (an empty dict when there are no images).
        """
        if len(images) == 0:
            return text, {}

        pixel_values_lst = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        patches_per_image = [len(item) for item in pixel_values_lst]
        image_inputs = {
            "pixel_values_flat": torch.cat(pixel_values_lst),
            "image_num_patches": torch.tensor(patches_per_image),
        }

        # Each image consumes the first remaining "<image>" of every prompt.
        for tiles in pixel_values_lst:
            tile_count = tiles.shape[0]
            image_repl = self.get_image_repl(
                tile_count * self.num_image_token, tile_count
            )
            text = [
                prompt.replace("<image>", image_repl.full, 1) for prompt in text
            ]

        return text, image_inputs

    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
        """Normalize ``None`` / a single item / a list into a list."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Expand image placeholders, tokenize, and bundle everything together."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        # Image entries are merged after the tokenizer output.
        combined_outputs = {**text_inputs, **image_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        """Initialize the image processor and remember the video token.

        Args:
            config: Model configuration, forwarded to the base class.
            tokenizer: Tokenizer, forwarded to the base class.
            min_dynamic_patch: Optional tiling override (base class).
            max_dynamic_patch: Optional tiling override (base class).
            dynamic_image_size: Optional tiling override (base class).
            video_token: Context token used for video frames, or ``None``
                when the model has no video support.
        """
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # add extra video token for video processing
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT`` (raises ``KeyError`` if absent)."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or ``None`` if unset/unknown."""
        if self.video_token is None:
            return None
        return self.tokenizer.get_vocab().get(self.video_token, None)

    @property
    def supports_video(self) -> bool:
        """Whether a video token is configured and present in the vocabulary."""
        return self.video_token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert every video into a tensor of frames, one tensor per video."""
        # Tile bounds are pinned to 1 for video, and no thumbnail is added.
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        return [
            video_to_pixel_values_internvl(
                video,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=False,
            )
            for video in videos
        ]

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ):
        """Extract frames and expand ``<video>`` placeholders in the prompts.

        Returns ``(text, video_inputs)``. ``video_inputs`` is empty when no
        videos were given or when the processor has no usable video token;
        in that case the prompts are returned unchanged.
        """
        if len(videos) == 0 or not self.supports_video:
            return text, {}

        pixel_values_lst_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        video_inputs = {
            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
            "video_num_patches": torch.tensor(
                [len(item) for item in pixel_values_lst_video]
            ),
        }

        # Each video consumes the first remaining "<video>" of every prompt.
        for pixel_values in pixel_values_lst_video:
            num_patches = pixel_values.shape[0]

            video_repl = self.get_video_repl(
                self.num_image_token, num_patches, self.video_token
            )
            text = [t.replace("<video>", video_repl.full, 1) for t in text]

        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Expand image and video placeholders, tokenize, and bundle the result."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        videos = self._make_batch_input(videos)

        # Images are expanded first, then videos, on the already-updated text.
        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return ``IMG_START + IMG_CONTEXT * feature_size + IMG_END``.

        ``num_patches`` is accepted for interface compatibility and not used.
        """
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None = None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Return the replacement text for one ``<video>`` placeholder.

        Every frame contributes ``"Frame{k}: "`` followed by
        ``self.num_image_token`` copies of ``video_context_token`` wrapped in
        ``IMG_START`` / ``IMG_END``.

        Args:
            feature_size: Not used by this implementation; the per-frame
                token count always comes from ``self.num_image_token``.
                NOTE(review): confirm whether callers expect this value to
                be honored.
            num_patches: Number of frames in the video. Must be provided.
            video_context_token: Token repeated for each frame feature.

        Raises:
            TypeError: If ``num_patches`` is ``None``.
        """
        if num_patches is None:
            # Previously this surfaced as an opaque error from range(None).
            raise TypeError("num_patches (the number of video frames) is required")

        repl_features = video_context_token * self.num_image_token
        repl_features_with_sep = IMG_START + repl_features + IMG_END
        # num_patches is equal to num_frames
        repl_full = "".join(
            f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)
        )

        return PromptUpdateDetails.select_text(repl_full, video_context_token)
class
BaseInternVLProcessingInfo
(
BaseProcessingInfo
):
class
BaseInternVLProcessingInfo
(
BaseProcessingInfo
):
"""Basic image-only ProcessingInfo for InternVL-style models."""
"""Basic image-only ProcessingInfo for InternVL-style models."""
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
f3403243
...
@@ -8,22 +8,15 @@
...
@@ -8,22 +8,15 @@
# --------------------------------------------------------
# --------------------------------------------------------
import
copy
import
copy
import
math
import
warnings
import
warnings
from
abc
import
ABC
,
abstractmethod
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
functools
import
cached_property
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
,
TypeVar
from
typing
import
Annotated
,
Literal
,
TypeAlias
,
TypeVar
import
einops
import
numpy
as
np
import
numpy.typing
as
npt
import
regex
as
re
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
PIL
import
Image
from
transformers
import
BatchFeature
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
...
@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import (
...
@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import (
SupportsMultiModal
,
SupportsMultiModal
,
SupportsMultiModalPruning
,
SupportsMultiModalPruning
,
)
)
from
vllm.model_executor.models.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
,
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.nemotron_h
import
NemotronHForCausalLM
from
vllm.model_executor.models.nemotron_h
import
NemotronHForCausalLM
from
vllm.model_executor.models.parakeet
import
ParakeetExtractor
,
ProjectedParakeet
from
vllm.model_executor.models.parakeet
import
ParakeetExtractor
,
ProjectedParakeet
...
@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import (
...
@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import (
BaseProcessingInfo
,
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacement
,
PromptUpdate
,
PromptUpdate
,
PromptUpdateDetails
,
_seq2tokens
,
)
)
from
vllm.renderers
import
TokenizeParams
from
vllm.renderers
import
TokenizeParams
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
,
cached_tokenizer_from_config
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.processors.nano_nemotron_vl
import
(
AUDIO_CONTEXT
,
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseNanoNemotronVLProcessor
,
DynamicResolutionImageTiler
,
NanoNemotronVLProcessor
,
get_internvl_target_ratios
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.utils
import
_merge_multimodal_embeddings
from
.utils
import
_merge_multimodal_embeddings
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
MAX_AUDIO_LEN_S
=
10
*
60
# 10 minutes
Image
.
MAX_IMAGE_PIXELS
=
None
# Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
class
NanoNemotronVLAudioFeatureInputs
(
TensorSchema
):
class
NanoNemotronVLAudioFeatureInputs
(
TensorSchema
):
...
@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema):
...
@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema):
audio_feature_lengths
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
)]
audio_feature_lengths
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
)]
# Longest supported audio clip, in seconds (10 minutes).
MAX_AUDIO_LEN_S = 10 * 60

# Markers that delimit and fill an image span in the prompt.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"

# Markers that delimit and fill an audio span in the prompt.
AUDIO_START = "<so_start>"
AUDIO_END = "<so_end>"
AUDIO_CONTEXT = "<so_embedding>"

# Profiling
# MAX_FRAMES = 16
# Tile budget used when the caller does not specify max_num_tiles.
DEFAULT_NUM_TILES = 12
class
NanoNemotronVLImagePixelInputs
(
TensorSchema
):
class
NanoNemotronVLImagePixelInputs
(
TensorSchema
):
"""
"""
Dimensions:
Dimensions:
...
@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = (
...
@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = (
)
)
def dynamic_preprocess(
    image,
    *,
    image_size=512,
    max_num_tiles=12,
    use_thumbnail=True,
    idx=0,
):
    """Resize an image to its best tile grid and cut it into square tiles.

    The image is resized (bicubic, antialiased) to the target size chosen by
    ``calculate_internvl_targets`` and split into ``image_size`` x
    ``image_size`` tiles scaled by ``1 / 255``. When ``use_thumbnail`` is
    set and more than one tile was produced, a whole-image thumbnail tile is
    appended. ``idx`` is accepted but not used.

    Returns:
        A list of per-tile tensors, each ``(C, image_size, image_size)``.
    """

    def _bicubic_resize(tensor, height, width):
        # Shared resize settings for both the tile grid and the thumbnail.
        return torch.nn.functional.interpolate(
            tensor,
            size=(height, width),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    orig_width, orig_height = image.size

    _blocks, target_width, target_height = calculate_internvl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=get_internvl_target_ratios(1, max_num_tiles),
        image_size=image_size,
        use_thumbnail=False,
    )

    rgb_image = image.convert("RGB") if image.mode != "RGB" else image
    pixels = np.asarray(rgb_image, dtype=np.uint8)

    # (H, W, 3) -> (1, H, W, 3) -> (1, 3, H, W)
    batched = torch.from_numpy(pixels).unsqueeze(0).permute(0, 3, 1, 2)

    resized = _bicubic_resize(batched, target_height, target_width)
    batch, channels, height, width = resized.shape
    rows, cols = height // image_size, width // image_size

    # Split the grid into (rows * cols) tiles, row-major.
    grid = resized.reshape(batch, channels, rows, image_size, cols, image_size)
    tiles = grid.permute(0, 2, 4, 1, 3, 5).reshape(
        batch * rows * cols, channels, image_size, image_size
    )
    tiles = tiles / 255.0

    if use_thumbnail and tiles.shape[0] > 1:
        thumbnail = _bicubic_resize(batched, image_size, image_size) / 255.0
        tiles = torch.cat([tiles, thumbnail], dim=0)

    return list(tiles)
def image_to_pixel_values(
    image: Image.Image,
    *,
    input_size: int,
    max_num: int,
    use_thumbnail: bool,
    idx: int,
) -> torch.Tensor:
    """Tile ``image`` with ``dynamic_preprocess`` and stack the tiles.

    Returns:
        A single tensor whose first dimension indexes the tiles.
    """
    return torch.stack(
        dynamic_preprocess(
            image,
            image_size=input_size,
            max_num_tiles=max_num,
            use_thumbnail=use_thumbnail,
            idx=idx,
        )
    )
def video_to_pixel_values(
    video: npt.NDArray,
    *,
    input_size: int,
    max_num_tiles: int = 1,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a ``(num_frames, H, W, C)`` video array into pixel values.

    Frames are moved to channels-first layout, resized to
    ``input_size`` x ``input_size`` (bicubic, antialiased) only when their
    spatial size differs, and divided by 255. ``use_thumbnail`` is accepted
    but not used.

    Raises:
        AssertionError: If ``max_num_tiles`` is not 1.
    """
    assert max_num_tiles == 1, "Video modality always uses one tile"

    # (num_frames, H, W, C) -> (num_frames, C, H, W)
    frames = torch.from_numpy(video).permute(0, 3, 1, 2)

    current_hw = (frames.shape[2], frames.shape[3])
    if current_hw != (input_size, input_size):
        frames = torch.nn.functional.interpolate(
            frames,
            size=(input_size, input_size),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    return frames / 255.0
def input_conditioner(x, norm_mean, norm_std):
    """Normalize ``x`` by subtracting ``norm_mean`` and dividing by ``norm_std``."""
    centered = x - norm_mean
    return centered / norm_std
def
calculate_timestamps
(
indices
:
list
[
int
]
|
torch
.
Tensor
,
frame_duration_ms
:
int
,
):
if
not
isinstance
(
indices
,
list
):
indices
=
indices
.
tolist
()
timestamps
=
[
int
(
i
)
*
frame_duration_ms
/
1000.0
for
i
in
indices
]
return
timestamps
class DynamicResolutionImageTiler:
    """Resize images to a dynamic resolution that fits a token budget.

    Each image is resized to a whole number of ``patch_size`` patches so
    that the combined patch count of all images in a prompt stays within
    the budget derived from ``max_model_len``.
    """

    CONV_MERGING = False
    PIXEL_SHUFFLE = True
    USE_THUMBNAIL = False

    def __init__(
        self,
        *,
        max_model_len: int,
        patch_size: int,
        min_num_patches: int,
        max_num_patches: int,
        downsample_ratio: float,
        norm_mean: Sequence[float],
        norm_std: Sequence[float],
        factor_max: float = 1.0,
        use_thumbnail: bool = False,
    ) -> None:
        """Store the tiling limits and normalization constants.

        Args:
            max_model_len: Model context length used to derive the budget.
            patch_size: Side length of one vision patch, in pixels.
            min_num_patches: Lower bound on patches per image.
            max_num_patches: Upper bound on patches per image; a value
                ``<= 0`` means unbounded.
            downsample_ratio: Must be below 1 and equal to ``1 / 2``.
            norm_mean: Per-channel mean, stored with shape ``(3, 1, 1)``.
            norm_std: Per-channel std, stored with shape ``(3, 1, 1)``.
            factor_max: Upper bound on the resize factor applied to the
                image's own patch grid.
            use_thumbnail: Must be ``False``; thumbnails are not supported.
        """
        assert use_thumbnail is False, "use_thumbnail is not supported"
        self._patch_size: int = patch_size
        self._max_model_len = max_model_len
        self._min_num_patches = min_num_patches
        self._max_num_patches = max_num_patches if max_num_patches > 0 else float("inf")
        self._factor_max = factor_max
        self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
        self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)
        assert downsample_ratio < 1
        reduction_factor = 1 / downsample_ratio
        assert reduction_factor == 2.0
        # One 2x reduction per enabled merging stage (booleans add as 0/1).
        self._downsample_ratio = int(reduction_factor) ** (
            self.PIXEL_SHUFFLE + self.CONV_MERGING
        )
        assert self._downsample_ratio == 2

    def _get_num_embeddings(self, width: int, height: int) -> int:
        """Return the number of tokens a ``width`` x ``height`` image yields."""
        num_patches = (width // self._patch_size) * (height // self._patch_size)
        num_tokens = num_patches // (self._downsample_ratio**2)
        return num_tokens

    def width_and_height_for_max_num_tokens_available(
        self,
        target_num_tokens_post_shuffle: int,
    ) -> tuple[int, int]:
        """
        TODO: optimize this so it squeezes closer to target number of tokens.
        Calculate image dimensions that produce approximately `target` tokens after
        pixel_shuffle.

        With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
        need 4*B patches to get B tokens.

        Examples:
            >>> PATCH_SIZE = 16
            >>> DOWNSAMPLE_RATIO = 0.5
            >>> tiler = DynamicResolutionImageTiler(
            ...     max_model_len=16384,
            ...     patch_size=PATCH_SIZE,
            ...     downsample_ratio=DOWNSAMPLE_RATIO,
            ...     min_num_patches=4,
            ...     max_num_patches=0,
            ...     norm_mean=(0.5, 0.5, 0.5),
            ...     norm_std=(0.5, 0.5, 0.5),
            ... )
            >>> width, height = tiler.width_and_height_for_max_num_tokens_available(
            ...     target_num_tokens_post_shuffle=8192,
            ... )
            >>> assert (width, height) == (2880, 2880)
            >>> assert (width // PATCH_SIZE) * (
            ...     height // PATCH_SIZE
            ... ) // 2**2 == 8100  # tokens post-shuffle
            >>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
        """
        side_pixels = (
            math.isqrt(target_num_tokens_post_shuffle)
            * self._downsample_ratio
            * self._patch_size
        )
        assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
        return side_pixels, side_pixels

    def max_num_tokens_available(self, text_prompt_length: int) -> int:
        """Return the tokens left for images after the text prompt.

        Four tokens are held back in addition to the prompt length.
        """
        return self._max_model_len - text_prompt_length - 4

    def _images_to_pixel_values_lst(
        self,
        text_prompt_length: int,
        images: list[Image.Image],
    ) -> tuple[list[torch.Tensor], list[int]]:
        """Resize all images within the budget left by the text prompt.

        Returns:
            ``(pixel_values, feature_sizes)``: one 3-D tensor per image and
            the matching number of embeddings for each.
        """
        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
        params_per_image = self.compute_params(images, num_tokens_available)

        pixel_values: list[torch.Tensor] = []
        feature_sizes: list[int] = []
        for param in params_per_image:
            for t in self.apply_params(param):
                assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
                pixel_values.append(t)
                feature_sizes.append(param.num_embeddings)
        return pixel_values, feature_sizes

    # Maps id(image) -> number of embeddings computed by compute_params.
    # Class-level, so it is shared by every tiler instance.
    feature_size_cache: dict[int, int] = {}

    @classmethod
    def get_cached_feature_size(cls, image: Image.Image) -> int:
        """Return and remove the cached feature size for ``image``.

        Raises:
            KeyError: If no entry exists, e.g. when it was already consumed.
        """
        # Popping enforces that each cached feature size is used only once.
        return cls.feature_size_cache.pop(id(image))

    @dataclass
    class DynamicResolutionParams:
        # Image these parameters were computed for.
        media: Image.Image
        # Number of tiles; process_media always sets this to 1.
        num_tiles: int
        # Number of embeddings produced by the resized image.
        num_embeddings: int
        # Target grid as (patches along width, patches along height).
        patch_size: tuple[int, int]

    def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
        """Resize ``params.media`` to its target grid and scale by ``1 / 255``.

        Returns:
            A one-element list holding the ``(3, H, W)`` resized image.
        """
        # interpolate expects (height, width); patch_size is (width, height).
        target_size = (
            params.patch_size[1] * self._patch_size,
            params.patch_size[0] * self._patch_size,
        )
        media = params.media
        image = np.asarray(
            media.convert("RGB") if media.mode != "RGB" else media,
            dtype=np.uint8,
        )
        resized_img = (
            torch.nn.functional.interpolate(
                torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
                size=target_size,
                mode="bicubic",
                align_corners=False,
                antialias=True,
            )
            / 255.0
        )
        return list(resized_img)

    def process_media(
        self,
        media: Image.Image,
        num_tokens_available: int,
    ) -> tuple[DynamicResolutionParams, int]:
        """Process a single media item and return its parameters.

        Args:
            media: The media item to process
            num_tokens_available: Patch budget available for this media

        Returns:
            A tuple ``(params, token_count)`` where ``params`` is the
            DynamicResolutionParams for the media and ``token_count`` is the
            number of patches in the chosen grid.
        """
        assert isinstance(media, Image.Image), (
            "Dynamic resolution is only supported for image media"
        )
        orig_width, orig_height = media.width, media.height
        closest_patch_height = round(orig_height / self._patch_size + 0.5)
        closest_patch_width = round(orig_width / self._patch_size + 0.5)
        patches = closest_patch_height * closest_patch_width

        factor = min(math.sqrt(num_tokens_available / patches), self._factor_max)
        # Keep at least one patch per side. For extreme aspect ratios the
        # floor can reach 0, which previously led to a ZeroDivisionError
        # below or to a zero-sized resize target.
        target_patch_height = max(1, math.floor(factor * closest_patch_height))
        target_patch_width = max(1, math.floor(factor * closest_patch_width))

        # Consider self._min_num_patches if > current_num_tokens_available.
        if (
            num_tokens_available > self._min_num_patches
            and target_patch_height * target_patch_width < self._min_num_patches
        ):
            up_factor = math.sqrt(
                self._min_num_patches / (target_patch_height * target_patch_width)
            )
            target_patch_height = math.ceil(up_factor * target_patch_height)
            target_patch_width = math.ceil(up_factor * target_patch_width)

        # Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
        # or by 4 when BOTH are enabled (two successive 2x reductions)
        if self.PIXEL_SHUFFLE or self.CONV_MERGING:
            required_divisor = 4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2

            rem_h = target_patch_height % required_divisor
            if rem_h != 0:
                inc_h = required_divisor - rem_h
                # Round up when the budget allows it, otherwise round down.
                if (
                    target_patch_height + inc_h
                ) * target_patch_width <= num_tokens_available:
                    target_patch_height += inc_h
                else:
                    target_patch_height = max(
                        required_divisor, target_patch_height - rem_h
                    )

            rem_w = target_patch_width % required_divisor
            if rem_w != 0:
                inc_w = required_divisor - rem_w
                if (
                    target_patch_height * (target_patch_width + inc_w)
                    <= num_tokens_available
                ):
                    target_patch_width += inc_w
                else:
                    target_patch_width = max(
                        required_divisor, target_patch_width - rem_w
                    )

        # Calculate embeddings for the main dynamic resolution image
        num_embeddings = self._get_num_embeddings(
            target_patch_width * self._patch_size,
            target_patch_height * self._patch_size,
        )

        token_count = target_patch_width * target_patch_height

        # Only the base dynamic-resolution image is produced (no thumbnail).
        num_tiles = 1

        return self.DynamicResolutionParams(
            media=media,
            num_tiles=num_tiles,
            num_embeddings=num_embeddings,
            patch_size=(target_patch_width, target_patch_height),
        ), token_count

    def compute_params(
        self,
        media_list: list[Image.Image],
        num_tokens_available: int | None = None,
    ) -> list[DynamicResolutionParams]:
        """Compute parameters for all media with iterative token budgeting.

        Args:
            media_list: List of media items to process
            num_tokens_available: Total number of tokens available across all media

        Returns:
            List of DynamicResolutionParams, one for each media item

        Raises:
            ValueError: If no assignment within budget is found after the
                bounded number of attempts.
        """
        # Convert the token budget into a patch budget: each enabled
        # merging stage turns 4 patches into 1 token.
        num_tokens_available = (
            num_tokens_available
            * (4 if self.PIXEL_SHUFFLE else 1)
            * (4 if self.CONV_MERGING else 1)
        )
        # When the number of available token is too small,
        # allow self._min_num_patches per media and let the sample be truncated.
        num_tokens_available = max(
            num_tokens_available, self._min_num_patches * len(media_list)
        )

        # Clip the number of tokens available per media to >min and <max patches.
        per_media_budget = max(
            min(num_tokens_available, self._max_num_patches),
            self._min_num_patches,
        )
        num_tokens_available_per_media = [per_media_budget] * len(media_list)

        # prevent infinite loop in any case
        for _ in range(10):
            # Step 1: Process each media with current token budget
            params = []
            token_counts = []

            for media, tokens_for_media in zip(
                media_list, num_tokens_available_per_media
            ):
                param, token_count = self.process_media(media, tokens_for_media)
                params.append(param)
                token_counts.append(token_count)
                # Remember the size for get_cached_feature_size.
                self.feature_size_cache[id(param.media)] = param.num_embeddings

            # Step 2: Check if total tokens is within budget
            total_tokens = sum(token_counts)

            if total_tokens <= num_tokens_available:
                # We're within budget, return the params
                return params

            # Step 3: We're over budget, need to scale down
            # Calculate scaling factor to get under budget
            scaling_factor = num_tokens_available / total_tokens

            # Recalculate token budgets for each media based on scaling
            # Each media gets a proportional share of the total budget
            scaled_down_num_tokens_available_per_media = [
                max(self._min_num_patches, int(token_count * scaling_factor))
                for token_count in token_counts
            ]
            scaled_down = any(
                scaled < current
                for scaled, current in zip(
                    scaled_down_num_tokens_available_per_media,
                    num_tokens_available_per_media,
                )
            )
            # If there wasn't scaling down, we're stuck with min_num_patches per media,
            # else try with the scaled down num_tokens_available_per_media.
            if not scaled_down:
                num_tokens_available_per_media = [self._min_num_patches] * len(
                    media_list
                )
            else:
                num_tokens_available_per_media = (
                    scaled_down_num_tokens_available_per_media
                )

        ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
        raise ValueError(
            f"Should be unreachable - `return params` above must be reached: {ctx}"
        )

    @staticmethod
    def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
        """Flatten images into patch rows and concatenate them.

        Each ``(c, H, W)`` image becomes ``(num_patches, c * patch_size**2)``;
        all images are concatenated along the patch axis and a leading
        dimension of size 1 is added.
        """
        assert len(images) > 0, "No images to stack"

        def rearrange_img(x):
            py = x.shape[-2] // patch_size
            px = x.shape[-1] // patch_size
            return einops.rearrange(
                x,
                "c (py yy) (px xx) -> (py px) (c yy xx)",
                py=py,
                yy=patch_size,
                px=px,
                xx=patch_size,
            )

        imgs = [rearrange_img(img) for img in images]
        pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)

        return pixel_values_flat
class
BaseNanoNemotronVLProcessor
(
ABC
):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
*
args
,
max_model_len
:
int
,
max_num_tiles
:
int
|
None
=
None
,
**
kwargs
,
)
->
None
:
super
().
__init__
()
self
.
config
=
config
self
.
tokenizer
=
tokenizer
self
.
max_num_tiles
=
max_num_tiles
or
DEFAULT_NUM_TILES
image_size
:
int
=
config
.
force_image_size
patch_size
:
int
=
config
.
patch_size
downsample_ratio
:
int
=
config
.
downsample_ratio
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
)
)
self
.
image_size
=
image_size
self
.
use_thumbnail
:
bool
=
config
.
use_thumbnail
self
.
norm_mean
=
torch
.
Tensor
(
config
.
norm_mean
).
reshape
(
1
,
3
,
1
,
1
)
self
.
norm_std
=
torch
.
Tensor
(
config
.
norm_std
).
reshape
(
1
,
3
,
1
,
1
)
self
.
dynamic_tiler
:
DynamicResolutionImageTiler
|
None
=
None
if
self
.
use_dynamic_resolution
(
config
):
self
.
dynamic_tiler
=
DynamicResolutionImageTiler
(
max_model_len
=
max_model_len
,
patch_size
=
patch_size
,
downsample_ratio
=
downsample_ratio
,
min_num_patches
=
config
.
vision_config
.
args
[
"min_num_patches"
],
max_num_patches
=
config
.
vision_config
.
args
[
"max_num_patches"
],
norm_mean
=
config
.
norm_mean
,
norm_std
=
config
.
norm_std
,
)
@
staticmethod
def
use_dynamic_resolution
(
config
:
PretrainedConfig
)
->
bool
:
return
"min_num_patches"
in
config
.
vision_config
.
args
@
property
@
abstractmethod
def
image_token_id
(
self
)
->
int
:
raise
NotImplementedError
@
abstractmethod
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
raise
NotImplementedError
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
max_num_tiles
:
int
,
)
->
int
:
target_ratios
=
get_internvl_target_ratios
(
1
,
max_num_tiles
)
num_patches
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
target_ratios
=
target_ratios
,
image_size
=
self
.
image_size
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
max_num_tiles
:
int
,
)
->
list
[
torch
.
Tensor
]:
return
[
image_to_pixel_values
(
image
,
input_size
=
self
.
image_size
,
max_num
=
max_num_tiles
,
use_thumbnail
=
self
.
use_thumbnail
,
idx
=
idx
,
)
for
idx
,
image
in
enumerate
(
images
)
]
def
_preprocess_image
(
self
,
text
:
list
[
str
],
images
:
list
[
Image
.
Image
],
max_num_tiles
:
int
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
Any
]]:
if
len
(
images
)
==
0
:
image_inputs
=
{}
return
text
,
image_inputs
if
tiler
:
=
self
.
dynamic_tiler
:
sans_images
=
text
[
0
].
replace
(
"<image>"
,
""
)
text_prompt_length
=
len
(
self
.
tokenizer
(
sans_images
,
add_special_tokens
=
False
).
input_ids
)
pixel_values_lst
,
num_tokens_per_image
=
tiler
.
_images_to_pixel_values_lst
(
text_prompt_length
=
text_prompt_length
,
images
=
images
,
)
imgs_sizes
=
[(
pv
.
shape
[
-
2
],
pv
.
shape
[
-
1
])
for
pv
in
pixel_values_lst
]
normalized
=
[
input_conditioner
(
img
,
tiler
.
norm_mean
,
tiler
.
norm_std
)
for
img
in
pixel_values_lst
]
image_num_patches
=
torch
.
tensor
([
1
]
*
len
(
num_tokens_per_image
))
image_inputs
=
{
"pixel_values_flat"
:
normalized
,
"imgs_sizes"
:
imgs_sizes
,
"num_tokens_per_image"
:
num_tokens_per_image
,
}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
max_num_tiles
)
image_num_patches
=
torch
.
tensor
([
len
(
item
)
for
item
in
pixel_values_lst
])
pixel_values_flat
=
input_conditioner
(
torch
.
cat
(
pixel_values_lst
),
self
.
norm_mean
,
self
.
norm_std
)
image_inputs
=
{
"pixel_values_flat"
:
pixel_values_flat
,
"image_num_patches"
:
image_num_patches
,
}
num_tokens_per_image
=
[
self
.
num_image_token
*
len
(
item
)
for
item
in
pixel_values_lst
]
assert
len
(
text
)
==
1
,
(
"hf_processor is called on the output of get_dummy_text, "
"which should be a single string"
)
parts
=
[
x
for
x
in
re
.
split
(
r
"(<image>)"
,
text
[
0
])
if
x
]
assert
parts
.
count
(
"<image>"
)
==
len
(
pixel_values_lst
),
(
"the number of <image> tokens in the text should be the "
"same as the number of images"
)
for
i
,
(
feature_size
,
num_patches
)
in
enumerate
(
zip
(
num_tokens_per_image
,
image_num_patches
,
strict
=
True
)
):
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
parts
[
i
]
=
parts
[
i
].
replace
(
"<image>"
,
image_repl
.
full
)
text
=
[
""
.
join
(
parts
)]
return
text
,
image_inputs
def
_make_batch_input
(
self
,
input_item
:
Any
|
list
[
Any
]
|
None
=
None
):
if
input_item
is
None
:
input_item
=
[]
if
not
isinstance
(
input_item
,
list
):
input_item
=
[
input_item
]
return
input_item
@abstractmethod
def __call__(
    self,
    text: str | list[str] | None = None,
    images: Image.Image | list[Image.Image] | None = None,
    return_tensors: str | TensorType | None = None,
    max_num_tiles: int | None = None,
) -> BatchFeature:
    """Process text and images into model inputs (abstract).

    Concrete subclasses must override this method; this base
    implementation only raises.

    Args:
        text: A prompt string, a list of prompt strings, or ``None``.
        images: A single PIL image, a list of PIL images, or ``None``.
        return_tensors: Requested tensor type for the returned batch.
        max_num_tiles: Optional per-call tile limit.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
    """
    HF Processor with extended video processing logic.
    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers

    On top of the image handling of the base class, this processor expands
    ``<video>`` placeholders and audio placeholders in the prompt and
    returns the corresponding video / audio tensors.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        max_model_len: int,
        max_num_tiles: int | None = None,
        video_token: str | None = None,
        video_pruning_rate: float | None = None,
    ) -> None:
        """Set up video/audio support on top of the base image processor.

        Args:
            config: Model config; ``sound_config`` is read from it if present.
            tokenizer: Tokenizer used to pre-tokenize the image marker tokens.
            max_model_len: Forwarded to the base class.
            max_num_tiles: Forwarded to the base class.
            video_token: Token string identifying videos, or ``None``.
            video_pruning_rate: EVS pruning rate; pruning logic is only used
                when this is not ``None`` and greater than ``0.0``.
        """
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            max_model_len=max_model_len,
            max_num_tiles=max_num_tiles,
        )
        # add extra video token for video processing
        self.video_token = video_token
        self.video_pruning_rate = video_pruning_rate

        # The audio extractor is only created when the config carries a
        # ``sound_config``; otherwise audio inputs are not supported.
        self.audio_extractor: ParakeetExtractor | None = None
        raw_sound_config = getattr(config, "sound_config", None)
        if raw_sound_config is not None:
            self.audio_extractor = ParakeetExtractor(raw_sound_config)

        # Pre-tokenize special tokens for video processing
        # to avoid repeated tokenization
        self._img_start_token_ids = tokenizer.encode(
            IMG_START, add_special_tokens=False
        )
        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
        self._img_context_token_ids = tokenizer.encode(
            IMG_CONTEXT, add_special_tokens=False
        )

    @property
    def supports_video(self) -> bool:
        """Whether a video token is configured and known to the tokenizer."""
        return self.video_token_id is not None

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or ``None`` if unavailable."""
        if self.video_token is None:
            return None
        return self.tokenizer.get_vocab().get(self.video_token, None)

    @property
    def image_token_id(self) -> int:
        """Token id of ``IMG_CONTEXT`` according to the tokenizer."""
        return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        max_num_tiles: int,
    ) -> list[torch.Tensor]:
        """Convert each video array to pixel values via ``video_to_pixel_values``."""
        return [
            video_to_pixel_values(
                video,
                input_size=self.image_size,
                max_num_tiles=max_num_tiles,
                use_thumbnail=self.use_thumbnail,
            )
            for video in videos
        ]

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[tuple[npt.NDArray, dict[str, Any]]],
        max_num_tiles: int,
    ):
        """Expand ``<video>`` placeholders and build the video model inputs.

        Args:
            text: Prompt strings; one ``<video>`` occurrence per video is
                replaced in every string.
            videos: ``(frames, metadata)`` pairs. The metadata must provide
                the ``"fps"`` and ``"frames_indices"`` keys.
            max_num_tiles: Tile limit forwarded to the frame conversion.

        Returns:
            ``(text, video_inputs)``; ``video_inputs`` is empty when there
            are no videos or video is not supported.
        """
        if len(videos) == 0 or not self.supports_video:
            video_inputs = {}
        else:
            videos_lst = [v[0] for v in videos]
            video_metadata_lst = [v[1] for v in videos]
            pixel_values_lst_video = self._videos_to_pixel_values_lst(
                videos_lst,
                max_num_tiles=max_num_tiles,
            )
            # We use frame duration in milliseconds (as integer) to ensure
            # we have consistent timestamps calculation. At preprocessing
            # fps parameter is given in fp32, while at inference it is bf16
            # which leads to inaccurate timestamp calculation and causes
            # timestamp values to differ. In rare cases this causes
            # mismatching number of output tokens for tokenized frame prefixes
            frame_duration_ms_lst = [
                int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
            ]
            frames_indices_lst = [
                metadata["frames_indices"] for metadata in video_metadata_lst
            ]
            video_num_patches = torch.tensor(
                [len(item) for item in pixel_values_lst_video]
            )
            video_inputs = {
                "pixel_values_flat_video": input_conditioner(
                    torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
                ),
                "video_num_patches": video_num_patches,
                "frames_indices": frames_indices_lst,
                "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
            }

            image_size: int = self.config.force_image_size
            patch_size: int = self.config.patch_size
            downsample_ratio = self.config.downsample_ratio
            # Number of placeholder tokens for one un-pruned frame.
            tokens_in_single_frame = int(
                (image_size * image_size // patch_size**2) * (downsample_ratio**2)
            )

            # NOTE: video_metadata is unpacked here but not used in the loop body.
            for pixel_values, video_metadata, frames_indices, frame_duration_ms in zip(
                pixel_values_lst_video,
                video_metadata_lst,
                frames_indices_lst,
                frame_duration_ms_lst,
            ):
                num_frames = pixel_values.shape[0]

                if (
                    self.video_pruning_rate is not None
                    and self.video_pruning_rate > 0.0
                ):
                    # Start of EVS-specific code
                    num_tokens = compute_retained_tokens_count(
                        tokens_per_frame=tokens_in_single_frame,
                        num_frames=num_frames,
                        q=self.video_pruning_rate,
                    )
                    # Here we just need placeholders that won't actually be replaced -
                    # we just need to make sure the total number of tokens is correct
                    # assign all tokens to the first frame
                    tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
                    # End of EVS-specific code
                else:
                    tokens_per_frame = [tokens_in_single_frame] * num_frames

                video_repl = self.get_video_repl(
                    tokens_per_frame=tokens_per_frame,
                    frames_indices=frames_indices,
                    frame_duration_ms=frame_duration_ms,
                    tokenizer=self.tokenizer,
                    img_start_token_ids=self._img_start_token_ids,
                    img_end_token_ids=self._img_end_token_ids,
                    img_context_token_ids=self._img_context_token_ids,
                )

                # video_repl.full is a list of token IDs
                # Convert token IDs back to text for the HF processor flow
                video_repl_text = self.tokenizer.decode(
                    video_repl.full, skip_special_tokens=False
                )
                # Only the first remaining "<video>" of each prompt is replaced
                # per video.
                text = [t.replace("<video>", video_repl_text, 1) for t in text]
        return text, video_inputs

    def _preprocess_audio(
        self,
        text: list[str],
        audios: list[npt.NDArray],
    ):
        """Expand audio placeholders and extract audio features.

        Only ``text[0]`` is inspected; when audios are present the returned
        text is a single-element list.

        Args:
            text: Prompt strings containing ``AUDIO_CONTEXT`` placeholders.
            audios: One audio array per placeholder.

        Returns:
            ``(text, audio_inputs)``; ``audio_inputs`` is empty when no
            audios are given.

        Raises:
            ValueError: If the number of ``AUDIO_CONTEXT`` placeholders in
                ``text[0]`` differs from ``len(audios)``.
        """
        if len(audios) == 0:
            return text, {}
        assert self.audio_extractor is not None
        extractor = self.audio_extractor

        # Split on the placeholder while keeping it as its own part
        # (capturing group); empty parts are dropped.
        parts = [x for x in re.split(f"({re.escape(AUDIO_CONTEXT)})", text[0]) if x]
        token_count = parts.count(AUDIO_CONTEXT)
        if token_count != len(audios):
            raise ValueError(
                "Number of audio tokens in text does not match the number "
                f"of audios (tokens={token_count}, audios={len(audios)})."
            )
        audio_index = 0
        for idx, part in enumerate(parts):
            if part == AUDIO_CONTEXT:
                audio_repl = self.get_audio_repl(audios[audio_index])
                parts[idx] = audio_repl.full
                audio_index += 1
        text = ["".join(parts)]

        audio_inputs = extractor(
            audios,
            sampling_rate=extractor.sampling_rate,
            return_tensors="pt",
        )
        input_audio_features = audio_inputs.input_features
        feature_attention_mask = audio_inputs.attention_mask
        # Per-audio feature length, taken as the sum of the attention mask.
        audio_feature_lengths = feature_attention_mask.sum(dim=1)
        audio_inputs = {
            "input_audio_features": input_audio_features,
            "feature_attention_mask": feature_attention_mask,
            "audio_feature_lengths": audio_feature_lengths,
        }
        return text, audio_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
        audios: AudioItem | list[AudioItem] | None = None,
        return_tensors: str | TensorType | None = None,
        max_num_tiles: int | None = None,
    ) -> BatchFeature:
        """Process text together with image, video and audio inputs.

        Images, videos and audios are processed in that order, each step
        rewriting the prompt text, before the final text is tokenized.

        Args:
            text: Prompt string(s).
            images: Image(s) to process.
            videos: ``(frames, metadata)`` pairs to process.
            audios: Audio item(s) to process.
            return_tensors: Tensor type passed to ``BatchFeature``.
            max_num_tiles: Tile limit for images; defaults to
                ``self.max_num_tiles``. Videos always use a limit of 1.

        Returns:
            A ``BatchFeature`` with the tokenized text and all modality inputs.
        """
        # Use default if not provided
        if max_num_tiles is None:
            max_num_tiles = self.max_num_tiles

        text, images, videos, audios = [
            self._make_batch_input(x) for x in (text, images, videos, audios)
        ]

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            max_num_tiles=max_num_tiles,
        )

        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            max_num_tiles=1,
        )

        text, audio_inputs = self._preprocess_audio(
            text=text,
            audios=audios,
        )

        text_inputs = self.tokenizer(text, add_special_tokens=False)

        combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}

        if self.dynamic_tiler is None:
            batch = BatchFeature(
                {**combined_inputs, **image_inputs},
                tensor_type=return_tensors,
            )
        else:
            batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
            # allow images to be exempt from the BatchFeature validation:
            # We will .stack() them in _parse_and_validate_image_input
            batch.update(image_inputs)
        return batch

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the image replacement text.

        The text is ``IMG_START``, then ``feature_size`` copies of
        ``IMG_CONTEXT``, then ``IMG_END``. ``num_patches`` is not used here.
        """
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_audio_repl(
        self,
        audio: npt.NDArray,
    ) -> PromptUpdateDetails[str]:
        """Build the audio replacement text for one audio array.

        The number of ``AUDIO_CONTEXT`` tokens comes from the extractor's
        ``audio_token_count`` for ``len(audio)``.
        """
        assert self.audio_extractor is not None
        num_tokens = self.audio_extractor.audio_token_count(len(audio))
        repl_full = f"{AUDIO_START}{AUDIO_CONTEXT * num_tokens}{AUDIO_END}"

        return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)

    @classmethod
    def get_video_repl(
        cls,
        *,
        tokens_per_frame: list[int],
        frames_indices: list[int],
        frame_duration_ms: int,
        tokenizer: TokenizerLike,
        img_start_token_ids: list[int],
        img_end_token_ids: list[int],
        img_context_token_ids: list[int],
    ) -> PromptUpdateDetails[list[int]]:
        """
        Build prompt replacement for a video.
        The replacement returned is not actually used to replace the placeholder
        tokens - it's just used to make sure we allocate the correct number
        of tokens.
        Actual replacement is done in embed_multimodal of
        NemotronH_Nano_VL_V2
        (specifically in _process_video_input -> _create_final_video_embeddings).
        There, we create the final embeddings with text embeddings for indicator tokens
        and video embeddings for video tokens.
        This is a single function that handles all cases - non EVS, EVS dummy, EVS real.
        The differentiation is done via tokens_per_frame parameter.
        - non EVS case - the same constant value across all frames
        - EVS dummy - Doesn't matter how tokens are distributed between frames - just
          make sure the total number of tokens is correct.
        - EVS real (called from get_real_video_repl_for_evs) - different value per frame
        Args:
            tokens_per_frame (list[int]): number of tokens per frame
            frames_indices (list[int]): frame indices
            frame_duration_ms (int): duration of each frame in milliseconds
            tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
        """
        # TODO: Add support of frame_duration_ms to be None
        # At preprocessing step we should allow absent / metadata without
        # frames_indices field.
        timestamps_enabled = frame_duration_ms is not None

        if timestamps_enabled:
            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)

            assert len(timestamps) == len(tokens_per_frame), (
                "timestamps and tokens_per_frame must have the same length"
            )
            frame_separators = [
                f"Frame {i + 1} sampled at {timestamp:.2f} seconds: "
                for i, timestamp in enumerate(timestamps)
            ]
        else:
            frame_separators = [
                f"Frame {i + 1}: " for i, _ in enumerate(tokens_per_frame)
            ]

        # Tokenize frame separator independently
        frame_separators_tokenized = [
            _seq2tokens(tokenizer, sep) for sep in frame_separators
        ]

        # Tokenize each component independently to avoid tokenizer merging tokens
        # across boundaries. This ensures consistent tokenization regardless of
        # num_tokens_per_frame values.
        all_token_ids = []
        for i, num_tokens in enumerate(tokens_per_frame):
            frame_sep_token_ids = frame_separators_tokenized[i]
            all_token_ids.extend(frame_sep_token_ids)

            # Add pre-tokenized special tokens
            all_token_ids.extend(img_start_token_ids)
            all_token_ids.extend(img_context_token_ids * num_tokens)
            all_token_ids.extend(img_end_token_ids)

        return PromptUpdateDetails.from_seq(all_token_ids)
class
BaseNanoNemotronVLProcessingInfo
(
BaseProcessingInfo
):
class
BaseNanoNemotronVLProcessingInfo
(
BaseProcessingInfo
):
"""Basic image-only ProcessingInfo for InternVL-style models."""
"""Basic image-only ProcessingInfo for InternVL-style models."""
...
...
vllm/model_executor/models/nemotron_parse.py
View file @
f3403243
...
@@ -11,18 +11,13 @@ import math
...
@@ -11,18 +11,13 @@ import math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Literal
from
typing
import
Annotated
,
Literal
import
numpy
as
np
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
einops
import
rearrange
from
einops
import
rearrange
from
PIL
import
Image
from
timm.data.constants
import
OPENAI_CLIP_MEAN
,
OPENAI_CLIP_STD
from
torchvision
import
transforms
as
T
from
transformers
import
(
from
transformers
import
(
BartConfig
,
BartConfig
,
BatchFeature
,
BatchFeature
,
PretrainedConfig
,
PretrainedConfig
,
TensorType
,
)
)
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
...
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
...
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
PromptUpdate
,
PromptUpdate
,
)
)
from
vllm.renderers
import
TokenizeParams
from
vllm.renderers
import
TokenizeParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.processors.nemotron_parse
import
NemotronParseProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backend
import
AttentionType
from
vllm.v1.attention.backend
import
AttentionType
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
DEFAULT_FINAL_IMAGE_SIZE
=
(
2048
,
1648
)
class
BartScaledWordEmbedding
(
VocabParallelEmbedding
):
class
BartScaledWordEmbedding
(
VocabParallelEmbedding
):
...
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
...
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
,
3
,
"h"
,
"w"
)]
data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
,
3
,
"h"
,
"w"
)]
class NemotronParseImageProcessor:
    """Image processor for NemotronParse.

    Each image is resized to fit the target size while keeping its aspect
    ratio, padded with white up to the target size, converted to a tensor
    and normalized with the OpenAI CLIP mean / std.
    """

    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        """Store the target size and build the transforms.

        Args:
            final_size: ``(height, width)`` sequence, or a single number used
                for both dimensions. Anything else falls back to
                ``DEFAULT_FINAL_IMAGE_SIZE``.
            **kwargs: Accepted and ignored.
        """
        size_is_pair = isinstance(final_size, (list, tuple)) and len(final_size) >= 2
        if size_is_pair:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            side = int(final_size)
            self.final_size = (side, int(final_size))
        else:
            # Unsupported value: use the default size.
            self.final_size = DEFAULT_FINAL_IMAGE_SIZE

        # Broadcastable (1, 3, 1, 1) normalization constants.
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        self._create_transforms()

    def _create_transforms(self):
        """Create transform objects.

        Raises:
            ImportError: If ``albumentations`` is not installed.
        """
        try:
            import albumentations as A
        except ImportError as err:
            raise ImportError(
                "The package `albumentations` is required to use "
                "NemotronParse model. Please install it with `pip install "
                "albumentations`."
            ) from err

        # Derive integer target dimensions from final_size.
        size = self.final_size
        if isinstance(size, (list, tuple)):
            self.target_height, self.target_width = int(size[0]), int(size[1])
        else:
            self.target_height = self.target_width = int(size)

        import cv2

        white_pad = A.PadIfNeeded(
            min_height=self.target_height,
            min_width=self.target_width,
            border_mode=cv2.BORDER_CONSTANT,
            fill=[255, 255, 255],
            p=1.0,
        )
        self.transform = A.Compose([white_pad])

        self.torch_transform = T.Compose([T.ToTensor()])

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Resize image maintaining aspect ratio (exact replica of original
        LongestMaxSizeHW)."""
        src_h, src_w = image.shape[:2]
        limit_h = self.target_height
        limit_w = self.target_width

        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
        ratio = src_w / src_h
        out_h, out_w = src_h, src_w

        # Too tall: clamp the height first.
        if src_h > limit_h:
            out_h = limit_h
            out_w = int(out_h * ratio)

        # Still too wide: clamp the width and shrink the height further.
        if out_w > limit_w:
            out_w = limit_w
            out_h = int(out_w / ratio)

        # Use cv2.INTER_LINEAR like the original
        import cv2

        return cv2.resize(image, (out_w, out_h), interpolation=cv2.INTER_LINEAR)

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad image to target size with white padding (matches A.PadIfNeeded
        behavior)."""
        cur_h, cur_w = image.shape[:2]

        # Padding is only added where the image is smaller than the target.
        extra_h = max(0, self.target_height - cur_h)
        extra_w = max(0, self.target_width - cur_w)
        if extra_h == 0 and extra_w == 0:
            return image

        # Pad bottom and right; a 3-D (color) array gets an untouched
        # channel axis, anything else is treated as grayscale.
        if len(image.shape) == 3:
            widths = ((0, extra_h), (0, extra_w), (0, 0))
        else:
            widths = ((0, extra_h), (0, extra_w))
        return np.pad(image, widths, mode="constant", constant_values=255)

    def preprocess(
        self,
        images: Image.Image | list[Image.Image],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s)

        Returns:
            A dict with the normalized batch under ``"pixel_values"``.
        """
        batch = images if isinstance(images, list) else [images]

        # PIL images become numpy arrays; other inputs pass through as-is.
        arrays = [
            np.asarray(item) if isinstance(item, Image.Image) else item
            for item in batch
        ]

        tensors = []
        for array in arrays:
            # Manual resize with aspect ratio preservation
            # (replaces LongestMaxSizeHW)
            fitted = self._resize_with_aspect_ratio(array)

            if self.transform is None:
                # Fallback: just pad to target size
                fitted = self._pad_to_size(fitted)
            else:
                fitted = self.transform(image=fitted)["image"]

            frame = self.torch_transform(fitted)

            # A single-channel result is expanded to three channels.
            if frame.shape[0] == 1:
                frame = frame.expand(3, -1, -1)

            tensors.append(frame)

        stacked = torch.stack(tensors)
        normalized_values = (stacked - self.norm_mean) / self.norm_std
        return {"pixel_values": normalized_values}

    def __call__(
        self, images: Image.Image | list[Image.Image], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """Combined text + image processor for NemotronParse.

    Text is tokenized with the given tokenizer and images are handled by a
    ``NemotronParseImageProcessor`` sized from ``config.image_size``.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        """Store config and tokenizer and create the image processor."""
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = NemotronParseImageProcessor(
            final_size=config.image_size
        )

    def _make_batch_input(self, input_item=None):
        """Return ``[]`` for ``None``, lists unchanged, else ``[input_item]``."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize text and preprocess images into one ``BatchFeature``.

        Args:
            text: Prompt text, or ``None``.
            images: Image(s) to preprocess, or ``None``.
            return_tensors: Tensor type passed to ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer call.
        """
        text, images = (self._make_batch_input(item) for item in (text, images))

        # Images are processed before the text is tokenized.
        image_inputs = self.image_processor(images) if len(images) != 0 else {}
        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)

        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
class
NemotronParseProcessingInfo
(
BaseProcessingInfo
):
class
NemotronParseProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_config
(
self
):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
()
return
self
.
ctx
.
get_hf_config
()
...
...
vllm/model_executor/models/nemotron_vl.py
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
math
import
math
from
abc
import
ABC
from
collections.abc
import
Iterable
from
collections.abc
import
Iterable
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
AutoModel
,
PretrainedConfig
from
transformers
import
AutoModel
,
PretrainedConfig
from
transformers.image_processing_utils_fast
import
BaseImageProcessorFast
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
...
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
...
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
InternVLImageEmbeddingInputs
,
InternVLImageEmbeddingInputs
,
InternVLImageInputs
,
InternVLImageInputs
,
InternVLImagePixelInputs
,
InternVLImagePixelInputs
,
InternVLProcessor
,
)
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processor
import
cached_image_processor_from_config
from
vllm.transformers_utils.processor
import
cached_image_processor_from_config
from
vllm.transformers_utils.processors.nemotron_vl
import
(
LlamaNemotronVLEmbedProcessor
,
NemotronVLProcessor
,
)
from
vllm.transformers_utils.repo_utils
import
get_hf_file_to_dict
from
vllm.transformers_utils.repo_utils
import
get_hf_file_to_dict
from
.interfaces
import
(
from
.interfaces
import
(
...
@@ -58,310 +47,6 @@ from .utils import (
...
@@ -58,310 +47,6 @@ from .utils import (
)
)
def build_transform(input_size: int):
    """Build the base image transform.

    The pipeline converts the image to RGB, resizes it to
    ``input_size x input_size`` with bicubic interpolation and converts it
    to a tensor.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    resize = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    steps = [to_rgb, resize, T.ToTensor()]
    return T.Compose(steps)
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tiling grid that best matches an image.

    Each candidate is scored as the product of a size term (tiled area
    divided by image area, capped at ``0.6``) and an aspect-ratio closeness
    term (the smaller of the two ratio quotients). The first candidate with
    the strictly highest score wins.

    Args:
        aspect_ratio: Width / height of the image.
        target_ratios: Candidate grids; the quotient of the two entries is
            compared against ``aspect_ratio``.
        width: Image width.
        height: Image height.
        image_size: Side length of one tile.

    Returns:
        The best candidate, or ``(1, 1)`` when there are no candidates.
    """
    winner: tuple[int, int] = (1, 1)
    top_score = float("-inf")
    image_area = width * height

    for first, second in target_ratios:
        candidate_ratio = first / second
        coverage = min((first * second * image_size * image_size) / image_area, 0.6)
        closeness = min(
            candidate_ratio / aspect_ratio, aspect_ratio / candidate_ratio
        )
        score = coverage * closeness

        # Strict comparison keeps the earliest candidate on ties.
        if score > top_score:
            top_score, winner = score, (first, second)

    return winner
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resize target for an image.

    Args:
        orig_width: Original image width.
        orig_height: Original image height.
        target_ratios: Candidate tiling grids.
        image_size: Side length of one tile.
        use_thumbnail: Whether to count one extra tile for a thumbnail when
            the grid has more than one tile.

    Returns:
        ``(blocks, target_width, target_height)``.
    """
    # Choose the grid closest to the image's own aspect ratio.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_first, grid_second = grid[0], grid[1]

    target_width = image_size * grid_first
    target_height = image_size * grid_second
    blocks = grid_first * grid_second

    # The thumbnail only counts when the image is actually split.
    if use_thumbnail and blocks != 1:
        blocks = blocks + 1

    return blocks, target_width, target_height
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into square tiles, optionally adding a thumbnail.

    The image is resized to the chosen grid size and cropped row by row
    into ``image_size x image_size`` tiles. When ``use_thumbnail`` is set
    and more than one tile was produced, a resized copy of the whole image
    is appended.
    """
    src_width, src_height = image.size

    # The thumbnail is added below, so it is not counted here.
    blocks, target_width, target_height = calculate_nemotron_vl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every grid whose tile count lies in ``[min_num, max_num]``.

    Returns:
        The distinct ``(i, j)`` pairs with ``min_num <= i * j <= max_num``,
        sorted by ascending ``i * j``.
    """
    grids = set()
    for side_limit in range(min_num, max_num + 1):
        for first in range(1, side_limit + 1):
            for second in range(1, side_limit + 1):
                if min_num <= first * second <= max_num:
                    grids.add((first, second))

    ordered = list(grids)
    ordered.sort(key=lambda grid: grid[0] * grid[1])
    return ordered
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor.

    Args:
        image: Image to process.
        input_size: Side length of one tile.
        min_num: Minimum number of tiles in a grid.
        max_num: Maximum number of tiles in a grid.
        use_thumbnail: Whether a thumbnail tile may be appended.
        transform: Per-tile transform; defaults to
            ``build_transform(input_size=input_size)``.

    Returns:
        The transformed tiles stacked along a new first dimension.
    """
    candidate_grids = get_nemotron_vl_target_ratios(min_num, max_num)

    tile_transform = transform
    if tile_transform is None:
        tile_transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([tile_transform(tile) for tile in tiles])
class NemotronVLProcessor(InternVLProcessor):
    """InternVL-style processor using the Nemotron VL tiling scheme.

    Prompts use ``<image>`` both as the placeholder and as the image
    context token; each placeholder is expanded to
    ``<img>`` + N * ``<image>`` + ``</img>``.
    """

    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Initialize the processor.

        Args:
            config: Model config providing ``force_image_size``,
                ``patch_size`` and ``downsample_ratio``.
            tokenizer: Tokenizer used for the prompts.
            image_processor: Optional HF image processor supplying
                ``max_num_tiles`` and ``use_thumbnail``.
            min_dynamic_patch: Minimum number of tiles; defaults to 1.
            max_dynamic_patch: Maximum number of tiles; defaults to
                ``image_processor.max_num_tiles``, or to
                ``config.max_dynamic_patch`` (1 if absent) when no image
                processor is given.
            dynamic_image_size: Whether dynamic tiling is enabled; defaults
                to ``True``.
        """
        # Deliberately bypasses InternVLProcessor.__init__.
        ABC.__init__(self)

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        image_size: int = config.force_image_size
        patch_size: int = config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            # image_processor is optional (see the use_thumbnail handling
            # below), so it must not be dereferenced unconditionally.
            if image_processor is not None:
                max_dynamic_patch = image_processor.max_num_tiles
            else:
                max_dynamic_patch = getattr(config, "max_dynamic_patch", 1)
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)

        # Number of context tokens produced for a single tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the image context token."""
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        """Return the per-tile transform for this processor."""
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into a stacked tensor of transformed tiles."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        # The transform does not depend on the image, so build it once.
        transform = self._get_transform()

        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=transform,
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""

        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just inserted
            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        # Swap the temporary placeholder back to the real context token.
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand their placeholders in the text.

        Returns:
            ``(text, image_inputs)``; with no images the text is returned
            unchanged and ``image_inputs`` is empty.
        """
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the image replacement text.

        The text is ``IMG_START``, then ``feature_size`` copies of
        ``IMG_CONTEXT``, then ``IMG_END``. ``num_patches`` is not used here.
        """
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END

        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
class
NemotronVLProcessingInfo
(
BaseInternVLProcessingInfo
):
class
NemotronVLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Nemotron VL models."""
"""Processing info for Nemotron VL models."""
...
@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
...
@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# - Pooler output instead of generative logits
# - Pooler output instead of generative logits
# --------------------------------------------------------
# --------------------------------------------------------
# SigLIP normalization constants: three-element mean / std tuples that
# build_siglip_transform passes to T.Normalize.
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Build the image transform for the SigLIP vision encoder.

    Takes the base transform produced by ``build_transform`` and appends a
    ``T.Normalize`` step using ``SIGLIP_MEAN`` / ``SIGLIP_STD``.
    """
    pipeline = [build_transform(input_size=input_size)]
    # SigLIP-specific normalization runs after the shared base transform.
    pipeline.append(T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD))
    return T.Compose(pipeline)
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for the LlamaNemotronVL embedding model.

    Specializes NemotronVLProcessor for embedding tasks:
    - the image transform adds SigLIP normalization on top of the base
      transform
    - the image context token is <IMG_CONTEXT> rather than <image>
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Resolve tiling settings and initialise the parent processor.

        Each setting left as ``None`` is looked up in ``processor_config``
        first, then on ``config``, then falls back to a built-in default.
        """
        if min_dynamic_patch is None:
            config_min = getattr(config, "min_dynamic_patch", 1)
            min_dynamic_patch = processor_config.get("min_input_tiles", config_min)

        if max_dynamic_patch is None:
            config_max = getattr(config, "max_dynamic_patch", 1)
            max_dynamic_patch = processor_config.get("max_input_tiles", config_max)

        if dynamic_image_size is None:
            config_dynamic = getattr(config, "dynamic_image_size", True)
            dynamic_image_size = processor_config.get(
                "dynamic_image_size", config_dynamic
            )

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Return the SigLIP-normalized transform for ``self.image_size``."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand one ``<image>`` placeholder per image in every prompt.

        No temporary placeholder is needed here because IMG_CONTEXT is
        <IMG_CONTEXT>, not <image>, so inserted text cannot be mistaken
        for a placeholder that is still pending.
        """
        prompts = text
        for image_tiles in pixel_values_lst:
            tile_count = image_tiles.shape[0]
            replacement = self.get_image_repl(
                tile_count * self.num_image_token, tile_count
            )
            # Only the first remaining placeholder in each prompt is consumed.
            prompts = [
                prompt.replace("<image>", replacement.full, 1) for prompt in prompts
            ]
        return prompts
class
LlamaNemotronVLEmbedProcessingInfo
(
NemotronVLProcessingInfo
):
class
LlamaNemotronVLEmbedProcessingInfo
(
NemotronVLProcessingInfo
):
"""Processing info for LlamaNemotronVL embedding model."""
"""Processing info for LlamaNemotronVL embedding model."""
...
...
vllm/model_executor/models/nvlm_d.py
View file @
f3403243
...
@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
...
@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate
,
PromptUpdate
,
PromptUpdateDetails
,
PromptUpdateDetails
,
)
)
from
vllm.transformers_utils.processors.nvlm_d
import
IMG_PAD
,
NVLMProcessor
from
.intern_vit
import
InternVisionModel
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
from
.internvl
import
(
BaseInternVLDummyInputsBuilder
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
InternVLChatModel
,
InternVLChatModel
,
)
)
# Placeholder token text: NVLMProcessor looks its id up in the tokenizer
# vocabulary and repeats it after every tile tag in the image replacement.
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style processor that emits NVLM tile-tagged image text."""

    @property
    def image_token_id(self) -> int:
        """Id of ``IMG_PAD`` in the tokenizer vocabulary."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        Each tile tag is followed by ``feature_size // num_patches`` copies
        of ``IMG_PAD``; the whole run is wrapped in ``<Image>...</Image>``.

        Raises:
            NotImplementedError: if ``num_patches`` is ``None``.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tile_tags = [f"<tile_{i}>" for i in range(1, num_patches)]
        if self.use_thumbnail:
            tile_tags.append("<tile_global_thumbnail>")

        pads_per_tile = IMG_PAD * (feature_size // num_patches)
        tagged_features = "".join(tag + pads_per_tile for tag in tile_tags)

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        replacement = "".join(("<Image>", tagged_features, "</Image>"))
        return PromptUpdateDetails.select_text(replacement, IMG_PAD)
class
NVLMProcessingInfo
(
BaseInternVLProcessingInfo
):
class
NVLMProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
NVLMProcessor
:
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
NVLMProcessor
:
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
f3403243
...
@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
...
@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
transformers
import
BatchFeature
,
PretrainedConfig
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
...
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
...
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel
,
InternVisionPatchModel
,
)
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalFieldConfig
,
...
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
...
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
BaseProcessingInfo
,
BaseProcessingInfo
,
PromptReplacement
,
PromptReplacement
,
PromptUpdate
,
PromptUpdate
,
PromptUpdateDetails
,
)
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.skyworkr1v
import
SkyworkR1VProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
# Text pieces of the image replacement built by the processor:
# IMG_START + IMG_CONTEXT * feature_size + IMG_END.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Mean / std tuples handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class
SkyworkR1VImagePixelInputs
(
TensorSchema
):
class
SkyworkR1VImagePixelInputs
(
TensorSchema
):
"""
"""
...
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
...
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
)
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Build the per-tile image transform.

    Steps, in order: convert to RGB, resize to ``input_size`` x
    ``input_size`` with bicubic interpolation, convert to a tensor, then
    normalize with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    square = (input_size, input_size)
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(square, interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the grid from ``target_ratios`` closest to ``aspect_ratio``.

    Each candidate ``(a, b)`` is compared through ``a / b``. When a
    candidate exactly ties the current best difference, it replaces the
    current pick only if ``width * height`` exceeds half the pixel area of
    that candidate's grid. Returns ``(1, 1)`` when no candidate is given.
    """
    winner = (1, 1)
    smallest_gap = float("inf")
    image_area = width * height

    for candidate in target_ratios:
        first, second = candidate[0], candidate[1]
        gap = abs(aspect_ratio - first / second)

        if gap < smallest_gap:
            smallest_gap, winner = gap, candidate
            continue

        # Exact tie: only switch when the image is large enough for the grid.
        if (
            gap == smallest_gap
            and image_area > 0.5 * image_size * image_size * first * second
        ):
            winner = candidate

    return winner
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective ``(min, max)`` number of patches.

    Both bounds collapse to 1 when ``dynamic_image_size`` is false. When
    ``use_thumbnail`` is set and the maximum is not 1, the maximum is
    raised by one.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower, upper = 1, 1

    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every grid ``(i, j)`` whose tile count lies in ``[min_num, max_num]``.

    The result holds each grid once and is sorted by tile count ``i * j``.
    """
    unique_grids = set()
    # Same enumeration order as a nested comprehension, so the set is filled
    # in the same sequence.
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    unique_grids.add((i, j))

    def tile_count(grid):
        return grid[0] * grid[1]

    return sorted(unique_grids, key=tile_count)
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the block count and resize target for an image.

    Returns ``(blocks, target_width, target_height)``. The grid comes from
    ``find_closest_aspect_ratio``; width and height are ``image_size``
    times the grid's two entries, and ``blocks`` is their product, plus
    one when ``use_thumbnail`` is set and the product is not 1.
    """
    # find the closest aspect ratio to the target
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_w, grid_h = grid[0], grid[1]

    # add thumbnail image if num_blocks != 1
    blocks = grid_w * grid_h
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize ``image`` to the chosen grid and cut it into square tiles.

    Tiles are ``image_size`` x ``image_size`` crops taken row by row from
    the resized image. When ``use_thumbnail`` is set and more than one tile
    was produced, the original image resized to a single tile is appended.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    blocks, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    # resize the image
    resized_img = image.resize((target_width, target_height))

    processed_images = []
    for index in range(blocks):
        # Row-major position of this tile inside the resized image.
        row, col = divmod(index, target_width // image_size)
        left, top = col * image_size, row * image_size
        # split the image
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size))
        )

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile ``image`` and stack the transformed tiles into one tensor.

    The candidate grids come from ``get_skyworkr1v_target_ratios``; the
    tiles come from ``dynamic_preprocess_skyworkr1v`` and are each passed
    through ``build_transform(input_size=input_size)`` before stacking.
    """
    candidate_grids = get_skyworkr1v_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config / tokenizer and resolve the tiling settings.

        Settings left as ``None`` are read from ``config``; each resolved
        value is then type-checked with an assertion.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config
        image_size: int = vision_config.image_size
        patch_size: int = vision_config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Context tokens per tile: squared patches-per-side scaled by the
        # squared downsample ratio.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Id of ``IMG_CONTEXT`` in the tokenizer vocabulary."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        The text is ``IMG_START``, ``IMG_CONTEXT`` repeated ``feature_size``
        times, then ``IMG_END``. ``num_patches`` is not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = "".join((IMG_START, context_run, IMG_END))
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve ``(min, max)`` patch counts, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate grids for the resolved patch bounds."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower, upper = bounds
        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of image context tokens for an image of the given size."""
        candidate_grids = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        patch_count, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=candidate_grids,
            use_thumbnail=self.use_thumbnail,
        )

        return patch_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its stacked tile tensor."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        tensors = []
        for image in images:
            tensors.append(
                image_to_pixel_values_skyworkr1v(
                    image,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return tensors

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize prompts and attach image tensors.

        ``text`` and ``images`` may each be a single item, a list, or
        ``None``. For every image, the first remaining ``<image>``
        placeholder in each prompt is replaced by that image's replacement
        text before tokenization.
        """
        # Normalize both inputs to lists.
        if text is None:
            text = []
        elif not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        elif not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if len(images) != 0:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            for image_tiles in pixel_values_lst:
                tile_count = image_tiles.shape[0]
                replacement = self.get_image_repl(
                    tile_count * self.num_image_token, tile_count
                )
                text = [
                    prompt.replace("<image>", replacement.full, 1)
                    for prompt in text
                ]

        text_inputs = self.tokenizer(text)

        # Image entries override text entries on key collision.
        merged = {**text_inputs, **image_inputs}
        return BatchFeature(merged, tensor_type=return_tensors)
class
SkyworkR1VProcessingInfo
(
BaseProcessingInfo
):
class
SkyworkR1VProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
SkyworkR1VProcessor
:
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
SkyworkR1VProcessor
:
return
self
.
ctx
.
init_processor
(
return
self
.
ctx
.
init_processor
(
...
...
vllm/transformers_utils/processors/__init__.py
View file @
f3403243
...
@@ -13,35 +13,53 @@ import importlib
...
@@ -13,35 +13,53 @@ import importlib
__all__
=
[
__all__
=
[
"BagelProcessor"
,
"BagelProcessor"
,
"DeepseekVLV2Processor"
,
"DeepseekVLV2Processor"
,
"Eagle2_5_VLProcessor"
,
"FireRedASR2Processor"
,
"FireRedASR2Processor"
,
"FunASRProcessor"
,
"FunASRProcessor"
,
"GLM4VProcessor"
,
"GLM4VProcessor"
,
"H2OVLProcessor"
,
"HunYuanVLProcessor"
,
"HunYuanVLProcessor"
,
"HunYuanVLImageProcessor"
,
"HunYuanVLImageProcessor"
,
"InternVLProcessor"
,
"KimiAudioProcessor"
,
"KimiAudioProcessor"
,
"MistralCommonPixtralProcessor"
,
"MistralCommonPixtralProcessor"
,
"MistralCommonVoxtralProcessor"
,
"MistralCommonVoxtralProcessor"
,
"NanoNemotronVLProcessor"
,
"NemotronParseProcessor"
,
"NemotronVLProcessor"
,
"LlamaNemotronVLEmbedProcessor"
,
"NVLMProcessor"
,
"OvisProcessor"
,
"OvisProcessor"
,
"Ovis2_5Processor"
,
"Ovis2_5Processor"
,
"QwenVLProcessor"
,
"QwenVLProcessor"
,
"Qwen3ASRProcessor"
,
"Qwen3ASRProcessor"
,
"SkyworkR1VProcessor"
,
]
]
_CLASS_TO_MODULE
:
dict
[
str
,
str
]
=
{
_CLASS_TO_MODULE
:
dict
[
str
,
str
]
=
{
"BagelProcessor"
:
"vllm.transformers_utils.processors.bagel"
,
"BagelProcessor"
:
"vllm.transformers_utils.processors.bagel"
,
"DeepseekVLV2Processor"
:
"vllm.transformers_utils.processors.deepseek_vl2"
,
"DeepseekVLV2Processor"
:
"vllm.transformers_utils.processors.deepseek_vl2"
,
"Eagle2_5_VLProcessor"
:
"vllm.transformers_utils.processors.eagle2_5_vl"
,
"FireRedASR2Processor"
:
"vllm.transformers_utils.processors.fireredasr2"
,
"FireRedASR2Processor"
:
"vllm.transformers_utils.processors.fireredasr2"
,
"FunASRProcessor"
:
"vllm.transformers_utils.processors.funasr"
,
"FunASRProcessor"
:
"vllm.transformers_utils.processors.funasr"
,
"GLM4VProcessor"
:
"vllm.transformers_utils.processors.glm4v"
,
"GLM4VProcessor"
:
"vllm.transformers_utils.processors.glm4v"
,
"H2OVLProcessor"
:
"vllm.transformers_utils.processors.h2ovl"
,
"HunYuanVLProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl"
,
"HunYuanVLProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl"
,
"HunYuanVLImageProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl_image"
,
"HunYuanVLImageProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl_image"
,
"InternVLProcessor"
:
"vllm.transformers_utils.processors.internvl"
,
"KimiAudioProcessor"
:
"vllm.transformers_utils.processors.kimi_audio"
,
"KimiAudioProcessor"
:
"vllm.transformers_utils.processors.kimi_audio"
,
"MistralCommonPixtralProcessor"
:
"vllm.transformers_utils.processors.pixtral"
,
"MistralCommonPixtralProcessor"
:
"vllm.transformers_utils.processors.pixtral"
,
"MistralCommonVoxtralProcessor"
:
"vllm.transformers_utils.processors.voxtral"
,
"MistralCommonVoxtralProcessor"
:
"vllm.transformers_utils.processors.voxtral"
,
"NanoNemotronVLProcessor"
:
"vllm.transformers_utils.processors.nano_nemotron_vl"
,
"NemotronParseProcessor"
:
"vllm.transformers_utils.processors.nemotron_parse"
,
"NemotronVLProcessor"
:
"vllm.transformers_utils.processors.nemotron_vl"
,
"LlamaNemotronVLEmbedProcessor"
:
"vllm.transformers_utils.processors.nemotron_vl"
,
"NVLMProcessor"
:
"vllm.transformers_utils.processors.nvlm_d"
,
"OvisProcessor"
:
"vllm.transformers_utils.processors.ovis"
,
"OvisProcessor"
:
"vllm.transformers_utils.processors.ovis"
,
"Ovis2_5Processor"
:
"vllm.transformers_utils.processors.ovis2_5"
,
"Ovis2_5Processor"
:
"vllm.transformers_utils.processors.ovis2_5"
,
"QwenVLProcessor"
:
"vllm.transformers_utils.processors.qwen_vl"
,
"QwenVLProcessor"
:
"vllm.transformers_utils.processors.qwen_vl"
,
"Qwen3ASRProcessor"
:
"vllm.transformers_utils.processors.qwen3_asr"
,
"Qwen3ASRProcessor"
:
"vllm.transformers_utils.processors.qwen3_asr"
,
"SkyworkR1VProcessor"
:
"vllm.transformers_utils.processors.skyworkr1v"
,
}
}
...
...
vllm/transformers_utils/processors/eagle2_5_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Custom processor for Eagle2.5-VL model.
    Extends BaseInternVLProcessor with Eagle-specific token handling.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Initialise all processor attributes directly from ``config``.

        Settings passed as ``None`` fall back to the attribute of the same
        name on ``config`` and then to a built-in default.
        """
        # Skip super().__init__() to avoid config manipulation
        # Directly initialize all required attributes
        self.config = config
        self.tokenizer = tokenizer

        # Image size with force_image_size override
        image_size: int = config.vision_config.image_size
        if hasattr(config, "force_image_size") and config.force_image_size:
            image_size = config.force_image_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Compute num_image_token
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (downsample_ratio**2)
        )
        self.image_size = image_size

        # Dynamic patch settings with defaults
        self.min_dynamic_patch = (
            min_dynamic_patch
            if min_dynamic_patch is not None
            else getattr(config, "min_dynamic_patch", 1)
        )
        self.max_dynamic_patch = (
            max_dynamic_patch
            if max_dynamic_patch is not None
            else getattr(config, "max_dynamic_patch", 12)
        )
        self.dynamic_image_size = (
            dynamic_image_size
            if dynamic_image_size is not None
            else getattr(config, "dynamic_image_size", True)
        )
        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Get the image token ID from config or tokenizer.

        ``config.image_token_index`` is used when it is present and not
        ``None``; otherwise ``IMG_CONTEXT`` is looked up in the tokenizer
        vocabulary.

        Raises:
            ValueError: if neither source provides the token id.
        """
        # A config attribute that exists but is None must not be returned:
        # callers expect an int, so treat it like a missing attribute.
        token_id = getattr(self.config, "image_token_index", None)
        if token_id is not None:
            return token_id

        # Fallback to tokenizer vocab - use <IMG_CONTEXT>
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT in vocab:
            return vocab[IMG_CONTEXT]
        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Get image replacement string for prompt.

        The text is ``IMG_START``, ``IMG_CONTEXT`` repeated ``feature_size``
        times, then ``IMG_END``. ``num_patches`` is not used here.
        """
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
vllm/transformers_utils/processors/h2ovl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import
torch
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective ``(min, max)`` number of patches.

    Without ``dynamic_image_size`` both bounds are 1. With
    ``use_thumbnail`` the maximum grows by one unless it equals 1.
    """
    bounds = (min_dynamic_patch, max_dynamic_patch) if dynamic_image_size else (1, 1)
    lower, upper = bounds

    thumbnail_slot = 1 if (use_thumbnail and upper != 1) else 0
    return lower, upper + thumbnail_slot
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return candidate grids, optionally filtered against a prior grid.

    Starts from ``get_internvl_target_ratios(min_num, max_num)``. When
    ``prior_aspect_ratio`` is given, a candidate is kept only if neither
    of its entries evenly divides the corresponding entry of the prior.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)

    if prior_aspect_ratio is None:
        return candidates

    def _kept(ratio):
        # Drop the candidate as soon as one dimension divides the prior.
        return (
            prior_aspect_ratio[0] % ratio[0] != 0
            and prior_aspect_ratio[1] % ratio[1] != 0
        )

    # if prior_aspect_ratio is provided, filter the target ratios
    return [ratio for ratio in candidates if _kept(ratio)]
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Compute block count, resize target and chosen grid for an image.

    Returns ``(blocks, target_width, target_height, target_aspect_ratio)``.
    ``blocks`` is the product of the grid's entries, plus one when
    ``use_thumbnail`` is set and that product is not 1.
    """
    # find the closest aspect ratio to the target
    chosen_grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_w, grid_h = chosen_grid[0], chosen_grid[1]

    # add thumbnail image if num_blocks != 1
    blocks = grid_w * grid_h
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h, chosen_grid
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Tile ``image`` and report which grid was used.

    The image is resized to the chosen grid and cut row by row into
    ``image_size`` x ``image_size`` crops. When ``use_thumbnail`` is set
    and more than one tile exists, the original image resized to one tile
    is appended. Returns ``(tiles, target_aspect_ratio)``.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    targets = calculate_h2ovl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )
    blocks, target_width, target_height, target_aspect_ratio = targets

    # resize the image
    resized_img = image.resize((target_width, target_height))

    processed_images = []
    for index in range(blocks):
        # Row-major position of this tile inside the resized image.
        row, col = divmod(index, target_width // image_size)
        left, top = col * image_size, row * image_size
        # split the image
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size))
        )

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images, target_aspect_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Run one tiling pass over ``image``.

    Returns the stacked, transformed tiles together with the grid chosen
    by ``dynamic_preprocess_h2ovl``.
    """
    candidate_grids = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, chosen_grid = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=candidate_grids,
    )

    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, chosen_grid
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert ``image`` to a tile tensor, optionally using two MSAC passes.

    Without ``use_msac`` a single pass is run with the caller's settings.
    With ``use_msac`` the image is processed twice (both passes with the
    thumbnail enabled, ignoring ``min_num`` and ``use_thumbnail``) and the
    results are concatenated.
    """
    if not use_msac:
        single_pass, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return single_pass

    # when MSAC is turned on, we need to process the image twice
    # first pass
    first_tiles, first_grid = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )
    # second pass, filtered against the grid picked by the first pass
    second_tiles, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_grid,
    )

    # combine pixel values: second pass without its last entry, first pass
    # without its last entry, then the last entry of the second pass
    pieces = [second_tiles[:-1], first_tiles[:-1], second_tiles[-1:]]
    return torch.cat(pieces, 0)
class
H2OVLProcessor
(
BaseInternVLProcessor
):
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_msac: bool | None = None,
) -> None:
    """Initialise the base processor, then resolve the MSAC flag.

    ``use_msac`` falls back to ``config.use_msac`` when not given; the
    resolved value is type-checked with an assertion.
    """
    super().__init__(
        config,
        tokenizer,
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
    )

    msac_enabled = config.use_msac if use_msac is None else use_msac
    assert isinstance(msac_enabled, bool)

    self.use_msac = msac_enabled
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
IMG_CONTEXT
)
def
resolve_min_max_num
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
is
None
else
dynamic_image_size
)
use_thumbnail
=
self
.
use_thumbnail
if
use_thumbnail
is
None
else
use_thumbnail
return
resolve_h2ovl_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
prior_aspect_ratio
:
tuple
[
int
,
int
]
|
None
=
None
,
override_min_num
:
int
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
if
override_min_num
is
not
None
:
min_num
=
override_min_num
return
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
prior_aspect_ratio
,
)
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
use_msac
:
bool
|
None
=
None
,
)
->
int
:
use_msac
=
self
.
use_msac
if
use_msac
is
None
else
use_msac
use_thumbnail
=
self
.
use_thumbnail
if
use_msac
:
target_ratios_1
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
override_min_num
=
1
,
)
num_patches_1
,
_
,
_
,
aspect_ratio_1
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios_1
,
use_thumbnail
=
True
,
)
target_ratios_2
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
prior_aspect_ratio
=
aspect_ratio_1
,
override_min_num
=
3
,
)
num_patches_2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios_2
,
use_thumbnail
=
True
,
)
num_patches
=
num_patches_1
+
num_patches_2
-
1
else
:
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
use_msac
=
self
.
use_msac
if
len
(
images
)
==
1
else
False
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
return
[
image_to_pixel_values_h2ovl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
use_msac
=
use_msac
,
)
for
image
in
images
]
vllm/transformers_utils/processors/internvl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
TypeVar
import
numpy.typing
as
npt
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
_T
=
TypeVar
(
"_T"
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the torchvision pipeline that turns a PIL image into a tensor.

    Steps, in order: convert to RGB, bicubic-resize to a square of side
    `input_size`, convert to a tensor, normalise with the ImageNet mean/std.
    """
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tile grid whose width/height ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Candidate grids, scanned in order.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one tile in pixels.

    Returns:
        The best candidate, or `(1, 1)` when `target_ratios` is empty. On an
        exact tie the later candidate wins only if the image area exceeds
        half of the pixel area that candidate's grid would cover.
    """
    image_area = width * height
    smallest_gap = float("inf")
    winner = (1, 1)

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            smallest_gap, winner = gap, candidate
            continue

        if gap != smallest_gap:
            continue

        # Exact tie: switch to this grid only when the image is large enough.
        if image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            winner = candidate

    return winner
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) number of tiles.

    With `dynamic_image_size` off both bounds collapse to 1. When a thumbnail
    is used and the maximum is not 1, the maximum is raised by one.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower, upper = 1, 1

    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate every grid `(i, j)` with `min_num <= i * j <= max_num`.

    Returns:
        The unique grids sorted by tile count `i * j` (ascending). Grids with
        the same tile count keep the iteration order of the underlying set.
    """
    grids = set()
    # Insertion order is kept identical to a nested n/i/j scan so that the
    # set's iteration order (and therefore tie order after the stable sort)
    # does not change.
    for n in range(min_num, max_num + 1):
        sides = range(1, n + 1)
        for i in sides:
            for j in sides:
                if min_num <= i * j <= max_num:
                    grids.add((i, j))

    def tile_count(grid):
        return grid[0] * grid[1]

    return sorted(grids, key=tile_count)
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resize target for an image.

    Returns:
        `(blocks, target_width, target_height)` where the target size is the
        chosen grid scaled by `image_size`, and `blocks` is the number of
        grid cells plus one extra when `use_thumbnail` is set and the grid
        has more than one cell.
    """
    # Choose the candidate grid closest to the image's aspect ratio.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_cols, grid_rows = grid[0], grid[1]

    blocks = grid_cols * grid_rows
    if use_thumbnail and blocks != 1:
        # The thumbnail counts as one additional block.
        blocks += 1

    return blocks, image_size * grid_cols, image_size * grid_rows
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize an image to the best-fitting grid and cut it into square tiles.

    Returns:
        The tiles in row-major order; when `use_thumbnail` is set and there
        is more than one tile, a thumbnail of the whole image resized to
        `image_size` x `image_size` is appended.
    """
    orig_width, orig_height = image.size

    # Thumbnail is excluded here so that `blocks` is the pure grid cell count.
    blocks, target_width, target_height = calculate_internvl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    def tile_box(index: int):
        # (left, upper, right, lower) of the index-th tile, row-major.
        row, col = divmod(index, tiles_per_row)
        return (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )

    tiles = [resized.crop(tile_box(index)) for index in range(blocks)]
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and return the stacked, transformed tiles.

    The image is split by `dynamic_preprocess_internvl` using the grids from
    `get_internvl_target_ratios(min_num, max_num)`; every resulting tile is
    passed through `build_transform(input_size)` and stacked along dim 0.
    """
    candidate_grids = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a video (iterable of frames) to stacked pixel values.

    Each frame is turned into an RGB PIL image and preprocessed like a still
    image; the preprocessing must yield exactly one tile per frame (asserted).
    The tiles are then transformed and stacked along dim 0.
    """
    candidate_grids = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    frame_tiles: list[Image.Image] = []
    for raw_frame in video:
        tiles = dynamic_preprocess_internvl(
            Image.fromarray(raw_frame, mode="RGB"),
            target_ratios=candidate_grids,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        # One frame must map to exactly one tile.
        assert len(tiles) == 1
        frame_tiles.append(tiles[0])

    return torch.stack([to_tensor(tile) for tile in frame_tiles])
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252

    Subclasses must provide `image_token_id` and `get_image_repl`.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config/tokenizer and resolve tiling settings.

        Any of the keyword overrides left as `None` is read from `config`.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_image_size: int = config.vision_config.image_size
        vision_patch_size: int = config.vision_config.patch_size

        resolved_min = (
            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        assert isinstance(resolved_min, int)

        resolved_max = (
            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        assert isinstance(resolved_max, int)

        resolved_dynamic = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(resolved_dynamic, bool)

        # Tokens per tile: patch grid area scaled by the squared downsample ratio.
        patches_per_side = vision_image_size // vision_patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = vision_image_size
        self.min_dynamic_patch = resolved_min
        self.max_dynamic_patch = resolved_max
        self.dynamic_image_size = resolved_dynamic
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Vocabulary id of the image context token (subclass-defined)."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the text that replaces one `<image>` placeholder."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the (min, max) tile counts, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tile grids for the resolved (min, max) counts."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        tile_min, tile_max = bounds
        return get_internvl_target_ratios(tile_min, tile_max)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of image tokens produced for an image of the given size."""
        # The thumbnail is accounted for in calculate_internvl_targets, so it
        # is disabled while resolving the ratios.
        candidate_grids = self.resolve_target_ratios(use_thumbnail=False)

        patch_count, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=candidate_grids,
            use_thumbnail=self.use_thumbnail,
        )
        return patch_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image to its stacked tile tensor."""
        # The thumbnail is applied inside image_to_pixel_values_internvl.
        tile_min, tile_max = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        pixel_values_per_image = []
        for img in images:
            pixel_values_per_image.append(
                image_to_pixel_values_internvl(
                    img,
                    input_size=self.image_size,
                    min_num=tile_min,
                    max_num=tile_max,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return pixel_values_per_image

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand `<image>` placeholders in the prompts.

        Returns:
            The updated prompts and a dict with `pixel_values_flat` and
            `image_num_patches` (empty when there are no images).
        """
        if len(images) == 0:
            return text, {}

        per_image = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        patch_counts = [len(tiles) for tiles in per_image]
        image_inputs = {
            "pixel_values_flat": torch.cat(per_image),
            "image_num_patches": torch.tensor(patch_counts),
        }

        # Each image consumes the first remaining "<image>" in every prompt.
        for tiles in per_image:
            tile_count = tiles.shape[0]
            replacement = self.get_image_repl(
                tile_count * self.num_image_token,
                tile_count,
            )
            text = [
                prompt.replace("<image>", replacement.full, 1) for prompt in text
            ]

        return text, image_inputs

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalise `None` / a single item / a list into a list."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process prompts and images into a `BatchFeature`.

        Extra `**kwargs` are accepted but not used.
        """
        prompts = self._make_batch_input(text)
        image_list = self._make_batch_input(images)

        prompts, image_inputs = self._preprocess_image(
            text=prompts,
            images=image_list,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(prompts)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        """Initialise the base processor and remember the video token."""
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # Extra token used as the per-frame context token for videos.
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the `IMG_CONTEXT` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or `None` if unset/unknown."""
        token = self.video_token
        if token is None:
            return None
        return self.tokenizer.get_vocab().get(token, None)

    @property
    def supports_video(self) -> bool:
        """True when the video token resolves to a vocabulary id."""
        return self.video_token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each video to its stacked frame tensor (one tile per frame)."""
        # Videos always resolve with a 1..1 patch range and no thumbnail.
        tile_min, tile_max = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        pixel_values_per_video = []
        for clip in videos:
            pixel_values_per_video.append(
                video_to_pixel_values_internvl(
                    clip,
                    input_size=self.image_size,
                    min_num=tile_min,
                    max_num=tile_max,
                    use_thumbnail=False,
                )
            )
        return pixel_values_per_video

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, Any]]:
        """Convert the videos and expand `<video>` placeholders in the prompts.

        Returns the prompts unchanged with an empty dict when there are no
        videos or video is unsupported.
        """
        if len(videos) == 0 or not self.supports_video:
            return text, {}

        context_token = self.video_token
        assert context_token is not None

        per_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        frame_counts = [len(frames) for frames in per_video]
        video_inputs = {
            "pixel_values_flat_video": torch.cat(per_video),
            "video_num_patches": torch.tensor(frame_counts),
        }

        # Each video consumes the first remaining "<video>" in every prompt.
        for frames in per_video:
            replacement = self.get_video_repl(
                self.num_image_token,
                frames.shape[0],
                context_token,
            )
            text = [
                prompt.replace("<video>", replacement.full, 1) for prompt in text
            ]

        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process prompts, images and videos into a `BatchFeature`.

        Images are handled before videos. Extra `**kwargs` are accepted but
        not used.
        """
        prompts = self._make_batch_input(text)
        image_list = self._make_batch_input(images)
        video_list = self._make_batch_input(videos)

        prompts, image_inputs = self._preprocess_image(
            text=prompts,
            images=image_list,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        prompts, video_inputs = self._preprocess_video(
            text=prompts,
            videos=video_list,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(prompts)

        return BatchFeature(
            {**text_inputs, **image_inputs, **video_inputs},
            tensor_type=return_tensors,
        )

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the image placeholder text.

        The replacement is `feature_size` copies of `IMG_CONTEXT` wrapped in
        `IMG_START` / `IMG_END`; `num_patches` is not used here.
        """
        wrapped = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Build the video placeholder text, one labelled segment per frame.

        NOTE: `feature_size` is not read; each frame always gets
        `self.num_image_token` context tokens.

        Raises:
            NotImplementedError: If `num_patches` is `None`.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        per_frame = IMG_START + video_context_token * self.num_image_token + IMG_END

        # num_patches is equal to num_frames
        segments = []
        for i in range(num_patches):
            segments.append(f"Frame{i + 1}: {per_frame}")
        joined = "".join(segments)

        return PromptUpdateDetails.select_text(joined, video_context_token)
vllm/transformers_utils/processors/nano_nemotron_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# --------------------------------------------------------
# Adapted from
# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py
# under Apache-2.0 License
# LICENSE is in root directory.
# --------------------------------------------------------
import
math
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Sequence
from
dataclasses
import
dataclass
from
typing
import
Any
,
TypeVar
import
einops
import
numpy
as
np
import
numpy.typing
as
npt
import
regex
as
re
import
torch
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.model_executor.models.parakeet
import
ParakeetExtractor
from
vllm.multimodal.evs
import
compute_retained_tokens_count
from
vllm.multimodal.inputs
import
AudioItem
from
vllm.multimodal.processing.processor
import
PromptUpdateDetails
,
_seq2tokens
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
calculate_internvl_targets
,
get_internvl_target_ratios
_T
=
TypeVar
(
"_T"
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<image>"
AUDIO_START
=
"<so_start>"
AUDIO_END
=
"<so_end>"
AUDIO_CONTEXT
=
"<so_embedding>"
# Profiling
# MAX_FRAMES = 16
DEFAULT_NUM_TILES
=
12
def
calculate_timestamps
(
indices
:
list
[
int
]
|
torch
.
Tensor
,
frame_duration_ms
:
int
,
):
if
not
isinstance
(
indices
,
list
):
indices
=
indices
.
tolist
()
timestamps
=
[
int
(
i
)
*
frame_duration_ms
/
1000.0
for
i
in
indices
]
return
timestamps
def input_conditioner(
    x: torch.Tensor,
    norm_mean: torch.Tensor,
    norm_std: torch.Tensor,
):
    """Normalise `x` by subtracting `norm_mean` and dividing by `norm_std`."""
    centered = x - norm_mean
    return centered / norm_std
def dynamic_preprocess(
    image,
    *,
    image_size=512,
    max_num_tiles=12,
    use_thumbnail=True,
    idx=0,
):
    """Resize an image to the best-fitting grid and cut it into square tiles.

    Args:
        image: PIL image (converted to RGB if needed).
        image_size: Side length of one tile in pixels.
        max_num_tiles: Upper bound on the number of grid cells.
        use_thumbnail: Append a whole-image thumbnail when there is more
            than one tile.
        idx: Not used by this function.

    Returns:
        A list of `(3, image_size, image_size)` tensors scaled by `1 / 255`.
    """
    orig_width, orig_height = image.size

    candidate_grids = get_internvl_target_ratios(1, max_num_tiles)
    _, target_width, target_height = calculate_internvl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=candidate_grids,
        image_size=image_size,
        use_thumbnail=False,
    )

    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    pixels = np.asarray(rgb_image, dtype=np.uint8)
    # (H, W, 3) -> (1, H, W, 3) -> (1, 3, H, W)
    source = torch.from_numpy(pixels).unsqueeze(0).permute(0, 3, 1, 2)

    def bicubic_resize(tensor, size):
        return torch.nn.functional.interpolate(
            tensor,
            size=size,
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    resized = bicubic_resize(source, (target_height, target_width))

    batch, channels, height, width = resized.shape
    rows, cols = height // image_size, width // image_size

    # Split H and W into (rows, image_size) / (cols, image_size), bring the
    # grid axes to the front and flatten them into the batch axis.
    grid = resized.reshape(batch, channels, rows, image_size, cols, image_size)
    grid = grid.permute(0, 2, 4, 1, 3, 5)
    tiles = grid.reshape(batch * rows * cols, channels, image_size, image_size)
    tiles = tiles / 255.0

    if use_thumbnail and tiles.shape[0] > 1:
        thumbnail = bicubic_resize(source, (image_size, image_size)) / 255.0
        tiles = torch.cat([tiles, thumbnail], dim=0)

    return list(tiles)
def image_to_pixel_values(
    image: Image.Image,
    *,
    input_size: int,
    max_num: int,
    use_thumbnail: bool,
    idx: int,
) -> torch.Tensor:
    """Tile an image with `dynamic_preprocess` and stack the tiles on dim 0."""
    tiles = dynamic_preprocess(
        image,
        image_size=input_size,
        max_num_tiles=max_num,
        use_thumbnail=use_thumbnail,
        idx=idx,
    )
    return torch.stack(tiles)
def video_to_pixel_values(
    video: npt.NDArray,
    *,
    input_size: int,
    max_num_tiles: int = 1,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a video array to a channels-first tensor scaled by `1 / 255`.

    Args:
        video: Array laid out as (num_frames, H, W, C).
        input_size: Required frame side length; frames of any other size are
            bicubic-resized to `input_size` x `input_size`.
        max_num_tiles: Must be 1 (asserted).
        use_thumbnail: Not used by this function.

    Returns:
        Tensor laid out as (num_frames, C, input_size, input_size).
    """
    assert max_num_tiles == 1, "Video modality always uses one tile"

    # (num_frames, H, W, C) -> (num_frames, C, H, W)
    frames = torch.from_numpy(video).permute(0, 3, 1, 2)

    already_sized = frames.shape[2] == input_size and frames.shape[3] == input_size
    if not already_sized:
        frames = torch.nn.functional.interpolate(
            frames,
            size=(input_size, input_size),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    return frames / 255.0
class
DynamicResolutionImageTiler
:
CONV_MERGING
=
False
PIXEL_SHUFFLE
=
True
USE_THUMBNAIL
=
False
def
__init__
(
self
,
*
,
max_model_len
:
int
,
patch_size
:
int
,
min_num_patches
:
int
,
max_num_patches
:
int
,
downsample_ratio
:
int
,
norm_mean
:
Sequence
[
float
],
norm_std
:
Sequence
[
float
],
factor_max
:
float
=
1.0
,
use_thumbnail
:
bool
=
False
,
)
->
None
:
assert
use_thumbnail
is
False
,
"use_thumbnail is not supported"
self
.
_patch_size
:
int
=
patch_size
self
.
_max_model_len
=
max_model_len
self
.
_min_num_patches
=
min_num_patches
self
.
_max_num_patches
=
max_num_patches
if
max_num_patches
>
0
else
float
(
"inf"
)
self
.
_factor_max
=
factor_max
self
.
norm_mean
=
torch
.
tensor
(
norm_mean
).
reshape
(
3
,
1
,
1
)
self
.
norm_std
=
torch
.
tensor
(
norm_std
).
reshape
(
3
,
1
,
1
)
assert
downsample_ratio
<
1
reduction_factor
=
1
/
downsample_ratio
assert
reduction_factor
==
2.0
self
.
_downsample_ratio
=
int
(
reduction_factor
)
**
(
self
.
PIXEL_SHUFFLE
+
self
.
CONV_MERGING
)
assert
self
.
_downsample_ratio
==
2
def
_get_num_embeddings
(
self
,
width
:
int
,
height
:
int
)
->
int
:
num_patches
=
(
width
//
self
.
_patch_size
)
*
(
height
//
self
.
_patch_size
)
num_tokens
=
num_patches
//
(
self
.
_downsample_ratio
**
2
)
return
num_tokens
def
width_and_height_for_max_num_tokens_available
(
self
,
target_num_tokens_post_shuffle
:
int
,
)
->
tuple
[
int
,
int
]:
"""
TODO: optimize this so it squeezes closer to target number of tokens.
Calculate image dimensions that produce approximately `target` tokens after
pixel_shuffle.
With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
need 4*B patches to get B tokens.
Examples:
>>> PATCH_SIZE = 16
>>> DOWNSAMPLE_RATIO = 0.5
>>> tiler = DynamicResolutionImageTiler(
... max_model_len=16384,
... patch_size=PATCH_SIZE,
... downsample_ratio=DOWNSAMPLE_RATIO,
... min_num_patches=4,
... max_num_patches=0,
... )
>>> width, height = tiler.width_and_height_for_max_num_tokens_available(
... target_num_tokens_post_shuffle=8192,
... )
>>> assert width, height == (2880, 2880)
>>> assert (width // PATCH_SIZE) * (
... height // PATCH_SIZE
... ) // 2**2 == 8100 # tokens post-shuffle
>>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
"""
side_pixels
=
(
math
.
isqrt
(
target_num_tokens_post_shuffle
)
*
self
.
_downsample_ratio
*
self
.
_patch_size
)
assert
isinstance
(
side_pixels
,
int
)
and
side_pixels
%
self
.
_patch_size
==
0
return
side_pixels
,
side_pixels
def
max_num_tokens_available
(
self
,
text_prompt_length
:
int
)
->
int
:
return
self
.
_max_model_len
-
text_prompt_length
-
4
def
_images_to_pixel_values_lst
(
self
,
text_prompt_length
:
int
,
images
:
list
[
Image
.
Image
],
)
->
tuple
[
list
[
torch
.
Tensor
],
list
[
int
]]:
num_tokens_available
=
self
.
max_num_tokens_available
(
text_prompt_length
)
params_per_image
=
self
.
compute_params
(
images
,
num_tokens_available
)
feature_sizes
=
[]
images
=
[]
for
param
in
params_per_image
:
for
t
in
self
.
apply_params
(
param
):
assert
t
.
ndim
==
3
,
f
"
{
t
.
ndim
=
}
: expected 3 dim tensor"
images
.
append
(
t
)
feature_sizes
.
append
(
param
.
num_embeddings
)
return
images
,
feature_sizes
feature_size_cache
:
dict
[
Image
.
Image
,
int
]
=
{}
@
classmethod
def
get_cached_feature_size
(
cls
,
image
:
Image
.
Image
)
->
int
:
feature_size
=
cls
.
feature_size_cache
[
id
(
image
)]
# hard assert that we only use the feature size once
del
cls
.
feature_size_cache
[
id
(
image
)]
return
feature_size
@
dataclass
class
DynamicResolutionParams
:
media
:
Image
.
Image
num_tiles
:
int
num_embeddings
:
int
patch_size
:
tuple
[
int
,
int
]
def
apply_params
(
self
,
params
:
DynamicResolutionParams
)
->
list
[
torch
.
Tensor
]:
target_size
=
(
params
.
patch_size
[
1
]
*
self
.
_patch_size
,
params
.
patch_size
[
0
]
*
self
.
_patch_size
,
)
image
=
np
.
asarray
(
params
.
media
.
convert
(
"RGB"
)
if
params
.
media
.
mode
!=
"RGB"
else
params
.
media
,
dtype
=
np
.
uint8
,
)
resized_img
=
(
torch
.
nn
.
functional
.
interpolate
(
torch
.
from_numpy
(
image
).
unsqueeze
(
0
).
permute
(
0
,
3
,
1
,
2
),
size
=
target_size
,
mode
=
"bicubic"
,
align_corners
=
False
,
antialias
=
True
,
)
/
255.0
)
return
list
(
resized_img
)
def
process_media
(
self
,
media
:
Image
.
Image
,
num_tokens_available
:
int
,
)
->
tuple
[
DynamicResolutionParams
,
int
]:
"""Process a single media item and return its parameters.
Args:
media: The media item to process
num_tokens_available: Number of tokens available for this media
Returns:
DynamicResolutionParams for the media
"""
current_num_tokens_available
=
num_tokens_available
assert
isinstance
(
media
,
Image
.
Image
),
(
"Dynamic resolution is only supported for image media"
)
orig_width
,
orig_height
=
media
.
width
,
media
.
height
closest_patch_height
=
round
(
orig_height
/
self
.
_patch_size
+
0.5
)
closest_patch_width
=
round
(
orig_width
/
self
.
_patch_size
+
0.5
)
patches
=
closest_patch_height
*
closest_patch_width
factor
=
min
(
math
.
sqrt
(
current_num_tokens_available
/
patches
),
self
.
_factor_max
)
target_patch_height
=
math
.
floor
(
factor
*
closest_patch_height
)
target_patch_width
=
math
.
floor
(
factor
*
closest_patch_width
)
# Consider self._min_num_patches if > current_num_tokens_available.
if
(
current_num_tokens_available
>
self
.
_min_num_patches
and
target_patch_height
*
target_patch_width
<
self
.
_min_num_patches
):
up_factor
=
math
.
sqrt
(
self
.
_min_num_patches
/
(
target_patch_height
*
target_patch_width
)
)
target_patch_height
=
math
.
ceil
(
up_factor
*
target_patch_height
)
target_patch_width
=
math
.
ceil
(
up_factor
*
target_patch_width
)
# Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
# or by 4 when BOTH are enabled (two successive 2x reductions)
if
self
.
PIXEL_SHUFFLE
or
self
.
CONV_MERGING
:
required_divisor
=
4
if
(
self
.
PIXEL_SHUFFLE
and
self
.
CONV_MERGING
)
else
2
rem_h
=
target_patch_height
%
required_divisor
if
rem_h
!=
0
:
inc_h
=
required_divisor
-
rem_h
if
(
target_patch_height
+
inc_h
)
*
target_patch_width
<=
current_num_tokens_available
:
target_patch_height
+=
inc_h
else
:
target_patch_height
=
max
(
required_divisor
,
target_patch_height
-
rem_h
)
rem_w
=
target_patch_width
%
required_divisor
if
rem_w
!=
0
:
inc_w
=
required_divisor
-
rem_w
if
(
target_patch_height
*
(
target_patch_width
+
inc_w
)
<=
current_num_tokens_available
):
target_patch_width
+=
inc_w
else
:
target_patch_width
=
max
(
required_divisor
,
target_patch_width
-
rem_w
)
# Calculate embeddings for the main dynamic resolution image
num_embeddings
=
self
.
_get_num_embeddings
(
target_patch_width
*
self
.
_patch_size
,
target_patch_height
*
self
.
_patch_size
,
)
token_count
=
target_patch_width
*
target_patch_height
# Add thumbnail embeddings if enabled and image area is below threshold
num_tiles
=
1
# Base dynamic resolution image
return
self
.
DynamicResolutionParams
(
media
=
media
,
num_tiles
=
num_tiles
,
num_embeddings
=
num_embeddings
,
patch_size
=
(
target_patch_width
,
target_patch_height
),
),
token_count
def
compute_params
(
self
,
media_list
:
list
[
Image
.
Image
],
num_tokens_available
:
int
,
)
->
list
[
DynamicResolutionParams
]:
"""Compute parameters for all media with iterative token budgeting.
Args:
media_list: List of media items to process
num_tokens_available: Total number of tokens available across all media
Returns:
List of ImageTilingParams for each media item
"""
num_tokens_available
=
(
num_tokens_available
*
(
4
if
self
.
PIXEL_SHUFFLE
else
1
)
*
(
4
if
self
.
CONV_MERGING
else
1
)
)
# When the number of available token is too small,
# allow self._min_num_patches per media and let the sample be truncated.
num_tokens_available
=
max
(
num_tokens_available
,
self
.
_min_num_patches
*
len
(
media_list
)
)
# Clip the number of tokens available per media to >min and <max patches.
num_tokens_available_per_media
=
[
int
(
max
(
min
(
num_tokens_available
,
self
.
_max_num_patches
),
self
.
_min_num_patches
,
)
)
for
_
in
range
(
len
(
media_list
))
]
# prevent infinite loop in any case
for
_
in
range
(
10
):
# Step 1: Process each media with current token budget
params
=
[]
token_counts
=
[]
for
media
,
tokens_for_media
in
zip
(
media_list
,
num_tokens_available_per_media
):
param
,
token_count
=
self
.
process_media
(
media
,
tokens_for_media
)
params
.
append
(
param
)
token_counts
.
append
(
token_count
)
self
.
feature_size_cache
[
id
(
param
.
media
)]
=
param
.
num_embeddings
# Step 2: Check if total tokens is within budget
total_tokens
=
sum
(
token_counts
)
if
total_tokens
<=
num_tokens_available
:
# We're within budget, return the params
return
params
# Step 3: We're over budget, need to scale down
# Calculate scaling factor to get under budget
scaling_factor
=
num_tokens_available
/
total_tokens
# Recalculate token budgets for each media based on scaling
# Each media gets a proportional share of the total budget
scaled_down_num_tokens_available_per_media
=
[
max
(
self
.
_min_num_patches
,
int
(
token_count
*
scaling_factor
))
for
token_count
in
token_counts
]
scaled_down
=
any
(
[
scaled_down_num_tokens_available_per_media
[
i
]
<
num_tokens_available_per_media
[
i
]
for
i
in
range
(
len
(
num_tokens_available_per_media
))
]
)
# If there wasn't scaling down, we're stuck with min_num_patches per media,
# else try with the scaled down num_tokens_available_per_media.
if
not
scaled_down
:
num_tokens_available_per_media
=
[
self
.
_min_num_patches
]
*
len
(
media_list
)
else
:
num_tokens_available_per_media
=
(
scaled_down_num_tokens_available_per_media
)
ctx
=
f
"
{
params
=
}
{
total_tokens
=
}
{
num_tokens_available
=
}
"
raise
ValueError
(
f
"Should be unreachable - `return params` above must be reached:
{
ctx
}
"
)
@staticmethod
def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
    """Flatten every image into rows of patches and join them in one tensor.

    Each image is rearranged with the einops pattern
    ``"c (py yy) (px xx) -> (py px) (c yy xx)"``: it is cut into
    ``patch_size`` x ``patch_size`` tiles and each tile becomes one row.
    The rows of all images are concatenated along dim 0 and a leading
    dimension of size 1 is added.

    Args:
        images: Non-empty list of image tensors.
        patch_size: Side length of a square patch.

    Returns:
        The concatenated patch rows with an extra leading dimension.
    """
    assert len(images) > 0, "No images to stack"

    def to_patch_rows(img):
        # Number of whole patches along the last two dimensions.
        rows = img.shape[-2] // patch_size
        cols = img.shape[-1] // patch_size
        return einops.rearrange(
            img,
            "c (py yy) (px xx) -> (py px) (c yy xx)",
            py=rows,
            yy=patch_size,
            px=cols,
            xx=patch_size,
        )

    patch_rows = [to_patch_rows(img) for img in images]
    return torch.cat(patch_rows, dim=0).unsqueeze(0)
class BaseNanoNemotronVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *args,
        max_model_len: int,
        max_num_tiles: int | None = None,
        **kwargs,
    ) -> None:
        """Read the image geometry from ``config`` and set up tiling.

        Args:
            config: Model config; ``force_image_size``, ``patch_size``,
                ``downsample_ratio``, ``use_thumbnail``, ``norm_mean``,
                ``norm_std`` and ``vision_config.args`` are read from it.
            tokenizer: Tokenizer used to tokenize the prompt text.
            *args: Accepted but not used by this class.
            max_model_len: Forwarded to the dynamic-resolution tiler.
            max_num_tiles: Default tile limit; a falsy value selects
                ``DEFAULT_NUM_TILES``.
            **kwargs: Accepted but not used by this class.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
        image_size: int = config.force_image_size
        patch_size: int = config.patch_size
        downsample_ratio: int = config.downsample_ratio

        # Number of image tokens produced by one tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (downsample_ratio**2)
        )
        self.image_size = image_size
        self.use_thumbnail: bool = config.use_thumbnail
        # Shaped (1, 3, 1, 1) so they broadcast over batched 3-channel images.
        self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)

        # Only created when the vision config declares "min_num_patches".
        self.dynamic_tiler: DynamicResolutionImageTiler | None = None
        if self.use_dynamic_resolution(config):
            self.dynamic_tiler = DynamicResolutionImageTiler(
                max_model_len=max_model_len,
                patch_size=patch_size,
                downsample_ratio=downsample_ratio,
                min_num_patches=config.vision_config.args["min_num_patches"],
                max_num_patches=config.vision_config.args["max_num_patches"],
                norm_mean=config.norm_mean,
                norm_std=config.norm_std,
            )

    @staticmethod
    def use_dynamic_resolution(config: PretrainedConfig) -> bool:
        """Return whether ``config.vision_config.args`` has "min_num_patches"."""
        return "min_num_patches" in config.vision_config.args

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Token id of the image placeholder token (set by subclasses)."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement for one image (set by subclasses)."""
        raise NotImplementedError

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        max_num_tiles: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        The tile count comes from ``calculate_internvl_targets`` using the
        target ratios for 1..``max_num_tiles`` tiles, and is multiplied by
        the per-tile token count.
        """
        target_ratios = get_internvl_target_ratios(1, max_num_tiles)

        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            target_ratios=target_ratios,
            image_size=self.image_size,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        max_num_tiles: int,
    ) -> list[torch.Tensor]:
        """Convert each image to pixel values with ``image_to_pixel_values``."""
        return [
            image_to_pixel_values(
                image,
                input_size=self.image_size,
                max_num=max_num_tiles,
                use_thumbnail=self.use_thumbnail,
                idx=idx,
            )
            for idx, image in enumerate(images)
        ]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        max_num_tiles: int,
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand ``<image>`` placeholders and build the image model inputs.

        Args:
            text: Prompt batch; must contain exactly one string when images
                are given.
            images: Images, in the order their placeholders appear.
            max_num_tiles: Tile limit for the fixed-resolution path.

        Returns:
            The updated prompt batch and a dict of image inputs. With no
            images the text is returned unchanged with an empty dict.
        """
        if len(images) == 0:
            return text, {}

        image_inputs: dict[str, Any]
        if tiler := self.dynamic_tiler:
            # The tiler is given the text-only prompt length, so the
            # placeholders are stripped before tokenizing.
            sans_images = text[0].replace("<image>", "")
            text_prompt_length = len(
                self.tokenizer(sans_images, add_special_tokens=False).input_ids
            )
            pixel_values_lst, num_tokens_per_image = (
                tiler._images_to_pixel_values_lst(
                    text_prompt_length=text_prompt_length,
                    images=images,
                )
            )
            imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
            normalized = [
                input_conditioner(img, tiler.norm_mean, tiler.norm_std)
                for img in pixel_values_lst
            ]
            image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
            image_inputs = {
                "pixel_values_flat": normalized,
                "imgs_sizes": imgs_sizes,
                "num_tokens_per_image": num_tokens_per_image,
            }
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(images, max_num_tiles)
            image_num_patches = torch.tensor([len(item) for item in pixel_values_lst])
            pixel_values_flat = input_conditioner(
                torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
            )
            image_inputs = {
                "pixel_values_flat": pixel_values_flat,
                "image_num_patches": image_num_patches,
            }
            num_tokens_per_image = [
                self.num_image_token * len(item) for item in pixel_values_lst
            ]

        assert len(text) == 1, (
            "hf_processor is called on the output of get_dummy_text, "
            "which should be a single string"
        )
        # The capturing group keeps each "<image>" as its own list element,
        # interleaved with the surrounding text fragments.
        parts = [x for x in re.split(r"(<image>)", text[0]) if x]
        assert parts.count("<image>") == len(pixel_values_lst), (
            "the number of <image> tokens in the text should be the "
            "same as the number of images"
        )

        # strict=True keeps the length check between the two per-image lists.
        repl_args = list(zip(num_tokens_per_image, image_num_patches, strict=True))
        # Walk the parts and advance a separate image index, so the n-th
        # placeholder receives the n-th image even when text fragments sit
        # before or between placeholders.
        image_idx = 0
        for part_idx, part in enumerate(parts):
            if part != "<image>":
                continue
            feature_size, num_patches = repl_args[image_idx]
            parts[part_idx] = self.get_image_repl(feature_size, num_patches).full
            image_idx += 1
        text = ["".join(parts)]

        return text, image_inputs

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalize ``input_item`` to a list (``None`` becomes ``[]``)."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item

    @abstractmethod
    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        *,
        return_tensors: str | TensorType | None = None,
        max_num_tiles: int | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process text and images into model inputs (set by subclasses)."""
        raise NotImplementedError
class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
    """
    HF-style processor that adds video and audio handling on top of the
    image logic of the base processor.

    The video processing code is adapted from the video example at:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        max_model_len: int,
        max_num_tiles: int | None = None,
        video_token: str | None = None,
        video_pruning_rate: float | None = None,
    ) -> None:
        """Set up the base processor plus video and audio state.

        Args:
            config: Model config; an optional ``sound_config`` attribute
                enables audio extraction.
            tokenizer: Tokenizer used for prompts and special tokens.
            max_model_len: Forwarded to the base processor.
            max_num_tiles: Forwarded to the base processor.
            video_token: Video placeholder token, or ``None``.
            video_pruning_rate: Pruning rate; values above 0 enable the
                EVS token-count path in ``_preprocess_video``.
        """
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            max_model_len=max_model_len,
            max_num_tiles=max_num_tiles,
        )
        # Extra video token used for video processing.
        self.video_token = video_token
        self.video_pruning_rate = video_pruning_rate

        sound_config = getattr(config, "sound_config", None)
        self.audio_extractor: ParakeetExtractor | None = (
            None if sound_config is None else ParakeetExtractor(sound_config)
        )

        def encode_special(token):
            return tokenizer.encode(token, add_special_tokens=False)

        # Special tokens are tokenized once here so video processing does
        # not have to re-tokenize them for every frame.
        self._img_start_token_ids = encode_special(IMG_START)
        self._img_end_token_ids = encode_special(IMG_END)
        self._img_context_token_ids = encode_special(IMG_CONTEXT)

    @property
    def supports_video(self) -> bool:
        """Whether a video token id could be resolved."""
        token_id = self.video_token_id
        return token_id is not None

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or ``None`` if unavailable."""
        if self.video_token is not None:
            return self.tokenizer.get_vocab().get(self.video_token, None)
        return None

    @property
    def image_token_id(self) -> int:
        """Token id of ``IMG_CONTEXT``."""
        token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
        return token_id

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        max_num_tiles: int,
    ) -> list[torch.Tensor]:
        """Convert each video to pixel values with ``video_to_pixel_values``."""
        pixel_values = []
        for clip in videos:
            pixel_values.append(
                video_to_pixel_values(
                    clip,
                    input_size=self.image_size,
                    max_num_tiles=max_num_tiles,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return pixel_values

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[tuple[npt.NDArray, dict[str, Any]]],
        max_num_tiles: int,
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand ``<video>`` placeholders and build the video model inputs.

        Args:
            text: Prompt batch.
            videos: ``(frames, metadata)`` pairs; the metadata must provide
                ``"fps"`` and ``"frames_indices"``.
            max_num_tiles: Tile limit passed to the pixel-value conversion.

        Returns:
            The updated prompt batch and a dict of video inputs. With no
            videos, or when video is unsupported, the text is returned
            unchanged with an empty dict.
        """
        if len(videos) == 0 or not self.supports_video:
            return text, {}

        frame_arrays = [item[0] for item in videos]
        metadata_per_video = [item[1] for item in videos]
        pixel_values_per_video = self._videos_to_pixel_values_lst(
            frame_arrays,
            max_num_tiles=max_num_tiles,
        )

        # The frame duration is kept as an integer number of milliseconds so
        # timestamps are computed consistently. At preprocessing the fps
        # value is fp32, while at inference it is bf16; that leads to
        # inaccurate timestamps and, in rare cases, to a mismatching number
        # of output tokens for the tokenized frame prefixes.
        frame_durations_ms = [
            int(1000.0 / meta["fps"]) for meta in metadata_per_video
        ]
        frame_indices_per_video = [
            meta["frames_indices"] for meta in metadata_per_video
        ]

        patches_per_video = torch.tensor(
            [len(pv) for pv in pixel_values_per_video]
        )
        video_inputs = {
            "pixel_values_flat_video": input_conditioner(
                torch.cat(pixel_values_per_video), self.norm_mean, self.norm_std
            ),
            "video_num_patches": patches_per_video,
            "frames_indices": frame_indices_per_video,
            "frame_duration_ms": torch.tensor(frame_durations_ms),
        }

        side: int = self.config.force_image_size
        patch: int = self.config.patch_size
        ratio = self.config.downsample_ratio
        tokens_in_single_frame = int((side * side // patch**2) * (ratio**2))

        per_video = zip(
            pixel_values_per_video,
            frame_indices_per_video,
            frame_durations_ms,
        )
        for pixel_values, frames_indices, frame_duration_ms in per_video:
            num_frames = pixel_values.shape[0]

            pruning_rate = self.video_pruning_rate
            if pruning_rate is None or not pruning_rate > 0.0:
                tokens_per_frame = [tokens_in_single_frame] * num_frames
            else:
                # EVS: only the total token count matters here, because
                # these placeholders are not what is finally substituted.
                # Everything is assigned to the first frame.
                retained = compute_retained_tokens_count(
                    tokens_per_frame=tokens_in_single_frame,
                    num_frames=num_frames,
                    q=pruning_rate,
                )
                tokens_per_frame = [retained, *([0] * (num_frames - 1))]

            video_repl = self.get_video_repl(
                tokens_per_frame=tokens_per_frame,
                frames_indices=frames_indices,
                frame_duration_ms=frame_duration_ms,
                tokenizer=self.tokenizer,
                img_start_token_ids=self._img_start_token_ids,
                img_end_token_ids=self._img_end_token_ids,
                img_context_token_ids=self._img_context_token_ids,
            )

            # video_repl.full holds token ids; the HF processor flow works
            # on text, so they are decoded back before substitution.
            video_repl_text = self.tokenizer.decode(
                video_repl.full, skip_special_tokens=False
            )
            # Each video consumes the first remaining "<video>" placeholder.
            text = [
                prompt.replace("<video>", video_repl_text, 1) for prompt in text
            ]
        return text, video_inputs

    def _preprocess_audio(
        self,
        text: list[str],
        audios: list[npt.NDArray],
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand audio placeholders and build the audio model inputs.

        Args:
            text: Prompt batch; only ``text[0]`` is processed.
            audios: Audio arrays, in the order their placeholders appear.

        Returns:
            The updated prompt batch and a dict of audio inputs. With no
            audios the text is returned unchanged with an empty dict.

        Raises:
            ValueError: If the number of ``AUDIO_CONTEXT`` placeholders in
                the text differs from the number of audios.
        """
        if len(audios) == 0:
            return text, {}
        assert self.audio_extractor is not None
        extractor = self.audio_extractor

        # The capturing group keeps every placeholder as its own element.
        pattern = f"({re.escape(AUDIO_CONTEXT)})"
        parts = [piece for piece in re.split(pattern, text[0]) if piece]
        token_count = parts.count(AUDIO_CONTEXT)
        if token_count != len(audios):
            raise ValueError(
                "Number of audio tokens in text does not match the number "
                f"of audios (tokens={token_count}, audios={len(audios)})."
            )

        # Placeholders and audios are matched up in order of appearance.
        pending_audios = iter(audios)
        parts = [
            self.get_audio_repl(next(pending_audios)).full
            if piece == AUDIO_CONTEXT
            else piece
            for piece in parts
        ]
        text = ["".join(parts)]

        extracted = extractor(
            audios,
            sampling_rate=extractor.sampling_rate,
            return_tensors="pt",
        )
        feature_attention_mask = extracted.attention_mask
        audio_inputs = {
            "input_audio_features": extracted.input_features,
            "feature_attention_mask": feature_attention_mask,
            "audio_feature_lengths": feature_attention_mask.sum(dim=1),
        }
        return text, audio_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: tuple[npt.NDArray, dict[str, Any]]
        | list[tuple[npt.NDArray, dict[str, Any]]]
        | None = None,
        audios: AudioItem | list[AudioItem] | None = None,
        *,
        return_tensors: str | TensorType | None = None,
        max_num_tiles: int | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Process text, images, videos and audios into one ``BatchFeature``.

        Args:
            text: Prompt or list of prompts.
            images: Image or list of images.
            videos: ``(frames, metadata)`` pair or list of such pairs.
            audios: Audio item or list of audio items.
            return_tensors: Tensor type passed to ``BatchFeature``.
            max_num_tiles: Tile limit for images; ``None`` selects the
                processor default. Videos always use a limit of 1.
            **kwargs: Accepted but not used.
        """
        tile_limit = self.max_num_tiles if max_num_tiles is None else max_num_tiles

        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        videos = self._make_batch_input(videos)
        audios = self._make_batch_input(audios)

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            max_num_tiles=tile_limit,
        )
        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            max_num_tiles=1,
        )
        text, audio_inputs = self._preprocess_audio(
            text=text,
            audios=audios,
        )

        text_inputs = self.tokenizer(text, add_special_tokens=False)
        combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}

        if self.dynamic_tiler is not None:
            batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
            # Image inputs are added after construction so they are exempt
            # from the BatchFeature validation; they get .stack()-ed later
            # in _parse_and_validate_image_input.
            batch.update(image_inputs)
            return batch

        return BatchFeature(
            {**combined_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return ``IMG_START`` + ``feature_size`` context tokens + ``IMG_END``."""
        repl_full = f"{IMG_START}{IMG_CONTEXT * feature_size}{IMG_END}"
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_audio_repl(
        self,
        audio: npt.NDArray,
    ) -> PromptUpdateDetails[str]:
        """Return the audio replacement sized from ``len(audio)``."""
        assert self.audio_extractor is not None
        num_tokens = self.audio_extractor.audio_token_count(len(audio))
        repl_full = AUDIO_START + AUDIO_CONTEXT * num_tokens + AUDIO_END
        return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)

    @classmethod
    def get_video_repl(
        cls,
        *,
        tokens_per_frame: list[int],
        frames_indices: list[int],
        frame_duration_ms: int,
        tokenizer: TokenizerLike,
        img_start_token_ids: list[int],
        img_end_token_ids: list[int],
        img_context_token_ids: list[int],
    ) -> PromptUpdateDetails[list[int]]:
        """
        Build the prompt replacement for one video.

        The returned replacement is not what finally replaces the
        placeholder tokens; it only ensures the correct number of tokens is
        allocated. The actual replacement happens in embed_multimodal of
        NemotronH_Nano_VL_V2 (specifically in _process_video_input ->
        _create_final_video_embeddings), where the final embeddings are
        assembled from text embeddings for indicator tokens and video
        embeddings for video tokens.

        This one function serves every case; ``tokens_per_frame`` is what
        distinguishes them:
        - non EVS: the same constant value for all frames
        - EVS dummy: the distribution between frames does not matter, only
          the total number of tokens has to be correct
        - EVS real (called from get_real_video_repl_for_evs): a different
          value per frame

        Args:
            tokens_per_frame (list[int]): number of tokens per frame
            frames_indices (list[int]): frame indices
            frame_duration_ms (int): duration of each frame in milliseconds
            tokenizer (TokenizerLike): tokenizer used for frame separators
            img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
            img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
            img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
        """
        # TODO: Add support of frame_duration_ms to be None
        # At preprocessing step we should allow absent / metadata without
        # frames_indices field.
        if frame_duration_ms is None:
            frame_separators = [
                f"Frame {number}: "
                for number in range(1, len(tokens_per_frame) + 1)
            ]
        else:
            timestamps = calculate_timestamps(frames_indices, frame_duration_ms)

            assert len(timestamps) == len(tokens_per_frame), (
                "timestamps and tokens_per_frame must have the same length"
            )
            frame_separators = [
                f"Frame {number} sampled at {timestamp:.2f} seconds: "
                for number, timestamp in enumerate(timestamps, start=1)
            ]

        # Separators are tokenized on their own, and the pieces below are
        # appended as already-tokenized ids, so the tokenizer cannot merge
        # tokens across boundaries. This keeps tokenization consistent
        # regardless of the per-frame token counts.
        separator_token_ids = [
            _seq2tokens(tokenizer, separator) for separator in frame_separators
        ]

        token_ids = []
        for frame_idx, num_tokens in enumerate(tokens_per_frame):
            token_ids += separator_token_ids[frame_idx]
            token_ids += img_start_token_ids
            token_ids += img_context_token_ids * num_tokens
            token_ids += img_end_token_ids

        return PromptUpdateDetails.from_seq(token_ids)
vllm/transformers_utils/processors/nemotron_parse.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
from
typing
import
TypeVar
import
numpy
as
np
import
torch
from
PIL
import
Image
from
timm.data.constants
import
OPENAI_CLIP_MEAN
,
OPENAI_CLIP_STD
from
torchvision
import
transforms
as
T
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.tokenizers
import
TokenizerLike
# Generic element type for `_make_batch_input`, which wraps a single item
# (or None) into a list.
_T = TypeVar("_T")

# Default (height, width) that images are resized and padded to; used when
# no usable `final_size` is passed to the image processor.
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class NemotronParseImageProcessor:
    """
    Image processor for NemotronParse.

    Images are resized with their aspect ratio preserved, padded with white
    up to the target size, converted to tensors and normalized with the
    OpenAI CLIP mean and standard deviation.
    """

    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        """Store the target size and build the transforms.

        Args:
            final_size: ``(height, width)`` sequence, or a single number
                used for both sides. Anything else selects
                ``DEFAULT_FINAL_IMAGE_SIZE``.
            **kwargs: Accepted but not used.
        """
        # Normalize final_size to a pair of ints.
        is_pair = isinstance(final_size, (list, tuple)) and len(final_size) >= 2
        if is_pair:
            size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            size = (int(final_size),) * 2
        else:
            size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback
        self.final_size = size

        # Shaped (1, 3, 1, 1) so they broadcast over a batch of RGB images.
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        self._create_transforms()

    def _create_transforms(self):
        """Create the padding and tensor-conversion transform objects.

        Raises:
            ImportError: If ``albumentations`` is not installed.
        """
        try:
            import albumentations as A
        except ImportError as err:
            raise ImportError(
                "The package `albumentations` is required to use "
                "NemotronParse model. Please install it with `pip install "
                "albumentations`."
            ) from err

        # final_size may be a pair or a scalar; derive integer targets.
        size = self.final_size
        if not isinstance(size, (list, tuple)):
            self.target_height = self.target_width = int(size)
        else:
            self.target_height, self.target_width = (
                int(size[0]),
                int(size[1]),
            )

        import cv2

        # Pads with white up to the target size when the image is smaller.
        pad_to_target = A.PadIfNeeded(
            min_height=self.target_height,
            min_width=self.target_width,
            border_mode=cv2.BORDER_CONSTANT,
            fill=[255, 255, 255],
            p=1.0,
        )
        self.transform = A.Compose([pad_to_target])

        self.torch_transform = T.Compose([T.ToTensor()])

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Resize ``image`` while keeping its aspect ratio.

        Follows the LongestMaxSizeHW algorithm from the original
        custom_augmentations.py: the height is capped at the target height
        first, then the width is capped at the target width.
        """
        src_height, src_width = image.shape[:2]
        height_limit = self.target_height
        width_limit = self.target_width

        aspect_ratio = src_width / src_height
        out_height, out_width = src_height, src_width

        # Too tall: shrink so the height fits.
        if src_height > height_limit:
            out_height = height_limit
            out_width = int(out_height * aspect_ratio)

        # Still too wide: shrink further so the width fits.
        if out_width > width_limit:
            out_width = width_limit
            out_height = int(out_width / aspect_ratio)

        # cv2.INTER_LINEAR is used like in the original implementation.
        import cv2

        dsize = (out_width, out_height)
        return cv2.resize(image, dsize, interpolation=cv2.INTER_LINEAR)

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad ``image`` at the bottom and right with white (255).

        The input is returned unchanged when it already is at least as
        large as the target size in both dimensions.
        """
        cur_height, cur_width = image.shape[:2]

        extra_rows = max(0, self.target_height - cur_height)
        extra_cols = max(0, self.target_width - cur_width)
        if not (extra_rows or extra_cols):
            return image

        pad_width = ((0, extra_rows), (0, extra_cols))
        if len(image.shape) == 3:
            # Color image: the channel axis is left unpadded.
            pad_width = pad_width + ((0, 0),)

        return np.pad(
            image,
            pad_width,
            mode="constant",
            constant_values=255,
        )

    def preprocess(
        self,
        images: Image.Image | list[Image.Image],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s)

        Returns:
            A dict with the normalized batch under ``"pixel_values"``.
        """
        batch = images if isinstance(images, list) else [images]

        # PIL images are turned into numpy arrays; other inputs pass through.
        arrays = [
            np.asarray(item) if isinstance(item, Image.Image) else item
            for item in batch
        ]

        tensors = []
        for array in arrays:
            # Manual aspect-preserving resize (replaces LongestMaxSizeHW).
            resized = self._resize_with_aspect_ratio(array)

            if self.transform is None:
                # Fallback: only pad to the target size.
                padded = self._pad_to_size(resized)
            else:
                padded = self.transform(image=resized)["image"]

            tensor = self.torch_transform(padded)

            # A single-channel (grayscale) image is expanded to 3 channels.
            if tensor.shape[0] == 1:
                tensor = tensor.expand(3, -1, -1)

            tensors.append(tensor)

        stacked = torch.stack(tensors)

        return {"pixel_values": (stacked - self.norm_mean) / self.norm_std}

    def __call__(
        self, images: Image.Image | list[Image.Image], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """
    NemotronParse processor combining a tokenizer with the NemotronParse
    image processor.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        """Store config and tokenizer and create the image processor.

        The image processor is built with ``config.image_size`` as its
        final size. Extra keyword arguments are accepted but not used.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        self.image_processor = NemotronParseImageProcessor(
            final_size=config.image_size
        )

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalize ``input_item`` to a list (``None`` becomes ``[]``)."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize the text and preprocess the images into one batch.

        Args:
            text: Prompt or list of prompts.
            images: Image or list of images; may be omitted.
            return_tensors: Tensor type passed to ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer.
        """
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        # Images are processed before the text is tokenized.
        if images:
            image_inputs = self.image_processor(images)
        else:
            image_inputs = {}

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)

        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
vllm/transformers_utils/processors/nemotron_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
transformers.image_processing_utils_fast
import
BaseImageProcessorFast
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
InternVLProcessor
# Configure PIL to handle large images without warnings.
# This prevents DecompressionBombWarning for legitimate large images.
# NOTE(review): this assignment runs at import time and changes an attribute
# of the shared PIL `Image` module, so it is not limited to this processor.
# Confirm that disabling the limit for the whole process is intended.
Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
def build_transform(input_size: int):
    """Build the per-tile transform: RGB conversion, resize, tensor.

    Args:
        input_size: Side length of the square output; the resize uses
            bicubic interpolation.
    """

    def to_rgb(img):
        return convert_image_mode(img, "RGB")

    steps = [
        T.Lambda(to_rgb),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tile grid that best matches an image.

    Every candidate ``(rw, rh)`` is scored as the product of
    - a size factor: tiled area relative to the image area, capped at 0.6;
    - a closeness factor: the ratio between the grid's aspect ratio and
      ``aspect_ratio``, taken so that it is at most 1.
    The first candidate with the highest score is returned.

    Args:
        aspect_ratio: Width / height of the image.
        target_ratios: Candidate ``(rw, rh)`` grids.
        width: Image width.
        height: Image height.
        image_size: Side length of one tile.

    Returns:
        The winning ``(rw, rh)``, or ``(1, 1)`` if there are no candidates.
    """
    image_area = width * height

    def score(cols, rows):
        grid_aspect = cols / rows
        coverage = min((cols * rows * image_size * image_size) / image_area, 0.6)
        closeness = min(grid_aspect / aspect_ratio, aspect_ratio / grid_aspect)
        return coverage * closeness

    winner = (1, 1)
    winner_score = float("-inf")
    for cols, rows in target_ratios:
        candidate = score(cols, rows)
        # Strict comparison: on ties the earlier candidate is kept.
        if candidate > winner_score:
            winner, winner_score = (cols, rows), candidate

    return winner
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Compute the tile count and resize target for an image.

    Args:
        orig_width: Original image width.
        orig_height: Original image height.
        target_ratios: Candidate ``(columns, rows)`` grids.
        image_size: Side length of one tile.
        use_thumbnail: Whether one extra tile is counted when the grid has
            more than one tile.

    Returns:
        ``(blocks, target_width, target_height)``.
    """
    # Choose the grid closest to the image's aspect ratio.
    grid_cols, grid_rows = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    blocks = grid_cols * grid_rows
    # A thumbnail only adds a tile when the grid has more than one tile.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_cols, image_size * grid_rows
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize ``image`` to a tile grid and cut it into square tiles.

    Args:
        image: Source image.
        target_ratios: Candidate ``(columns, rows)`` grids.
        image_size: Side length of one tile.
        use_thumbnail: Whether to append a resized copy of the whole image
            when more than one tile was produced.

    Returns:
        The tiles in row-major order, optionally followed by the thumbnail.
    """
    orig_width, orig_height = image.size

    # The thumbnail is excluded here; it is appended separately below.
    blocks, target_width, target_height = calculate_nemotron_vl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    def tile_box(index):
        row, col = divmod(index, tiles_per_row)
        left, top = col * image_size, row * image_size
        return (left, top, left + image_size, top + image_size)

    processed_images = [resized_img.crop(tile_box(i)) for i in range(blocks)]

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid whose tile count is within the limits.

    Args:
        min_num: Minimum number of tiles (inclusive).
        max_num: Maximum number of tiles (inclusive).

    Returns:
        The unique grids with ``min_num <= i * j <= max_num``, sorted by
        tile count.
    """

    def tile_count(grid):
        return grid[0] * grid[1]

    # Grids are added in the same n -> i -> j order as a nested
    # comprehension would, so the set (and the order of ties after the
    # sort) comes out the same.
    unique_grids = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    unique_grids.add((i, j))

    return sorted(unique_grids, key=tile_count)
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor.

    Args:
        image: Source image.
        input_size: Side length of one tile.
        min_num: Minimum number of tiles.
        max_num: Maximum number of tiles.
        use_thumbnail: Whether a thumbnail tile may be appended.
        transform: Transform applied to each tile; when ``None`` one is
            created with ``build_transform(input_size=input_size)``.

    Returns:
        The transformed tiles stacked along a new first dimension.
    """
    candidate_grids = get_nemotron_vl_target_ratios(min_num, max_num)

    tile_transform = transform
    if tile_transform is None:
        tile_transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([tile_transform(tile) for tile in tiles])
class
NemotronVLProcessor
(
InternVLProcessor
):
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<image>"
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    image_processor: BaseImageProcessorFast,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> None:
    """Store the collaborators and resolve the tiling settings.

    Args:
        config: Model config; ``force_image_size``, ``patch_size`` and
            ``downsample_ratio`` are read from it.
        tokenizer: Tokenizer for the prompt text.
        image_processor: Image processor; supplies the default maximum
            tile count and the thumbnail flag.
        min_dynamic_patch: Minimum tile count; defaults to 1.
        max_dynamic_patch: Maximum tile count; defaults to
            ``image_processor.max_num_tiles``.
        dynamic_image_size: Whether dynamic tiling is on; defaults to True.
    """
    # Only ABC.__init__ is invoked here, not the parent class initializer.
    ABC.__init__(self)
    self.config = config
    self.tokenizer = tokenizer
    self.image_processor = image_processor

    image_size: int = config.force_image_size
    patch_size: int = config.patch_size

    min_dynamic_patch = 1 if min_dynamic_patch is None else min_dynamic_patch
    assert isinstance(min_dynamic_patch, int)

    # NOTE(review): this default dereferences image_processor, while the
    # use_thumbnail branch below tolerates image_processor being None.
    # Confirm which of the two is intended.
    if max_dynamic_patch is None:
        max_dynamic_patch = self.image_processor.max_num_tiles
    assert isinstance(max_dynamic_patch, int)

    dynamic_image_size = True if dynamic_image_size is None else dynamic_image_size
    assert isinstance(dynamic_image_size, bool)

    # Number of image tokens produced by one tile.
    tokens_per_tile = (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
    self.num_image_token = int(tokens_per_tile)
    self.image_size = image_size
    self.min_dynamic_patch = min_dynamic_patch
    self.max_dynamic_patch = max_dynamic_patch
    self.dynamic_image_size = dynamic_image_size

    # The image processor's flag wins; the config is only a fallback.
    self.use_thumbnail = (
        image_processor.use_thumbnail
        if image_processor is not None
        else getattr(config, "use_thumbnail", True)
    )
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
self
.
IMG_CONTEXT
]
def
_get_transform
(
self
)
->
T
.
Compose
:
return
build_transform
(
input_size
=
self
.
image_size
)
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
=
calculate_nemotron_vl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
return
[
image_to_pixel_values_nemotron_vl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
transform
=
self
.
_get_transform
(),
)
for
image
in
images
]
def
_replace_image_tokens
(
self
,
text
:
list
[
str
],
pixel_values_lst
:
list
[
torch
.
Tensor
],
)
->
list
[
str
]:
"""Replace <image> placeholders with image tokens."""
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT
=
image_repl
.
full
.
replace
(
"<image>"
,
"<NVL_IMG_CONTEXT>"
)
text
=
[
t
.
replace
(
"<image>"
,
NVL_IMAGE_CONTEXT
,
1
)
for
t
in
text
]
return
[
t
.
replace
(
"<NVL_IMG_CONTEXT>"
,
self
.
IMG_CONTEXT
)
for
t
in
text
]
def
_preprocess_image
(
self
,
text
:
list
[
str
],
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
torch
.
Tensor
]]:
if
len
(
images
)
==
0
:
image_inputs
=
{}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst
]
),
}
text
=
self
.
_replace_image_tokens
(
text
,
pixel_values_lst
)
return
text
,
image_inputs
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
self
.
IMG_CONTEXT
*
feature_size
repl_full
=
self
.
IMG_START
+
repl_features
+
self
.
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
self
.
IMG_CONTEXT
)
# SigLIP normalization constants
# Per-channel mean/std handed to T.Normalize inside build_siglip_transform.
SIGLIP_MEAN
=
(
0.5
,
0.5
,
0.5
)
SIGLIP_STD
=
(
0.5
,
0.5
,
0.5
)
def build_siglip_transform(input_size: int):
    """Return the base transform followed by SigLIP normalization.

    The pipeline is ``build_transform(input_size=input_size)`` and then
    ``T.Normalize`` with ``SIGLIP_MEAN`` / ``SIGLIP_STD``, wrapped in a
    ``T.Compose``.
    """
    base_transform = build_transform(input_size=input_size)
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base_transform, siglip_normalize])
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for the LlamaNemotronVL embedding model.

    Differences from NemotronVLProcessor:
    - the per-tile transform is the SigLIP one (base transform + normalization)
    - the image context token is <IMG_CONTEXT> rather than <image>
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Resolve tiling settings, then delegate to the parent initializer.

        Each setting is taken from the explicit argument when given, otherwise
        from ``processor_config``, otherwise from the attribute on ``config``,
        otherwise from a literal default (``1``, ``1`` and ``True``).
        The parent is always initialised with ``image_processor=None``.
        """
        if min_dynamic_patch is None:
            min_from_config = getattr(config, "min_dynamic_patch", 1)
            min_dynamic_patch = processor_config.get(
                "min_input_tiles", min_from_config
            )

        if max_dynamic_patch is None:
            max_from_config = getattr(config, "max_dynamic_patch", 1)
            max_dynamic_patch = processor_config.get(
                "max_input_tiles", max_from_config
            )

        if dynamic_image_size is None:
            dynamic_from_config = getattr(config, "dynamic_image_size", True)
            dynamic_image_size = processor_config.get(
                "dynamic_image_size", dynamic_from_config
            )

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Use the SigLIP transform (adds normalization) for every tile."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand one ``<image>`` placeholder per image, directly.

        The replacement text is built from <IMG_CONTEXT>, never <image>, so a
        later pass cannot match tokens inserted by an earlier one and no
        temporary placeholder is required.
        """
        for tiles in pixel_values_lst:
            tile_count = tiles.shape[0]
            replacement = self.get_image_repl(
                tile_count * self.num_image_token, tile_count
            )
            text = [
                prompt.replace("<image>", replacement.full, 1) for prompt in text
            ]
        return text
vllm/transformers_utils/processors/nvlm_d.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
# --------------------------------------------------------
# NVLM-D
# Copyright (c) 2024 NVIDIA
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
.internvl
import
BaseInternVLProcessor
# Padding token: NVLMProcessor looks it up in the tokenizer vocab for
# image_token_id and repeats it after every tile tag in get_image_repl.
IMG_PAD
=
"<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style processor that emits NVLM-D tile-tagged image text."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_PAD``."""
        return self.tokenizer.get_vocab()[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        Every tile contributes a position tag followed by
        ``feature_size // num_patches`` copies of ``IMG_PAD``; the whole
        sequence is wrapped in ``<Image>`` ... ``</Image>``.

        Args:
            feature_size: Total number of ``IMG_PAD`` tokens for the image.
            num_patches: Number of tiles (including the thumbnail, if any).

        Raises:
            NotImplementedError: If ``num_patches`` is ``None``.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        if self.use_thumbnail:
            # The last tile is tagged as the global thumbnail, so only
            # ``num_patches - 1`` tiles receive a numbered tag.
            tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
            tile_pos_identifiers.append("<tile_global_thumbnail>")
        else:
            # Without a thumbnail every tile needs its own numbered tag;
            # otherwise fewer than ``feature_size`` pad tokens are emitted.
            tile_pos_identifiers = [
                f"<tile_{i}>" for i in range(1, num_patches + 1)
            ]

        context_size = feature_size // num_patches
        features = "".join(
            identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
        )

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"

        return PromptUpdateDetails.select_text(repl, IMG_PAD)
vllm/transformers_utils/processors/skyworkr1v.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
# Prompt markers: SkyworkR1VProcessor.get_image_repl emits
# IMG_START + IMG_CONTEXT * feature_size + IMG_END for each image.
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
# Per-channel mean/std handed to T.Normalize inside build_transform.
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Return the per-tile preprocessing pipeline.

    Steps, in order: convert to RGB, bicubic resize to
    ``(input_size, input_size)``, ``ToTensor``, then normalize with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    resize = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    to_tensor = T.ToTensor()
    normalize = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    return T.Compose([to_rgb, resize, to_tensor, normalize])
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the grid from ``target_ratios`` closest to ``aspect_ratio``.

    The candidate with the smallest ``abs(aspect_ratio - cols / rows)`` wins.
    On an exact tie a later candidate replaces the current choice only when
    ``width * height`` exceeds half the pixel area of that candidate grid
    (``0.5 * image_size**2 * cols * rows``).

    Returns:
        The selected ``(cols, rows)`` entry, or ``(1, 1)`` when
        ``target_ratios`` is empty.
    """
    chosen = (1, 1)
    smallest_gap = float("inf")
    pixel_area = width * height

    for candidate in target_ratios:
        cols = candidate[0]
        rows = candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue

        is_tie = gap == smallest_gap
        if is_tie and pixel_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Return the effective ``(min, max)`` tile counts.

    With dynamic sizing disabled both bounds collapse to ``1``. Otherwise the
    given bounds are used, and the maximum grows by one when a thumbnail is
    requested and the maximum is not ``1``.
    """
    if not dynamic_image_size:
        # max is 1 here, so the thumbnail adjustment can never apply.
        return 1, 1

    upper = max_dynamic_patch
    if use_thumbnail and upper != 1:
        upper += 1

    return min_dynamic_patch, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate every ``(cols, rows)`` grid with ``min_num <= cols*rows <= max_num``.

    Returns:
        The distinct grids sorted by tile count (``cols * rows``), ascending.
    """
    grids: set[tuple[int, int]] = set()
    for tile_budget in range(min_num, max_num + 1):
        for cols in range(1, tile_budget + 1):
            for rows in range(1, tile_budget + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))

    ordered = list(grids)
    ordered.sort(key=lambda grid: grid[0] * grid[1])
    return ordered
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose a tile grid for an image and report its size.

    Returns:
        ``(blocks, target_width, target_height)`` where the target size is
        ``image_size`` times the chosen grid and ``blocks`` is the grid's tile
        count, plus one when ``use_thumbnail`` is set and the grid has more
        than one tile.
    """
    # Grid whose aspect ratio is closest to the image's.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols = grid[0]
    rows = grid[1]

    tile_count = cols * rows
    # A thumbnail is only added when the image is actually split.
    if use_thumbnail and tile_count != 1:
        tile_count += 1

    return tile_count, image_size * cols, image_size * rows
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize ``image`` to the chosen grid and cut it into square tiles.

    Tiles are produced row by row, left to right. When ``use_thumbnail`` is
    set and more than one tile was produced, the whole image resized to
    ``(image_size, image_size)`` is appended as the last element.
    """
    src_width, src_height = image.size

    # Tile count here excludes the thumbnail; it is appended at the end.
    tile_count, canvas_width, canvas_height = calculate_skyworkr1v_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    canvas = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size

    tiles = []
    for index in range(tile_count):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile ``image`` and stack the transformed tiles into one tensor.

    Candidate grids come from ``get_skyworkr1v_target_ratios(min_num, max_num)``;
    each tile from ``dynamic_preprocess_skyworkr1v`` is passed through
    ``build_transform(input_size=input_size)`` before stacking.
    """
    ratios = get_skyworkr1v_target_ratios(min_num, max_num)
    tile_transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([tile_transform(tile) for tile in tiles])
class SkyworkR1VProcessor:
    """
    Hand-written processor for SkyworkR1V, which does not ship an HF
    processor of its own.

    The image-token insertion logic is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config/tokenizer and resolve tiling settings.

        Any tiling argument left as ``None`` is read from the attribute of the
        same name on ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_cfg = config.vision_config
        image_size: int = vision_cfg.image_size
        patch_size: int = vision_cfg.patch_size

        min_dynamic_patch = (
            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Context tokens contributed by a single tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT``."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        The text is ``feature_size`` copies of ``IMG_CONTEXT`` wrapped in
        ``IMG_START`` / ``IMG_END``; ``num_patches`` is not used here.
        """
        wrapped = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Return effective ``(min, max)`` tile counts.

        ``None`` arguments are replaced by the values stored on the instance
        before delegating to ``resolve_skyworkr1v_min_max_num``.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tile grids for the resolved ``(min, max)``."""
        bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower, upper = bounds

        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of context tokens for an image of the given size."""
        ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        tile_count, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return tile_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image to a stacked tensor of transformed tiles."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_skyworkr1v(
                picture,
                input_size=self.image_size,
                min_num=lower,
                max_num=upper,
                use_thumbnail=self.use_thumbnail,
            )
            for picture in images
        ]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one ``BatchFeature``.

        A single string or image is wrapped in a list; ``None`` becomes an
        empty list. For every image, the first remaining ``<image>`` in each
        prompt is replaced by that image's expanded token text before
        tokenization. When images are present the output additionally holds
        ``pixel_values_flat`` and ``image_num_patches``.
        """
        if text is None:
            text = []
        elif not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        elif not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if len(images) != 0:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            for tiles in pixel_values_lst:
                tile_count = tiles.shape[0]
                replacement = self.get_image_repl(
                    tile_count * self.num_image_token, tile_count
                )
                text = [
                    prompt.replace("<image>", replacement.full, 1) for prompt in text
                ]

        text_inputs = self.tokenizer(text)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment