Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f3403243
Unverified
Commit
f3403243
authored
Mar 17, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 17, 2026
Browse files
[1/2] Move InternVL-based processors (#37260)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
2660b928
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3252 additions
and
3099 deletions
+3252
-3099
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+1
-1
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+1
-1
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_nemotron_vl.py
+1
-1
vllm/model_executor/models/eagle2_5_vl.py
vllm/model_executor/models/eagle2_5_vl.py
+1
-81
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+1
-374
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+7
-578
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+16
-1017
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_parse.py
+1
-232
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nemotron_vl.py
+4
-404
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+1
-33
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+2
-377
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/__init__.py
+18
-0
vllm/transformers_utils/processors/eagle2_5_vl.py
vllm/transformers_utils/processors/eagle2_5_vl.py
+85
-0
vllm/transformers_utils/processors/h2ovl.py
vllm/transformers_utils/processors/h2ovl.py
+390
-0
vllm/transformers_utils/processors/internvl.py
vllm/transformers_utils/processors/internvl.py
+603
-0
vllm/transformers_utils/processors/nano_nemotron_vl.py
vllm/transformers_utils/processors/nano_nemotron_vl.py
+1032
-0
vllm/transformers_utils/processors/nemotron_parse.py
vllm/transformers_utils/processors/nemotron_parse.py
+245
-0
vllm/transformers_utils/processors/nemotron_vl.py
vllm/transformers_utils/processors/nemotron_vl.py
+410
-0
vllm/transformers_utils/processors/nvlm_d.py
vllm/transformers_utils/processors/nvlm_d.py
+44
-0
vllm/transformers_utils/processors/skyworkr1v.py
vllm/transformers_utils/processors/skyworkr1v.py
+389
-0
No files found.
tests/models/multimodal/processing/test_h2ovl.py
View file @
f3403243
...
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
max_num
:
int
,
):
from
vllm.
model_executor.model
s.h2ovl
import
(
from
vllm.
transformers_utils.processor
s.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
,
)
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
f3403243
...
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
max_num
:
int
,
):
from
vllm.
model_executor.model
s.internvl
import
(
from
vllm.
transformers_utils.processor
s.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
,
)
...
...
tests/models/multimodal/processing/test_nemotron_vl.py
View file @
f3403243
...
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
max_num
:
int
,
):
from
vllm.
model_executor.model
s.nemotron_vl
import
(
from
vllm.
transformers_utils.processor
s.nemotron_vl
import
(
calculate_nemotron_vl_targets
,
get_nemotron_vl_target_ratios
,
)
...
...
vllm/model_executor/models/eagle2_5_vl.py
View file @
f3403243
...
...
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.sequence
import
IntermediateTensors
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.eagle2_5_vl
import
Eagle2_5_VLProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
...
...
@@ -27,13 +26,9 @@ from .interfaces import (
SupportsPP
,
)
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
)
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
...
...
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
)
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Builds on BaseInternVLProcessor but initializes every attribute itself
    and reads most settings from the config with fallbacks.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config/tokenizer and derive the image/patch settings.

        Explicit keyword arguments win; otherwise the value comes from
        ``config`` with a built-in fallback (1, 12 and True respectively).
        """
        # The base-class __init__ is deliberately not called (avoids its
        # config manipulation); all attributes are set here instead.
        self.config = config
        self.tokenizer = tokenizer

        # A truthy ``force_image_size`` on the config overrides the
        # vision-config image size.
        image_size: int = config.vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Number of image tokens produced per tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Return the image token ID from the config, else from the vocab.

        Raises:
            ValueError: if the config has no ``image_token_index`` and the
                tokenizer vocabulary lacks ``IMG_CONTEXT``.
        """
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index

        # Fall back to looking up the IMG_CONTEXT token in the vocabulary.
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for one image.

        The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``; ``num_patches`` is unused.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = IMG_START + context_run + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)
class
Eagle2_5_VLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Eagle2.5-VL model."""
...
...
vllm/model_executor/models/h2ovl.py
View file @
f3403243
...
...
@@ -11,7 +11,6 @@
from
collections.abc
import
Mapping
,
Sequence
import
torch
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
...
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
ProcessorInputs
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
TimingContext
,
)
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.h2ovl
import
H2OVLProcessor
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
InternVLChatModel
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) patch counts.

    Without dynamic image sizing both bounds collapse to 1. Otherwise the
    maximum is bumped by one when a thumbnail is used and the maximum is
    not already 1.
    """
    if not dynamic_image_size:
        # Both bounds are 1, so the thumbnail bump can never apply.
        return 1, 1

    if use_thumbnail and max_dynamic_patch != 1:
        return min_dynamic_patch, max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return the candidate tile grids, optionally filtered by a prior grid.

    Starts from ``get_internvl_target_ratios(min_num, max_num)``. When
    ``prior_aspect_ratio`` is given, a candidate is kept only if neither of
    its components evenly divides the matching prior component.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates

    kept = []
    for ratio in candidates:
        if prior_aspect_ratio[0] % ratio[0] == 0:
            continue
        if prior_aspect_ratio[1] % ratio[1] == 0:
            continue
        kept.append(ratio)
    return kept
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the best tile grid for an image and derive its target size.

    Returns:
        ``(blocks, target_width, target_height, target_aspect_ratio)``
        where ``blocks`` includes one extra thumbnail tile when
        ``use_thumbnail`` is set and the grid has more than one tile.
    """
    # Choose the candidate grid closest to the image's aspect ratio.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    # Target dimensions are whole multiples of the tile size.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]

    tile_count = grid[0] * grid[1]
    if use_thumbnail and tile_count != 1:
        # Account for the thumbnail appended after the tiles.
        tile_count += 1

    return tile_count, target_width, target_height, grid
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Resize an image to its target grid and cut it into square tiles.

    Returns:
        The tiles in row-major order (plus a trailing thumbnail when
        ``use_thumbnail`` is set and more than one tile was produced) and
        the chosen grid as ``(columns, rows)``.
    """
    width, height = image.size

    # The thumbnail is appended below, so it is excluded from the count here.
    blocks, target_width, target_height, target_aspect_ratio = (
        calculate_h2ovl_targets(
            orig_width=width,
            orig_height=height,
            target_ratios=target_ratios,
            image_size=image_size,
            use_thumbnail=False,
        )
    )

    resized = image.resize((target_width, target_height))

    tiles = []
    for index in range(blocks):
        # Row-major position of this tile inside the resized image.
        row, col = divmod(index, target_width // image_size)
        left = col * image_size
        upper = row * image_size
        tiles.append(
            resized.crop((left, upper, left + image_size, upper + image_size))
        )

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, target_aspect_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Tile one image and stack the transformed tiles into a tensor.

    Returns:
        The stacked tile tensor and the grid chosen by
        ``dynamic_preprocess_h2ovl``.
    """
    ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=ratios,
    )

    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, chosen_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert one image into its stacked tile tensor.

    Without MSAC a single preprocessing pass is run with the given
    settings. With MSAC the image is processed twice (``min_num`` 1, then
    3 with the first pass's grid as prior), always with a thumbnail, and
    the result is: second-pass tiles, first-pass tiles, then the
    second-pass thumbnail.
    """
    if not use_msac:
        single_pass, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return single_pass

    # First pass: unconstrained grid search.
    first_pass, first_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )

    # Second pass: grids filtered against the first pass's choice.
    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_ratio,
    )

    # Drop each pass's last entry, then append the second pass's last entry.
    return torch.cat([second_pass[:-1], first_pass[:-1], second_pass[-1:]], 0)
class H2OVLProcessor(BaseInternVLProcessor):
    """InternVL-style processor for H2OVL with optional MSAC tiling."""

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        """Initialize the base processor and resolve the MSAC flag.

        ``use_msac`` falls back to ``config.use_msac`` when not given.
        """
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        if use_msac is None:
            use_msac = config.use_msac
        assert isinstance(use_msac, bool)

        self.use_msac = use_msac

    @property
    def image_token_id(self) -> int:
        """Vocabulary ID of the ``IMG_CONTEXT`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for one image.

        ``IMG_CONTEXT`` is repeated ``feature_size`` times between
        ``IMG_START`` and ``IMG_END``; ``num_patches`` is unused.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = IMG_START + context_run + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve (min, max) patch counts, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Return candidate tile grids for the resolved patch bounds.

        ``override_min_num``, when given, replaces the resolved minimum
        before the grids are generated.
        """
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        if override_min_num is not None:
            lower = override_min_num

        return get_h2ovl_target_ratios(
            lower,
            upper,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        With MSAC the patch count is the sum of both passes' counts
        minus one; otherwise it is the single-pass count.
        """
        if use_msac is None:
            use_msac = self.use_msac
        use_thumbnail = self.use_thumbnail

        if not use_msac:
            ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            patch_count, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=ratios,
                use_thumbnail=use_thumbnail,
            )
            return patch_count * self.num_image_token

        # First pass: minimum forced to 1, thumbnail always counted.
        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        first_count, _, _, first_grid = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=first_ratios,
            use_thumbnail=True,
        )

        # Second pass: minimum forced to 3, filtered by the first grid.
        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=first_grid,
            override_min_num=3,
        )
        second_count, _, _, _ = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=second_ratios,
            use_thumbnail=True,
        )

        patch_count = first_count + second_count - 1
        return patch_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its tile tensor.

        MSAC is only applied when exactly one image is passed.
        """
        msac_enabled = self.use_msac if len(images) == 1 else False

        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        per_image = []
        for img in images:
            per_image.append(
                image_to_pixel_values_h2ovl(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=msac_enabled,
                )
            )
        return per_image
class
H2OVLProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
H2OVLProcessor
:
return
self
.
ctx
.
init_processor
(
...
...
vllm/model_executor/models/internvl.py
View file @
f3403243
...
...
@@ -7,16 +7,13 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
abc
import
ABC
,
abstractmethod
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
,
TypeVar
from
typing
import
Annotated
,
Literal
,
TypeAlias
,
TypeVar
import
numpy.typing
as
npt
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
,
PretrainedConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
...
...
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
BaseProcessingInfo
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processors.internvl
import
(
BaseInternVLProcessor
,
InternVLProcessor
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
...
...
@@ -60,13 +58,6 @@ from .interfaces import (
)
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
class
InternVLImagePixelInputs
(
TensorSchema
):
"""
...
...
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
InternVLVideoInputs
:
TypeAlias
=
InternVLVideoPixelInputs
|
InternVLVideoEmbeddingInputs
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the image-to-tensor pipeline for a square ``input_size``.

    Steps, in order: convert to RGB, bicubic resize to
    ``(input_size, input_size)``, convert to tensor, normalize with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the candidate grid whose ratio is closest to ``aspect_ratio``.

    On an exact tie, the later candidate wins only if the image area
    exceeds half the area that candidate's grid would cover. Returns
    ``(1, 1)`` when ``target_ratios`` is empty.
    """
    pixel_area = width * height
    winner = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            smallest_gap = gap
            winner = candidate
            continue

        if gap != smallest_gap:
            continue

        # Tie: prefer this candidate only for sufficiently large images.
        if pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            winner = candidate

    return winner
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) patch counts.

    Without dynamic image sizing both bounds are 1. Otherwise the maximum
    gains one slot for the thumbnail when ``use_thumbnail`` is set and the
    maximum is not 1.
    """
    if not dynamic_image_size:
        # With a maximum of 1 the thumbnail bump never applies.
        return 1, 1

    thumbnail_slot = 1 if (use_thumbnail and max_dynamic_patch != 1) else 0
    return min_dynamic_patch, max_dynamic_patch + thumbnail_slot
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every grid whose tile count lies in ``[min_num, max_num]``.

    The result holds unique ``(i, j)`` pairs sorted by ``i * j``.
    """

    def tile_count(grid: tuple[int, int]) -> int:
        return grid[0] * grid[1]

    # Insertion order is kept as-is: it decides the order of equal-area
    # grids after the stable sort below.
    unique_grids = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    unique_grids.add((i, j))

    return sorted(unique_grids, key=tile_count)
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Pick the best tile grid for an image and derive its target size.

    Returns:
        ``(blocks, target_width, target_height)`` where ``blocks`` counts
        one extra thumbnail tile when ``use_thumbnail`` is set and the
        grid has more than one tile.
    """
    # Choose the candidate grid closest to the image's aspect ratio.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    # Target dimensions are whole multiples of the tile size.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]

    tile_count = grid[0] * grid[1]
    if use_thumbnail and tile_count != 1:
        # Account for the thumbnail appended after the tiles.
        tile_count += 1

    return tile_count, target_width, target_height
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize an image to its target grid and cut it into square tiles.

    Returns the tiles in row-major order, followed by a thumbnail of the
    whole image when ``use_thumbnail`` is set and more than one tile was
    produced.
    """
    width, height = image.size

    # The thumbnail is appended below, so it is excluded from the count here.
    blocks, target_width, target_height = calculate_internvl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))

    tiles = []
    for index in range(blocks):
        # Row-major position of this tile inside the resized image.
        row, col = divmod(index, target_width // image_size)
        left = col * image_size
        upper = row * image_size
        tiles.append(
            resized.crop((left, upper, left + image_size, upper + image_size))
        )

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile one image and stack the transformed tiles into a tensor."""
    ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert video frames into a stacked tensor, one entry per frame.

    Each frame is turned into an RGB PIL image and run through
    ``dynamic_preprocess_internvl``, which must yield exactly one tile.
    """
    ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    collected = list[Image.Image]()
    for frame in video:
        frame_tiles = dynamic_preprocess_internvl(
            Image.fromarray(frame, mode="RGB"),
            target_ratios=ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        # Frames are expected to map to a single tile each.
        assert len(frame_tiles) == 1
        collected.extend(frame_tiles)

    return torch.stack([to_tensor(img) for img in collected])
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config/tokenizer and derive the image/patch settings.

        Any keyword argument left as ``None`` is read from ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Number of image tokens produced per tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Token ID used for image features; supplied by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement for one image; supplied by subclasses."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve (min, max) patch counts, defaulting to instance settings."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return candidate tile grids for the resolved patch bounds."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_internvl_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size."""
        ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        patch_count, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return patch_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its tile tensor."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        per_image = []
        for img in images:
            per_image.append(
                image_to_pixel_values_internvl(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return per_image

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand ``<image>`` placeholders in the text.

        Returns the updated text list and a dict that is empty when no
        images are given, else holds ``pixel_values_flat`` and
        ``image_num_patches``.
        """
        if len(images) == 0:
            return text, {}

        tile_tensors = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        image_inputs = {
            "pixel_values_flat": torch.cat(tile_tensors),
            "image_num_patches": torch.tensor(
                [len(tensor) for tensor in tile_tensors]
            ),
        }

        # Each image consumes the first remaining "<image>" in every prompt.
        for tile_tensor in tile_tensors:
            patch_count = tile_tensor.shape[0]
            replacement = self.get_image_repl(
                patch_count * self.num_image_token, patch_count
            )
            text = [
                prompt.replace("<image>", replacement.full, 1) for prompt in text
            ]

        return text, image_inputs

    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
        """Normalize an input to a list: ``None`` -> ``[]``, scalar -> ``[x]``."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Preprocess images, tokenize the text and merge both outputs.

        Returns a ``BatchFeature`` built from the tokenizer output plus the
        image inputs, with ``tensor_type=return_tensors``.
        """
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        # Image keys are merged last, so they win on any key collision.
        merged = {**text_inputs, **image_inputs}
        return BatchFeature(merged, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        """Initialize the base processor and remember the video token.

        All arguments except ``video_token`` are forwarded to the parent
        constructor as keywords.
        """
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # Extra token used as the per-frame context token for videos.
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT`` (``KeyError`` if absent)."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or ``None`` if unavailable.

        ``None`` is returned both when no video token was configured and
        when the configured token is not in the tokenizer vocabulary.
        """
        token = self.video_token
        if token is None:
            return None
        return self.tokenizer.get_vocab().get(token, None)

    @property
    def supports_video(self) -> bool:
        """Whether a usable video token id exists."""
        token_id = self.video_token_id
        return token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each video into a pixel-value tensor.

        The patch bounds are resolved with both dynamic patch limits
        pinned to 1 and thumbnails disabled.
        """
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        pixel_values_lst = []
        for video in videos:
            pixel_values_lst.append(
                video_to_pixel_values_internvl(
                    video,
                    input_size=self.image_size,
                    min_num=min_num,
                    max_num=max_num,
                    use_thumbnail=False,
                )
            )
        return pixel_values_lst

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ):
        """Build video inputs and expand ``<video>`` placeholders.

        Returns:
            ``(text, video_inputs)``. ``video_inputs`` is empty when there
            are no videos or video is unsupported; in that case ``text``
            is returned untouched.
        """
        if len(videos) == 0 or not self.supports_video:
            return text, {}

        pixel_values_lst_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        patch_counts = [len(item) for item in pixel_values_lst_video]
        video_inputs = {
            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
            "video_num_patches": torch.tensor(patch_counts),
        }

        # For every video, replace the first remaining "<video>" in each
        # prompt with that video's expanded placeholder text.
        for pixel_values in pixel_values_lst_video:
            num_patches = pixel_values.shape[0]
            video_repl = self.get_video_repl(
                self.num_image_token, num_patches, self.video_token
            )
            text = [
                prompt.replace("<video>", video_repl.full, 1) for prompt in text
            ]

        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Preprocess images and videos, tokenize, and bundle the result.

        Images are handled first, then videos, then the rewritten prompts
        are tokenized. On key collisions video entries override image
        entries, which override tokenizer entries.
        """
        prompts = self._make_batch_input(text)
        image_items = self._make_batch_input(images)
        video_items = self._make_batch_input(videos)

        prompts, image_inputs = self._preprocess_image(
            text=prompts,
            images=image_items,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        prompts, video_inputs = self._preprocess_video(
            text=prompts,
            videos=video_items,
            dynamic_image_size=dynamic_image_size,
        )

        merged = dict(self.tokenizer(prompts))
        merged.update(image_inputs)
        merged.update(video_inputs)
        return BatchFeature(merged, tensor_type=return_tensors)

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the image replacement text.

        The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``. ``num_patches`` is not
        used here.
        """
        repl_full = "".join((IMG_START, IMG_CONTEXT * feature_size, IMG_END))
        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None = None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Build the video replacement text, one segment per frame.

        Each frame contributes ``"Frame<k>: "`` followed by ``IMG_START``,
        the context token repeated ``self.num_image_token`` times, and
        ``IMG_END``. ``feature_size`` is accepted but not read; the
        per-frame token count always comes from ``self.num_image_token``.
        """
        frame_block = (
            IMG_START + video_context_token * self.num_image_token + IMG_END
        )

        # num_patches is equal to num_frames
        frame_segments = []
        for i in range(num_patches):
            frame_segments.append(f"Frame{i + 1}: {frame_block}")
        repl_full = "".join(frame_segments)

        return PromptUpdateDetails.select_text(repl_full, video_context_token)
class
BaseInternVLProcessingInfo
(
BaseProcessingInfo
):
"""Basic image-only ProcessingInfo for InternVL-style models."""
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
f3403243
...
...
@@ -8,22 +8,15 @@
# --------------------------------------------------------
import
copy
import
math
import
warnings
from
abc
import
ABC
,
abstractmethod
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
,
TypeVar
from
typing
import
Annotated
,
Literal
,
TypeAlias
,
TypeVar
import
einops
import
numpy
as
np
import
numpy.typing
as
npt
import
regex
as
re
import
torch
import
torch.nn
as
nn
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
,
VideoDummyOptions
...
...
@@ -38,10 +31,6 @@ from vllm.model_executor.models.interfaces import (
SupportsMultiModal
,
SupportsMultiModalPruning
,
)
from
vllm.model_executor.models.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
,
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.nemotron_h
import
NemotronHForCausalLM
from
vllm.model_executor.models.parakeet
import
ParakeetExtractor
,
ProjectedParakeet
...
...
@@ -83,23 +72,28 @@ from vllm.multimodal.processing.processor import (
BaseProcessingInfo
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
_seq2tokens
,
)
from
vllm.renderers
import
TokenizeParams
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
,
cached_tokenizer_from_config
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.processors.nano_nemotron_vl
import
(
AUDIO_CONTEXT
,
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseNanoNemotronVLProcessor
,
DynamicResolutionImageTiler
,
NanoNemotronVLProcessor
,
get_internvl_target_ratios
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.utils
import
_merge_multimodal_embeddings
logger
=
init_logger
(
__name__
)
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
Image
.
MAX_IMAGE_PIXELS
=
None
# Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
MAX_AUDIO_LEN_S
=
10
*
60
# 10 minutes
class
NanoNemotronVLAudioFeatureInputs
(
TensorSchema
):
...
...
@@ -116,20 +110,6 @@ class NanoNemotronVLAudioFeatureInputs(TensorSchema):
audio_feature_lengths
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
)]
MAX_AUDIO_LEN_S
=
10
*
60
# 10 minutes
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<image>"
AUDIO_START
=
"<so_start>"
AUDIO_END
=
"<so_end>"
AUDIO_CONTEXT
=
"<so_embedding>"
# Profiling
# MAX_FRAMES = 16
DEFAULT_NUM_TILES
=
12
class
NanoNemotronVLImagePixelInputs
(
TensorSchema
):
"""
Dimensions:
...
...
@@ -213,987 +193,6 @@ NanoNemotronVLVideoInputs: TypeAlias = (
)
def dynamic_preprocess(
    image,
    *,
    image_size=512,
    max_num_tiles=12,
    use_thumbnail=True,
    idx=0,
):
    """Resize an image to a tile grid and cut it into square tiles.

    Args:
        image: Object exposing ``size``, ``mode`` and ``convert`` (used
            like a PIL image).
        image_size: Side length of each square tile, in pixels.
        max_num_tiles: Upper bound passed to
            ``get_internvl_target_ratios``.
        use_thumbnail: When true and more than one tile was produced, a
            whole-image thumbnail of ``image_size`` x ``image_size`` is
            appended as the last tile.
        idx: Accepted but not used by this function.

    Returns:
        A list of tensors, one per tile, with values divided by 255.
    """
    src_width, src_height = image.size

    # Only the target resolution is needed; the block count is ignored.
    _, dst_width, dst_height = calculate_internvl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=get_internvl_target_ratios(1, max_num_tiles),
        image_size=image_size,
        use_thumbnail=False,
    )

    rgb = image if image.mode == "RGB" else image.convert("RGB")
    # (H, W, 3) -> (1, H, W, 3) -> (1, 3, H, W)
    pixels = torch.from_numpy(np.asarray(rgb, dtype=np.uint8))
    pixels = pixels.unsqueeze(0).permute(0, 3, 1, 2)

    def _resize(height, width):
        return torch.nn.functional.interpolate(
            pixels,
            size=(height, width),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    resized = _resize(dst_height, dst_width)
    batch, channels, height, width = resized.shape
    rows = height // image_size
    cols = width // image_size

    # Split H and W into (rows, image_size) / (cols, image_size) and move
    # the grid axes to the front so each tile becomes one batch entry.
    tiles = resized.reshape(batch, channels, rows, image_size, cols, image_size)
    tiles = tiles.permute(0, 2, 4, 1, 3, 5)
    tiles = tiles.reshape(batch * rows * cols, channels, image_size, image_size)
    tiles = tiles / 255.0

    if use_thumbnail and tiles.shape[0] > 1:
        thumbnail = _resize(image_size, image_size) / 255.0
        tiles = torch.cat([tiles, thumbnail], dim=0)

    return list(tiles)
def image_to_pixel_values(
    image: Image.Image,
    *,
    input_size: int,
    max_num: int,
    use_thumbnail: bool,
    idx: int,
) -> torch.Tensor:
    """Tile ``image`` with ``dynamic_preprocess`` and stack the tiles.

    ``input_size`` is used as the tile size and ``max_num`` as the maximum
    number of tiles; ``use_thumbnail`` and ``idx`` are forwarded as-is.
    """
    tiles = dynamic_preprocess(
        image,
        image_size=input_size,
        max_num_tiles=max_num,
        use_thumbnail=use_thumbnail,
        idx=idx,
    )
    return torch.stack(tiles)
def video_to_pixel_values(
    video: npt.NDArray,
    *,
    input_size: int,
    max_num_tiles: int = 1,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a channels-last frame array into a scaled frame tensor.

    Args:
        video: Array indexed as (num_frames, H, W, C).
        input_size: Required height and width of every output frame.
        max_num_tiles: Must be 1; any other value fails the assertion.
        use_thumbnail: Accepted but not used by this function.

    Returns:
        Tensor indexed as (num_frames, C, H, W), resized to
        ``input_size`` x ``input_size`` when needed and divided by 255.
    """
    assert max_num_tiles == 1, "Video modality always uses one tile"

    # (num_frames, H, W, C) -> (num_frames, C, H, W)
    frames = torch.from_numpy(video).permute(0, 3, 1, 2)

    already_sized = (
        frames.shape[2] == input_size and frames.shape[3] == input_size
    )
    if not already_sized:
        frames = torch.nn.functional.interpolate(
            frames,
            size=(input_size, input_size),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    return frames / 255.0
def input_conditioner(x, norm_mean, norm_std):
    """Return ``x`` shifted by ``norm_mean`` and divided by ``norm_std``."""
    centered = x - norm_mean
    return centered / norm_std
def
calculate_timestamps
(
indices
:
list
[
int
]
|
torch
.
Tensor
,
frame_duration_ms
:
int
,
):
if
not
isinstance
(
indices
,
list
):
indices
=
indices
.
tolist
()
timestamps
=
[
int
(
i
)
*
frame_duration_ms
/
1000.0
for
i
in
indices
]
return
timestamps
class DynamicResolutionImageTiler:
    """Pick a per-image patch grid under a token budget and resize to it.

    Each image is resized to a single tile whose size is a whole number
    of patches. The grid is chosen so that the patch count fits the
    available token budget, and is rounded so the later 2x reduction
    divides it evenly.
    """

    CONV_MERGING = False
    PIXEL_SHUFFLE = True
    USE_THUMBNAIL = False

    def __init__(
        self,
        *,
        max_model_len: int,
        patch_size: int,
        min_num_patches: int,
        max_num_patches: int,
        downsample_ratio: float,
        norm_mean: Sequence[float],
        norm_std: Sequence[float],
        factor_max: float = 1.0,
        use_thumbnail: bool = False,
    ) -> None:
        """Store the tiling configuration.

        Args:
            max_model_len: Total token budget of the model context.
            patch_size: Side length of one patch, in pixels.
            min_num_patches: Lower bound on the per-image patch budget.
            max_num_patches: Upper bound on the per-image patch budget;
                a value ``<= 0`` means unbounded.
            downsample_ratio: Must be ``0.5`` (asserted below).
            norm_mean: Three per-channel means.
            norm_std: Three per-channel standard deviations.
            factor_max: Cap on the scale factor applied to the image's
                own patch grid.
            use_thumbnail: Must be ``False``; thumbnails are unsupported.
        """
        assert use_thumbnail is False, "use_thumbnail is not supported"
        self._patch_size: int = patch_size
        self._max_model_len = max_model_len
        self._min_num_patches = min_num_patches
        self._max_num_patches = (
            max_num_patches if max_num_patches > 0 else float("inf")
        )
        self._factor_max = factor_max
        self.norm_mean = torch.tensor(norm_mean).reshape(3, 1, 1)
        self.norm_std = torch.tensor(norm_std).reshape(3, 1, 1)

        assert downsample_ratio < 1
        reduction_factor = 1 / downsample_ratio
        assert reduction_factor == 2.0
        # One factor-of-2 reduction per enabled merging stage.
        self._downsample_ratio = int(reduction_factor) ** (
            self.PIXEL_SHUFFLE + self.CONV_MERGING
        )
        assert self._downsample_ratio == 2

    def _get_num_embeddings(self, width: int, height: int) -> int:
        """Number of tokens produced by a ``width`` x ``height`` image."""
        num_patches = (width // self._patch_size) * (height // self._patch_size)
        num_tokens = num_patches // (self._downsample_ratio**2)
        return num_tokens

    def width_and_height_for_max_num_tokens_available(
        self,
        target_num_tokens_post_shuffle: int,
    ) -> tuple[int, int]:
        """
        TODO: optimize this so it squeezes closer to target number of tokens.
        Calculate image dimensions that produce approximately `target` tokens after
        pixel_shuffle.

        With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
        need 4*B patches to get B tokens.

        Examples:
            >>> PATCH_SIZE = 16
            >>> DOWNSAMPLE_RATIO = 0.5
            >>> tiler = DynamicResolutionImageTiler(
            ...     max_model_len=16384,
            ...     patch_size=PATCH_SIZE,
            ...     downsample_ratio=DOWNSAMPLE_RATIO,
            ...     min_num_patches=4,
            ...     max_num_patches=0,
            ...     norm_mean=[0.5, 0.5, 0.5],
            ...     norm_std=[0.5, 0.5, 0.5],
            ... )
            >>> width, height = tiler.width_and_height_for_max_num_tokens_available(
            ...     target_num_tokens_post_shuffle=8192,
            ... )
            >>> assert (width, height) == (2880, 2880)
            >>> assert (width // PATCH_SIZE) * (
            ...     height // PATCH_SIZE
            ... ) // 2**2 == 8100  # tokens post-shuffle
            >>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
        """
        side_pixels = (
            math.isqrt(target_num_tokens_post_shuffle)
            * self._downsample_ratio
            * self._patch_size
        )
        assert isinstance(side_pixels, int) and side_pixels % self._patch_size == 0
        return side_pixels, side_pixels

    def max_num_tokens_available(self, text_prompt_length: int) -> int:
        """Tokens left for images after the text prompt and 4 reserved slots."""
        return self._max_model_len - text_prompt_length - 4

    def _images_to_pixel_values_lst(
        self,
        text_prompt_length: int,
        images: list[Image.Image],
    ) -> tuple[list[torch.Tensor], list[int]]:
        """Resize all images under the shared budget.

        Returns:
            ``(tensors, feature_sizes)``: the resized 3-dim image tensors
            and, aligned with them, the embedding count of each.
        """
        num_tokens_available = self.max_num_tokens_available(text_prompt_length)
        params_per_image = self.compute_params(images, num_tokens_available)

        feature_sizes = []
        resized_images = []
        for param in params_per_image:
            for t in self.apply_params(param):
                assert t.ndim == 3, f"{t.ndim=}: expected 3 dim tensor"
                resized_images.append(t)
                feature_sizes.append(param.num_embeddings)
        return resized_images, feature_sizes

    # Keyed by ``id(image)`` (not by the image itself); shared by all
    # instances because it lives on the class.
    feature_size_cache: dict[int, int] = {}

    @classmethod
    def get_cached_feature_size(cls, image: Image.Image) -> int:
        """Return and remove the cached feature size for ``image``.

        Raises:
            KeyError: If nothing is cached for this image object, which
                also enforces that each entry is consumed only once.
        """
        # hard assert that we only use the feature size once
        return cls.feature_size_cache.pop(id(image))

    @dataclass
    class DynamicResolutionParams:
        # Source image.
        media: Image.Image
        # Number of tiles (always 1 as produced by ``process_media``).
        num_tiles: int
        # Token count after the 2x reduction.
        num_embeddings: int
        # Target grid as (patches along width, patches along height).
        patch_size: tuple[int, int]

    def apply_params(self, params: DynamicResolutionParams) -> list[torch.Tensor]:
        """Resize ``params.media`` to its target grid, scaled by 1/255.

        Returns:
            A one-element list holding the resized image tensor.
        """
        # interpolate() expects (height, width); patch_size is (w, h).
        target_size = (
            params.patch_size[1] * self._patch_size,
            params.patch_size[0] * self._patch_size,
        )
        media = params.media
        image = np.asarray(
            media.convert("RGB") if media.mode != "RGB" else media,
            dtype=np.uint8,
        )
        resized_img = (
            torch.nn.functional.interpolate(
                torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
                size=target_size,
                mode="bicubic",
                align_corners=False,
                antialias=True,
            )
            / 255.0
        )
        return list(resized_img)

    def _target_patch_grid(
        self,
        orig_width: int,
        orig_height: int,
        num_tokens_available: int,
    ) -> tuple[int, int]:
        """Choose the (width, height) patch grid for one image.

        The image's own grid is scaled to fit ``num_tokens_available``
        patches (capped by ``factor_max``), raised towards
        ``min_num_patches`` when the budget allows, and rounded so both
        sides are divisible by the required reduction divisor.
        """
        closest_patch_height = round(orig_height / self._patch_size + 0.5)
        closest_patch_width = round(orig_width / self._patch_size + 0.5)
        patches = closest_patch_height * closest_patch_width

        factor = min(
            math.sqrt(num_tokens_available / patches), self._factor_max
        )
        # Very elongated images on a small budget can floor one side to
        # 0, which would divide by zero below or request a zero-sized
        # resize later; keep at least one patch per side.
        target_patch_height = max(1, math.floor(factor * closest_patch_height))
        target_patch_width = max(1, math.floor(factor * closest_patch_width))

        # Consider self._min_num_patches if > current_num_tokens_available.
        if (
            num_tokens_available > self._min_num_patches
            and target_patch_height * target_patch_width < self._min_num_patches
        ):
            up_factor = math.sqrt(
                self._min_num_patches / (target_patch_height * target_patch_width)
            )
            target_patch_height = math.ceil(up_factor * target_patch_height)
            target_patch_width = math.ceil(up_factor * target_patch_width)

        # Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
        # or by 4 when BOTH are enabled (two successive 2x reductions)
        if self.PIXEL_SHUFFLE or self.CONV_MERGING:
            required_divisor = (
                4 if (self.PIXEL_SHUFFLE and self.CONV_MERGING) else 2
            )

            rem_h = target_patch_height % required_divisor
            if rem_h != 0:
                inc_h = required_divisor - rem_h
                # Round up if it still fits the budget, otherwise down.
                if (
                    target_patch_height + inc_h
                ) * target_patch_width <= num_tokens_available:
                    target_patch_height += inc_h
                else:
                    target_patch_height = max(
                        required_divisor, target_patch_height - rem_h
                    )

            rem_w = target_patch_width % required_divisor
            if rem_w != 0:
                inc_w = required_divisor - rem_w
                if (
                    target_patch_height * (target_patch_width + inc_w)
                    <= num_tokens_available
                ):
                    target_patch_width += inc_w
                else:
                    target_patch_width = max(
                        required_divisor, target_patch_width - rem_w
                    )

        return target_patch_width, target_patch_height

    def process_media(
        self,
        media: Image.Image,
        num_tokens_available: int,
    ) -> tuple[DynamicResolutionParams, int]:
        """Process a single media item and return its parameters.

        Args:
            media: The media item to process
            num_tokens_available: Number of tokens available for this media

        Returns:
            ``(params, token_count)`` where ``params`` is the
            DynamicResolutionParams for the media and ``token_count`` is
            the number of patches in the chosen grid.
        """
        assert isinstance(media, Image.Image), (
            "Dynamic resolution is only supported for image media"
        )
        target_patch_width, target_patch_height = self._target_patch_grid(
            media.width, media.height, num_tokens_available
        )

        # Calculate embeddings for the main dynamic resolution image
        num_embeddings = self._get_num_embeddings(
            target_patch_width * self._patch_size,
            target_patch_height * self._patch_size,
        )
        token_count = target_patch_width * target_patch_height

        num_tiles = 1  # Base dynamic resolution image
        return self.DynamicResolutionParams(
            media=media,
            num_tiles=num_tiles,
            num_embeddings=num_embeddings,
            patch_size=(target_patch_width, target_patch_height),
        ), token_count

    def compute_params(
        self,
        media_list: list[Image.Image],
        num_tokens_available: int | None = None,
    ) -> list[DynamicResolutionParams]:
        """Compute parameters for all media with iterative token budgeting.

        Args:
            media_list: List of media items to process
            num_tokens_available: Total number of tokens available across all media

        Returns:
            List of DynamicResolutionParams, one per media item

        Raises:
            ValueError: If no allocation within budget is found after the
                bounded number of attempts.
        """
        # Convert the post-reduction token budget into a patch budget.
        num_tokens_available = (
            num_tokens_available
            * (4 if self.PIXEL_SHUFFLE else 1)
            * (4 if self.CONV_MERGING else 1)
        )
        # When the number of available token is too small,
        # allow self._min_num_patches per media and let the sample be truncated.
        num_tokens_available = max(
            num_tokens_available, self._min_num_patches * len(media_list)
        )

        # Clip the number of tokens available per media to >min and <max patches.
        per_media_budget = max(
            min(num_tokens_available, self._max_num_patches),
            self._min_num_patches,
        )
        num_tokens_available_per_media = [per_media_budget] * len(media_list)

        # prevent infinite loop in any case
        for _ in range(10):
            # Step 1: Process each media with current token budget
            params = []
            token_counts = []
            for media, tokens_for_media in zip(
                media_list, num_tokens_available_per_media
            ):
                param, token_count = self.process_media(media, tokens_for_media)
                params.append(param)
                token_counts.append(token_count)
                self.feature_size_cache[id(param.media)] = param.num_embeddings

            # Step 2: Check if total tokens is within budget
            total_tokens = sum(token_counts)
            if total_tokens <= num_tokens_available:
                # We're within budget, return the params
                return params

            # Step 3: We're over budget, need to scale down
            # Each media gets a proportional share of the total budget
            scaling_factor = num_tokens_available / total_tokens
            scaled_budgets = [
                max(self._min_num_patches, int(token_count * scaling_factor))
                for token_count in token_counts
            ]
            scaled_down = any(
                scaled < current
                for scaled, current in zip(
                    scaled_budgets, num_tokens_available_per_media
                )
            )
            # If there wasn't scaling down, we're stuck with min_num_patches
            # per media, else try with the scaled down budgets.
            if not scaled_down:
                num_tokens_available_per_media = [self._min_num_patches] * len(
                    media_list
                )
            else:
                num_tokens_available_per_media = scaled_budgets

        ctx = f"{params=} {total_tokens=} {num_tokens_available=}"
        raise ValueError(
            f"Should be unreachable - `return params` above must be reached: {ctx}"
        )

    @staticmethod
    def stack(images: list[torch.Tensor], patch_size: int) -> torch.Tensor:
        """Flatten images into patch rows and concatenate them.

        Each image is rearranged from ``c (py yy) (px xx)`` to
        ``(py px) (c yy xx)``; all images are concatenated along the
        first axis and a leading axis of size 1 is added.
        """
        assert len(images) > 0, "No images to stack"

        def rearrange_img(x):
            py = x.shape[-2] // patch_size
            px = x.shape[-1] // patch_size
            return einops.rearrange(
                x,
                "c (py yy) (px xx) -> (py px) (c yy xx)",
                py=py,
                yy=patch_size,
                px=px,
                xx=patch_size,
            )

        imgs = [rearrange_img(img) for img in images]
        pixel_values_flat = torch.cat(imgs, dim=0).unsqueeze(0)
        return pixel_values_flat
class BaseNanoNemotronVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *args,
        max_model_len: int,
        max_num_tiles: int | None = None,
        **kwargs,
    ) -> None:
        """Read the image geometry and normalization from ``config``.

        Extra positional and keyword arguments are accepted and ignored.
        A ``DynamicResolutionImageTiler`` is created only when the vision
        config declares ``min_num_patches``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        # A falsy value (None or 0) falls back to the default tile count.
        self.max_num_tiles = max_num_tiles or DEFAULT_NUM_TILES
        image_size: int = config.force_image_size
        patch_size: int = config.patch_size
        downsample_ratio: int = config.downsample_ratio

        # Tokens produced by a single tile.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (downsample_ratio**2)
        )
        self.image_size = image_size
        self.use_thumbnail: bool = config.use_thumbnail
        self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)

        self.dynamic_tiler: DynamicResolutionImageTiler | None = None
        if self.use_dynamic_resolution(config):
            self.dynamic_tiler = DynamicResolutionImageTiler(
                max_model_len=max_model_len,
                patch_size=patch_size,
                downsample_ratio=downsample_ratio,
                min_num_patches=config.vision_config.args["min_num_patches"],
                max_num_patches=config.vision_config.args["max_num_patches"],
                norm_mean=config.norm_mean,
                norm_std=config.norm_std,
            )

    @staticmethod
    def use_dynamic_resolution(config: PretrainedConfig) -> bool:
        """Whether ``config.vision_config.args`` has ``min_num_patches``."""
        return "min_num_patches" in config.vision_config.args

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        raise NotImplementedError

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        max_num_tiles: int,
    ) -> int:
        """Token count for an image of the given size in the tiled path."""
        target_ratios = get_internvl_target_ratios(1, max_num_tiles)

        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            target_ratios=target_ratios,
            image_size=self.image_size,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        max_num_tiles: int,
    ) -> list[torch.Tensor]:
        """Tile every image; each image's list position is passed as ``idx``."""
        return [
            image_to_pixel_values(
                image,
                input_size=self.image_size,
                max_num=max_num_tiles,
                use_thumbnail=self.use_thumbnail,
                idx=idx,
            )
            for idx, image in enumerate(images)
        ]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        max_num_tiles: int,
    ) -> tuple[list[str], dict[str, Any]]:
        """Build image inputs and expand ``<image>`` placeholders.

        Args:
            text: Prompts; exactly one is required when images are given.
            images: Images, in placeholder order.
            max_num_tiles: Tile limit for the non-dynamic path.

        Returns:
            ``(text, image_inputs)``. With no images, ``text`` is returned
            unchanged and ``image_inputs`` is empty.
        """
        if len(images) == 0:
            image_inputs = {}
            return text, image_inputs

        if tiler := self.dynamic_tiler:
            # The tiler's budget is what is left after the text tokens.
            sans_images = text[0].replace("<image>", "")
            text_prompt_length = len(
                self.tokenizer(sans_images, add_special_tokens=False).input_ids
            )
            pixel_values_lst, num_tokens_per_image = (
                tiler._images_to_pixel_values_lst(
                    text_prompt_length=text_prompt_length,
                    images=images,
                )
            )
            imgs_sizes = [(pv.shape[-2], pv.shape[-1]) for pv in pixel_values_lst]
            normalized = [
                input_conditioner(img, tiler.norm_mean, tiler.norm_std)
                for img in pixel_values_lst
            ]
            image_num_patches = torch.tensor([1] * len(num_tokens_per_image))
            image_inputs = {
                "pixel_values_flat": normalized,
                "imgs_sizes": imgs_sizes,
                "num_tokens_per_image": num_tokens_per_image,
            }
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images, max_num_tiles
            )
            image_num_patches = torch.tensor(
                [len(item) for item in pixel_values_lst]
            )
            pixel_values_flat = input_conditioner(
                torch.cat(pixel_values_lst), self.norm_mean, self.norm_std
            )
            image_inputs = {
                "pixel_values_flat": pixel_values_flat,
                "image_num_patches": image_num_patches,
            }
            num_tokens_per_image = [
                self.num_image_token * len(item) for item in pixel_values_lst
            ]

        assert len(text) == 1, (
            "hf_processor is called on the output of get_dummy_text, "
            "which should be a single string"
        )
        # The capturing group keeps each "<image>" as its own element,
        # interleaved with the surrounding text pieces.
        parts = [x for x in re.split(r"(<image>)", text[0]) if x]
        assert parts.count("<image>") == len(pixel_values_lst), (
            "the number of <image> tokens in the text should be the "
            "same as the number of images"
        )

        image_repls = [
            self.get_image_repl(feature_size, num_patches).full
            for feature_size, num_patches in zip(
                num_tokens_per_image, image_num_patches, strict=True
            )
        ]
        # Replace the k-th placeholder with the k-th image's text. The
        # placeholder's position in ``parts`` is not the image index once
        # the prompt contains text before or between placeholders.
        image_index = 0
        for part_index, part in enumerate(parts):
            if part == "<image>":
                parts[part_index] = image_repls[image_index]
                image_index += 1
        text = ["".join(parts)]

        return text, image_inputs

    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
        """Normalize an input to a list: ``None`` -> ``[]``, scalar -> ``[x]``."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item

    @abstractmethod
    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        max_num_tiles: int | None = None,
    ) -> BatchFeature:
        raise NotImplementedError
class
NanoNemotronVLProcessor
(
BaseNanoNemotronVLProcessor
):
"""
HF Processor with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    *,
    max_model_len: int,
    max_num_tiles: int | None = None,
    video_token: str | None = None,
    video_pruning_rate: float | None = None,
) -> None:
    """Set up video and audio support on top of the base processor.

    An audio extractor is created only when ``config`` has a non-``None``
    ``sound_config`` attribute.
    """
    super().__init__(
        config=config,
        tokenizer=tokenizer,
        max_model_len=max_model_len,
        max_num_tiles=max_num_tiles,
    )

    # Extra token and pruning rate used by the video path.
    self.video_token = video_token
    self.video_pruning_rate = video_pruning_rate

    sound_config = getattr(config, "sound_config", None)
    self.audio_extractor: ParakeetExtractor | None = (
        None if sound_config is None else ParakeetExtractor(sound_config)
    )

    def _encode(token):
        return tokenizer.encode(token, add_special_tokens=False)

    # Tokenize the image marker tokens once so the video path does not
    # have to re-tokenize them on every call.
    self._img_start_token_ids = _encode(IMG_START)
    self._img_end_token_ids = _encode(IMG_END)
    self._img_context_token_ids = _encode(IMG_CONTEXT)
@
property
def
supports_video
(
self
)
->
bool
:
return
self
.
video_token_id
is
not
None
@
property
def
video_token_id
(
self
)
->
int
|
None
:
if
self
.
video_token
is
None
:
return
None
return
self
.
tokenizer
.
get_vocab
().
get
(
self
.
video_token
,
None
)
@property
def image_token_id(self) -> int:
    """Id the tokenizer assigns to ``IMG_CONTEXT``."""
    token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
    return token_id
def _videos_to_pixel_values_lst(
    self,
    videos: list[npt.NDArray],
    max_num_tiles: int,
) -> list[torch.Tensor]:
    """Convert every video with ``video_to_pixel_values``.

    The processor's ``image_size`` is used as the frame size and its
    ``use_thumbnail`` flag is forwarded.
    """
    pixel_values_lst = []
    for video in videos:
        pixel_values_lst.append(
            video_to_pixel_values(
                video,
                input_size=self.image_size,
                max_num_tiles=max_num_tiles,
                use_thumbnail=self.use_thumbnail,
            )
        )
    return pixel_values_lst
def _preprocess_video(
    self,
    text: list[str],
    videos: list[tuple[npt.NDArray, dict[str, Any]]],
    max_num_tiles: int,
):
    """Build video inputs and expand ``<video>`` placeholders.

    Args:
        text: Prompts to rewrite.
        videos: ``(frames, metadata)`` pairs; ``metadata`` must provide
            ``"fps"`` and ``"frames_indices"``.
        max_num_tiles: Forwarded to ``_videos_to_pixel_values_lst``.

    Returns:
        ``(text, video_inputs)``. ``video_inputs`` is empty, and ``text``
        untouched, when there are no videos or video is unsupported.
    """
    if len(videos) == 0 or not self.supports_video:
        return text, {}

    videos_lst = [item[0] for item in videos]
    video_metadata_lst = [item[1] for item in videos]
    pixel_values_lst_video = self._videos_to_pixel_values_lst(
        videos_lst,
        max_num_tiles=max_num_tiles,
    )

    # We use frame duration in milliseconds (as integer) to ensure
    # we have consistent timestamps calculation. At preprocessing
    # fps parameter is given in fp32, while at inference it is bf16
    # which leads to inaccurate timestamp calculation and causes
    # timestamp values to differ. In rare cases this causes
    # mismatching number of output tokens for tokenized frame prefixes
    frame_duration_ms_lst = [
        int(1000.0 / metadata["fps"]) for metadata in video_metadata_lst
    ]
    frames_indices_lst = [
        metadata["frames_indices"] for metadata in video_metadata_lst
    ]

    video_num_patches = torch.tensor(
        [len(item) for item in pixel_values_lst_video]
    )
    flat_pixels = input_conditioner(
        torch.cat(pixel_values_lst_video), self.norm_mean, self.norm_std
    )
    video_inputs = {
        "pixel_values_flat_video": flat_pixels,
        "video_num_patches": video_num_patches,
        "frames_indices": frames_indices_lst,
        "frame_duration_ms": torch.tensor(frame_duration_ms_lst),
    }

    image_size: int = self.config.force_image_size
    patch_size: int = self.config.patch_size
    downsample_ratio = self.config.downsample_ratio
    tokens_in_single_frame = int(
        (image_size * image_size // patch_size**2) * (downsample_ratio**2)
    )

    pruning_rate = self.video_pruning_rate
    use_evs = pruning_rate is not None and pruning_rate > 0.0

    for pixel_values, frames_indices, frame_duration_ms in zip(
        pixel_values_lst_video,
        frames_indices_lst,
        frame_duration_ms_lst,
    ):
        num_frames = pixel_values.shape[0]

        if use_evs:
            # Start of EVS-specific code
            num_tokens = compute_retained_tokens_count(
                tokens_per_frame=tokens_in_single_frame,
                num_frames=num_frames,
                q=self.video_pruning_rate,
            )
            # Here we just need placeholders that won't actually be
            # replaced - we just need to make sure the total number of
            # tokens is correct: assign all tokens to the first frame
            tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
            # End of EVS-specific code
        else:
            tokens_per_frame = [tokens_in_single_frame] * num_frames

        video_repl = self.get_video_repl(
            tokens_per_frame=tokens_per_frame,
            frames_indices=frames_indices,
            frame_duration_ms=frame_duration_ms,
            tokenizer=self.tokenizer,
            img_start_token_ids=self._img_start_token_ids,
            img_end_token_ids=self._img_end_token_ids,
            img_context_token_ids=self._img_context_token_ids,
        )

        # video_repl.full is a list of token IDs
        # Convert token IDs back to text for the HF processor flow
        video_repl_text = self.tokenizer.decode(
            video_repl.full, skip_special_tokens=False
        )
        # Only the first remaining "<video>" of each prompt is replaced.
        text = [
            prompt.replace("<video>", video_repl_text, 1) for prompt in text
        ]

    return text, video_inputs
def _preprocess_audio(
    self,
    text: list[str],
    audios: list[npt.NDArray],
):
    """Expand audio placeholders in ``text[0]`` and extract features.

    Returns:
        ``(text, audio_inputs)``. With no audios, ``text`` is returned
        unchanged and ``audio_inputs`` is empty.

    Raises:
        ValueError: If the number of ``AUDIO_CONTEXT`` placeholders in
            ``text[0]`` differs from the number of audios.
    """
    if len(audios) == 0:
        return text, {}

    assert self.audio_extractor is not None
    extractor = self.audio_extractor

    # The capturing group keeps each placeholder as its own element.
    pattern = f"({re.escape(AUDIO_CONTEXT)})"
    parts = [piece for piece in re.split(pattern, text[0]) if piece]

    token_count = parts.count(AUDIO_CONTEXT)
    if token_count != len(audios):
        raise ValueError(
            "Number of audio tokens in text does not match the number "
            f"of audios (tokens={token_count}, audios={len(audios)})."
        )

    # Replace the k-th placeholder with the k-th audio's expansion.
    remaining_audios = iter(audios)
    for idx, part in enumerate(parts):
        if part != AUDIO_CONTEXT:
            continue
        parts[idx] = self.get_audio_repl(next(remaining_audios)).full
    text = ["".join(parts)]

    extracted = extractor(
        audios,
        sampling_rate=extractor.sampling_rate,
        return_tensors="pt",
    )
    feature_attention_mask = extracted.attention_mask
    audio_inputs = {
        "input_audio_features": extracted.input_features,
        "feature_attention_mask": feature_attention_mask,
        "audio_feature_lengths": feature_attention_mask.sum(dim=1),
    }
    return text, audio_inputs
def __call__(
    self,
    text: str | list[str] | None = None,
    images: Image.Image | list[Image.Image] | None = None,
    videos: list[tuple[npt.NDArray, dict[str, Any]]] | None = None,
    audios: AudioItem | list[AudioItem] | None = None,
    return_tensors: str | TensorType | None = None,
    max_num_tiles: int | None = None,
) -> BatchFeature:
    """Preprocess all modalities, tokenize, and bundle the result.

    Images, videos and audios are processed in that order, each step
    rewriting the prompts. Videos always use a single tile. When a
    dynamic tiler is configured, image inputs are attached after the
    ``BatchFeature`` is built so they skip its tensor conversion.
    """
    # Use default if not provided
    tiles = self.max_num_tiles if max_num_tiles is None else max_num_tiles

    text, images, videos, audios = map(
        self._make_batch_input, (text, images, videos, audios)
    )

    text, image_inputs = self._preprocess_image(
        text=text,
        images=images,
        max_num_tiles=tiles,
    )
    text, video_inputs = self._preprocess_video(
        text=text,
        videos=videos,
        max_num_tiles=1,
    )
    text, audio_inputs = self._preprocess_audio(
        text=text,
        audios=audios,
    )

    text_inputs = self.tokenizer(text, add_special_tokens=False)
    combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}

    if self.dynamic_tiler is None:
        return BatchFeature(
            {**combined_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
    # allow images to be exempt from the BatchFeature validation:
    # We will .stack() them in _parse_and_validate_image_input
    batch.update(image_inputs)
    return batch
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the image replacement text.

    The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
    ``feature_size`` times, then ``IMG_END``. ``num_patches`` is not
    used here.
    """
    repl_full = "".join((IMG_START, IMG_CONTEXT * feature_size, IMG_END))
    return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def get_audio_repl(
    self,
    audio: npt.NDArray,
) -> PromptUpdateDetails[str]:
    """Build the audio replacement text for one clip.

    The number of ``AUDIO_CONTEXT`` tokens comes from the extractor's
    ``audio_token_count`` applied to ``len(audio)``; the tokens are
    wrapped in ``AUDIO_START`` / ``AUDIO_END``.
    """
    assert self.audio_extractor is not None
    num_tokens = self.audio_extractor.audio_token_count(len(audio))
    context_tokens = AUDIO_CONTEXT * num_tokens
    repl_full = f"{AUDIO_START}{context_tokens}{AUDIO_END}"
    return PromptUpdateDetails.select_text(repl_full, AUDIO_CONTEXT)
@classmethod
def get_video_repl(
    cls,
    *,
    tokens_per_frame: list[int],
    frames_indices: list[int],
    frame_duration_ms: int,
    tokenizer: TokenizerLike,
    img_start_token_ids: list[int],
    img_end_token_ids: list[int],
    img_context_token_ids: list[int],
) -> PromptUpdateDetails[list[int]]:
    """
    Build prompt replacement for a video.

    The replacement returned is not actually used to replace the placeholder
    tokens - it is only used to make sure the correct number of tokens is
    allocated. Actual replacement is done in embed_multimodal of
    NemotronH_Nano_VL_V2 (specifically in
    _process_video_input -> _create_final_video_embeddings), where the final
    embeddings are built from text embeddings for indicator tokens and video
    embeddings for video tokens.

    One function covers every case - non EVS, EVS dummy and EVS real - and
    the cases differ only in ``tokens_per_frame``:
    - non EVS: the same value for every frame
    - EVS dummy: the distribution between frames does not matter, only the
      total number of tokens has to be correct
    - EVS real (called from get_real_video_repl_for_evs): a different value
      per frame

    For each frame the output contains, in order: the tokenized frame
    separator text, ``img_start_token_ids``, ``img_context_token_ids``
    repeated ``tokens_per_frame[i]`` times, and ``img_end_token_ids``.

    Args:
        tokens_per_frame (list[int]): number of tokens per frame
        frames_indices (list[int]): frame indices
        frame_duration_ms (int): duration of each frame in milliseconds;
            when it is None the separators carry no timestamp
        tokenizer (TokenizerLike): tokenizer used for the frame separators
        img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
        img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
        img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
    """
    # TODO: Add support of frame_duration_ms to be None
    # At preprocessing step we should allow absent / metadata without
    # frames_indices field.
    if frame_duration_ms is None:
        frame_separators = [
            f"Frame {frame_no}: "
            for frame_no, _ in enumerate(tokens_per_frame, start=1)
        ]
    else:
        timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
        assert len(timestamps) == len(tokens_per_frame), (
            "timestamps and tokens_per_frame must have the same length"
        )
        frame_separators = [
            f"Frame {frame_no} sampled at {stamp:.2f} seconds: "
            for frame_no, stamp in enumerate(timestamps, start=1)
        ]

    # Every separator is tokenized on its own, and the special tokens arrive
    # pre-tokenized, so the tokenizer can never merge tokens across the
    # component boundaries. That keeps the tokenization consistent whatever
    # the per-frame token counts are.
    separator_ids = [_seq2tokens(tokenizer, text) for text in frame_separators]

    all_token_ids = []
    for frame_idx, num_tokens in enumerate(tokens_per_frame):
        all_token_ids.extend(separator_ids[frame_idx])
        all_token_ids.extend(img_start_token_ids)
        all_token_ids.extend(img_context_token_ids * num_tokens)
        all_token_ids.extend(img_end_token_ids)

    return PromptUpdateDetails.from_seq(all_token_ids)
class
BaseNanoNemotronVLProcessingInfo
(
BaseProcessingInfo
):
"""Basic image-only ProcessingInfo for InternVL-style models."""
...
...
vllm/model_executor/models/nemotron_parse.py
View file @
f3403243
...
...
@@ -11,18 +11,13 @@ import math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Literal
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
einops
import
rearrange
from
PIL
import
Image
from
timm.data.constants
import
OPENAI_CLIP_MEAN
,
OPENAI_CLIP_STD
from
torchvision
import
transforms
as
T
from
transformers
import
(
BartConfig
,
BatchFeature
,
PretrainedConfig
,
TensorType
,
)
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
PromptUpdate
,
)
from
vllm.renderers
import
TokenizeParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.processors.nemotron_parse
import
NemotronParseProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.v1.attention.backend
import
AttentionType
# Module-level logger.
logger = init_logger(__name__)

# Default target image size. NemotronParseImageProcessor reads index 0 as the
# target height and index 1 as the target width.
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
class
BartScaledWordEmbedding
(
VocabParallelEmbedding
):
...
...
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b"
,
3
,
"h"
,
"w"
)]
class NemotronParseImageProcessor:
    """
    NemotronParse Image Processor

    Each image is resized (keeping its aspect ratio) so that it fits inside
    the target size, padded up to the target size, converted to a tensor,
    expanded to three channels when it has a single channel, and finally
    normalized with ``OPENAI_CLIP_MEAN`` / ``OPENAI_CLIP_STD``.
    """

    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        """Store the target size and build the transforms.

        Args:
            final_size: Target ``(height, width)``. A list/tuple with at
                least two items is used as given, a single number is used
                for both sides, and anything else falls back to
                ``DEFAULT_FINAL_IMAGE_SIZE``.
            **kwargs: Accepted and ignored.

        Raises:
            ImportError: If ``albumentations`` is not installed.
        """
        # Ensure final_size is properly formatted
        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            self.final_size = (int(final_size), int(final_size))
        else:
            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback

        # Shaped (1, 3, 1, 1) so they broadcast over a stacked image batch.
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        # Create transforms
        self._create_transforms()

    def _create_transforms(self):
        """Create transform objects."""
        try:
            import albumentations as A
        except ImportError as err:
            raise ImportError(
                "The package `albumentations` is required to use "
                "NemotronParse model. Please install it with `pip install "
                "albumentations`."
            ) from err

        # Ensure final_size is a tuple of integers
        if isinstance(self.final_size, (list, tuple)):
            self.target_height, self.target_width = (
                int(self.final_size[0]),
                int(self.final_size[1]),
            )
        else:
            self.target_height = self.target_width = int(self.final_size)

        import cv2

        # Pads images that are smaller than the target with white pixels.
        self.transform = A.Compose(
            [
                A.PadIfNeeded(
                    min_height=self.target_height,
                    min_width=self.target_width,
                    border_mode=cv2.BORDER_CONSTANT,
                    fill=[255, 255, 255],
                    p=1.0,
                ),
            ]
        )

        self.torch_transform = T.Compose(
            [
                T.ToTensor(),
            ]
        )

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Resize image maintaining aspect ratio (follows the original
        LongestMaxSizeHW algorithm).

        The image is only ever scaled down. Each resulting side is clamped
        to at least one pixel: for extreme aspect ratios the integer
        truncation below can otherwise yield 0, which ``cv2.resize`` rejects.

        Args:
            image: Array whose first two dimensions are height and width.

        Returns:
            The resized array.
        """
        height, width = image.shape[:2]
        max_size_height = self.target_height
        max_size_width = self.target_width

        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
        aspect_ratio = width / height
        new_height = height
        new_width = width

        # If height too big then scale image down
        if height > max_size_height:
            new_height = max_size_height
            new_width = max(1, int(new_height * aspect_ratio))

        # If width too big, scale image down further
        if new_width > max_size_width:
            new_width = max_size_width
            new_height = max(1, int(new_width / aspect_ratio))

        # Use cv2.INTER_LINEAR like the original
        import cv2

        return cv2.resize(
            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
        )

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad image to the target size with white padding.

        Padding is added on the bottom and right edges only. Images that
        are already at least as large as the target are returned unchanged.
        """
        h, w = image.shape[:2]
        min_height, min_width = self.target_height, self.target_width

        # Only pad if image is smaller than target
        pad_h = max(0, min_height - h)
        pad_w = max(0, min_width - w)

        if pad_h == 0 and pad_w == 0:
            return image

        if len(image.shape) == 3:
            # Color image - pad bottom and right with white (255, 255, 255)
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w), (0, 0)),
                mode="constant",
                constant_values=255,
            )
        else:
            # Grayscale image - pad with white (255)
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w)),
                mode="constant",
                constant_values=255,
            )

        return padded

    def preprocess(
        self,
        images: Image.Image | list[Image.Image],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s)
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the single key ``"pixel_values"`` holding the
            stacked, normalized batch.
        """
        # Ensure images is a list
        if not isinstance(images, list):
            images = [images]

        # Convert PIL images to numpy arrays if needed
        processed_images = []
        for image in images:
            if isinstance(image, Image.Image):
                image = np.asarray(image)
            processed_images.append(image)

        # Apply NemotronParse-specific transforms
        pixel_values = []
        for image in processed_images:
            # Manual resize with aspect ratio preservation
            # (replaces LongestMaxSizeHW)
            processed_image = self._resize_with_aspect_ratio(image)

            # Apply remaining albumentations transforms if available
            if self.transform is not None:
                transformed = self.transform(image=processed_image)
                processed_image = transformed["image"]
            else:
                # Fallback: just pad to target size
                processed_image = self._pad_to_size(processed_image)

            # Convert to tensor
            pixel_values_tensor = self.torch_transform(processed_image)

            # Handle grayscale images
            if pixel_values_tensor.shape[0] == 1:
                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)

            pixel_values.append(pixel_values_tensor)

        # Stack into batch
        pixel_values = torch.stack(pixel_values)

        # Normalize pixel values
        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
        return {"pixel_values": normalized_values}

    def __call__(
        self, images: Image.Image | list[Image.Image], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """
    NemotronParse Processor

    Combines a tokenizer with a ``NemotronParseImageProcessor`` sized from
    ``config.image_size``.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        """Keep the config and tokenizer and create the image processor.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = NemotronParseImageProcessor(
            final_size=config.image_size,
        )

    def _make_batch_input(self, input_item=None):
        """Return ``input_item`` as a list (``None`` becomes ``[]``)."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one batch.

        Args:
            text: Prompt text; wrapped in a list when not already a list.
            images: Image(s); wrapped in a list when not already a list.
            return_tensors: Forwarded to ``BatchFeature`` as ``tensor_type``.
            **kwargs: Forwarded to the tokenizer call.

        Returns:
            A ``BatchFeature`` holding the tokenizer outputs merged with the
            image processor outputs (image keys win on a name clash).
        """
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        # The image processor is skipped when there are no images.
        if len(images) == 0:
            image_inputs = {}
        else:
            image_inputs = self.image_processor(images)

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)

        merged = {**text_inputs, **image_inputs}
        return BatchFeature(data=merged, tensor_type=return_tensors)
class
NemotronParseProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
()
...
...
vllm/model_executor/models/nemotron_vl.py
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
math
from
abc
import
ABC
from
collections.abc
import
Iterable
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
AutoModel
,
PretrainedConfig
from
transformers.image_processing_utils_fast
import
BaseImageProcessorFast
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
...
...
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
InternVLImageEmbeddingInputs
,
InternVLImageInputs
,
InternVLImagePixelInputs
,
InternVLProcessor
,
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.siglip
import
SiglipVisionModel
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.transformers_utils.processor
import
cached_image_processor_from_config
from
vllm.transformers_utils.processors.nemotron_vl
import
(
LlamaNemotronVLEmbedProcessor
,
NemotronVLProcessor
,
)
from
vllm.transformers_utils.repo_utils
import
get_hf_file_to_dict
from
.interfaces
import
(
...
...
@@ -58,310 +47,6 @@ from .utils import (
)
def build_transform(input_size: int):
    """Return the base image transform for Nemotron VL.

    The pipeline converts the image to RGB, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, and converts
    it to a tensor. No normalization is applied here.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    resize = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    return T.Compose([to_rgb, resize, T.ToTensor()])
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tiling grid that best fits an image.

    Every candidate ``(rw, rh)`` gets a score that is the product of
    - a size factor: tiled area divided by image area, capped at 0.6, and
    - a closeness factor: the smaller of the two quotients between the
      candidate's aspect ratio and ``aspect_ratio`` (1.0 means identical).

    Args:
        aspect_ratio: Aspect ratio of the image.
        target_ratios: Candidate ``(rw, rh)`` grids.
        width: Image width.
        height: Image height.
        image_size: Side length of one tile.

    Returns:
        The highest scoring candidate. On a tie the earliest candidate is
        kept; ``(1, 1)`` is returned when there are no candidates.
    """
    area = width * height
    winner = (1, 1)
    top_score = float("-inf")

    for rw, rh in target_ratios:
        candidate_ratio = rw / rh
        size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
        ratio_closeness = min(
            candidate_ratio / aspect_ratio,
            aspect_ratio / candidate_ratio,
        )
        score = size_factor * ratio_closeness

        # Strict comparison: an equal score never displaces the incumbent.
        if score > top_score:
            top_score = score
            winner = (rw, rh)

    return winner
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Work out how an image will be tiled.

    Args:
        orig_width: Width of the original image.
        orig_height: Height of the original image.
        target_ratios: Candidate tiling grids.
        image_size: Side length of one tile.
        use_thumbnail: Whether one extra block is counted for a thumbnail
            when the grid has more than one tile.

    Returns:
        ``(blocks, target_width, target_height)``: the number of blocks and
        the size the image is resized to before being split.
    """
    # find the closest aspect ratio to the target
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    grid_w = grid[0]
    grid_h = grid[1]

    # calculate the target width and height
    target_width = image_size * grid_w
    target_height = image_size * grid_h
    blocks = grid_w * grid_h

    # add thumbnail image if num_blocks != 1
    if use_thumbnail and blocks != 1:
        blocks = blocks + 1

    return blocks, target_width, target_height
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into ``image_size`` x ``image_size`` tiles.

    The image is resized to the target size chosen by
    ``calculate_nemotron_vl_targets`` and cropped into tiles in row-major
    order. When ``use_thumbnail`` is set and there is more than one tile,
    the original image resized to a single tile is appended.

    Returns:
        The list of tiles, with the optional thumbnail last.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    blocks, target_width, target_height = calculate_nemotron_vl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    # resize the image
    resized_img = image.resize((target_width, target_height))

    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        upper = row * image_size
        # split the image
        tile = resized_img.crop(
            (left, upper, left + image_size, upper + image_size)
        )
        processed_images.append(tile)

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every tiling grid with between ``min_num`` and ``max_num`` tiles.

    Returns:
        The distinct ``(i, j)`` pairs with ``min_num <= i * j <= max_num``,
        sorted by the product ``i * j`` in ascending order.
    """
    grids = set()
    for limit in range(min_num, max_num + 1):
        for i in range(1, limit + 1):
            for j in range(1, limit + 1):
                if min_num <= i * j <= max_num:
                    grids.add((i, j))

    def _tile_count(grid):
        return grid[0] * grid[1]

    return sorted(grids, key=_tile_count)
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile an image and convert the tiles into one stacked tensor.

    Args:
        image: Image to process.
        input_size: Side length of one tile.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        use_thumbnail: Forwarded to ``dynamic_preprocess_nemotron_vl``.
        transform: Transform applied to every tile; when ``None`` the one
            from ``build_transform(input_size=input_size)`` is used.

    Returns:
        The transformed tiles stacked along a new first dimension.
    """
    target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)

    if transform is None:
        transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=target_ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    tensors = []
    for tile in tiles:
        tensors.append(transform(tile))
    return torch.stack(tensors)
class NemotronVLProcessor(InternVLProcessor):
    """Processor for Nemotron VL models.

    Uses ``<image>`` as the image context token and the Nemotron VL tiling
    helpers defined in this module.
    """

    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Initialise the processor.

        Args:
            config: Model config; ``force_image_size``, ``patch_size`` and
                ``downsample_ratio`` are read from it.
            tokenizer: Tokenizer for the text side.
            image_processor: Optional HF image processor supplying
                ``max_num_tiles`` and ``use_thumbnail``.
            min_dynamic_patch: Minimum tile count; defaults to 1.
            max_dynamic_patch: Maximum tile count; defaults to
                ``image_processor.max_num_tiles`` or, when there is no image
                processor, to ``config.max_dynamic_patch`` (1 if absent).
            dynamic_image_size: Whether dynamic tiling is on; defaults to
                True.
        """
        # Only ABC.__init__ is invoked here; the parent class initialiser is
        # not called, so every attribute is assigned below.
        ABC.__init__(self)
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        image_size: int = config.force_image_size
        patch_size: int = config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            if image_processor is not None:
                max_dynamic_patch = image_processor.max_num_tiles
            else:
                # image_processor is optional (see use_thumbnail below), so
                # fall back to the config instead of dereferencing None.
                max_dynamic_patch = getattr(config, "max_dynamic_patch", 1)
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)

        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT``."""
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        """Return the per-tile transform (subclasses may override)."""
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert every image to a stacked tensor of its tiles."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just inserted
            NVL_IMAGE_CONTEXT = image_repl.full.replace(
                "<image>", "<NVL_IMG_CONTEXT>"
            )
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand the placeholders in ``text``.

        Returns:
            ``(text, image_inputs)``. ``image_inputs`` is empty when there
            are no images; otherwise it holds ``pixel_values_flat`` (all
            tiles concatenated) and ``image_num_patches`` (tiles per image).
        """
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        ``num_patches`` is part of the shared signature and is not used.
        """
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END

        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
class
NemotronVLProcessingInfo
(
BaseInternVLProcessingInfo
):
"""Processing info for Nemotron VL models."""
...
...
@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# - Pooler output instead of generative logits
# --------------------------------------------------------
# SigLIP normalization constants: per-channel mean and std handed to
# T.Normalize by build_siglip_transform.
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
def build_siglip_transform(input_size: int):
    """Build transform for SigLIP vision encoder with normalization.

    The result applies ``build_transform(input_size=input_size)`` first and
    then normalizes with ``SIGLIP_MEAN`` / ``SIGLIP_STD``.
    """
    normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    steps = [build_transform(input_size=input_size), normalize]
    return T.Compose(steps)
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for LlamaNemotronVL embedding model.

    Specializes NemotronVLProcessor for embedding tasks:
    - the per-tile transform adds SigLIP normalization
    - the image context token is <IMG_CONTEXT> rather than <image>
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Resolve the tiling settings and initialise the parent.

        Each setting left as ``None`` is looked up in ``processor_config``
        first, then on ``config``, then takes a hard-coded default
        (1, 1 and True respectively). The parent is always initialised with
        ``image_processor=None``.
        """

        def _lookup(proc_key, config_attr, fallback):
            # The config-level fallback is evaluated before the dict lookup.
            config_value = getattr(config, config_attr, fallback)
            return processor_config.get(proc_key, config_value)

        if min_dynamic_patch is None:
            min_dynamic_patch = _lookup("min_input_tiles", "min_dynamic_patch", 1)
        if max_dynamic_patch is None:
            max_dynamic_patch = _lookup("max_input_tiles", "max_dynamic_patch", 1)
        if dynamic_image_size is None:
            dynamic_image_size = _lookup(
                "dynamic_image_size", "dynamic_image_size", True
            )

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Override to add SigLIP normalization."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Override with simpler token replacement for embedding model.

        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
        not <image>, so there's no collision risk.
        """
        for tiles in pixel_values_lst:
            num_patches = tiles.shape[0]
            feature_size = num_patches * self.num_image_token
            replacement = self.get_image_repl(feature_size, num_patches).full
            # Each image consumes the first remaining "<image>" in each text.
            text = [t.replace("<image>", replacement, 1) for t in text]
        return text
class
LlamaNemotronVLEmbedProcessingInfo
(
NemotronVLProcessingInfo
):
"""Processing info for LlamaNemotronVL embedding model."""
...
...
vllm/model_executor/models/nvlm_d.py
View file @
f3403243
...
...
@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate
,
PromptUpdateDetails
,
)
from
vllm.transformers_utils.processors.nvlm_d
import
IMG_PAD
,
NVLMProcessor
from
.intern_vit
import
InternVisionModel
from
.internvl
import
(
BaseInternVLDummyInputsBuilder
,
BaseInternVLMultiModalProcessor
,
BaseInternVLProcessingInfo
,
BaseInternVLProcessor
,
InternVLChatModel
,
)
# Placeholder token repeated after each tile identifier by
# NVLMProcessor.get_image_repl; its vocabulary id is the image token id.
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """Processor for NVLM-D: tags every tile and pads with ``IMG_PAD``."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_PAD``."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        Each tile contributes its identifier (``<tile_1>``, ``<tile_2>``, ...,
        plus ``<tile_global_thumbnail>`` when thumbnails are enabled) followed
        by ``feature_size // num_patches`` copies of ``IMG_PAD``.

        Raises:
            NotImplementedError: If ``num_patches`` is ``None``.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tile_tags = []
        for tile_no in range(1, num_patches):
            tile_tags.append(f"<tile_{tile_no}>")
        if self.use_thumbnail:
            tile_tags.append("<tile_global_thumbnail>")

        context_size = feature_size // num_patches
        pad_run = IMG_PAD * context_size
        features = "".join([tag + pad_run for tag in tile_tags])

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"

        return PromptUpdateDetails.select_text(repl, IMG_PAD)
class
NVLMProcessingInfo
(
BaseInternVLProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
NVLMProcessor
:
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
f3403243
...
...
@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
,
PretrainedConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
...
...
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
InternVisionPatchModel
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
BaseProcessingInfo
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.t
ransformers_utils.processors.skyworkr1v
import
SkyworkR1VProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
# Text markers used by SkyworkR1VProcessor.get_image_repl: IMG_CONTEXT is
# repeated once per image feature and wrapped in IMG_START / IMG_END.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Per-channel mean and std passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class
SkyworkR1VImagePixelInputs
(
TensorSchema
):
"""
...
...
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Return the per-tile image transform for Skywork-R1V.

    The pipeline converts the image to RGB, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it
    to a tensor and normalizes with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tiling grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Aspect ratio of the image.
        target_ratios: Candidate grids; element 0 divided by element 1 is
            the candidate's aspect ratio.
        width: Image width.
        height: Image height.
        image_size: Side length of one tile.

    Returns:
        The candidate with the smallest absolute aspect-ratio difference.
        A later candidate with exactly the same difference replaces the
        current choice only when the image area exceeds half of that
        candidate's tiled area. ``(1, 1)`` is returned when there are no
        candidates.
    """
    area = width * height
    chosen = (1, 1)
    smallest_diff = float("inf")

    for candidate in target_ratios:
        diff = abs(aspect_ratio - candidate[0] / candidate[1])

        if diff < smallest_diff:
            smallest_diff = diff
            chosen = candidate
            continue

        if diff != smallest_diff:
            continue

        # Tie: switch only if the image is large enough for this grid.
        if area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective minimum and maximum patch counts.

    With dynamic image size disabled both bounds are forced to 1. When a
    thumbnail is used and the maximum is not 1, the maximum grows by one.

    Returns:
        ``(min_dynamic_patch, max_dynamic_patch)`` after these adjustments.
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch = max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every tiling grid with between ``min_num`` and ``max_num`` tiles.

    Returns:
        The distinct ``(i, j)`` pairs with ``min_num <= i * j <= max_num``,
        ordered by ascending ``i * j``.
    """
    candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }

    ordered = list(candidates)
    ordered.sort(key=lambda grid: grid[0] * grid[1])
    return ordered
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Work out how an image will be tiled.

    Args:
        orig_width: Width of the original image.
        orig_height: Height of the original image.
        target_ratios: Candidate tiling grids.
        image_size: Side length of one tile.
        use_thumbnail: Whether one extra block is counted for a thumbnail
            when the grid has more than one tile.

    Returns:
        ``(blocks, target_width, target_height)``.
    """
    aspect_ratio = orig_width / orig_height

    # find the closest aspect ratio to the target
    best_grid = find_closest_aspect_ratio(
        aspect_ratio,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    across, down = best_grid[0], best_grid[1]

    # calculate the target width and height
    target_width = image_size * across
    target_height = image_size * down

    # add thumbnail image if num_blocks != 1
    blocks = across * down
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into ``image_size`` x ``image_size`` tiles.

    The image is resized to the target size from
    ``calculate_skyworkr1v_targets`` and cropped tile by tile, left to right
    and top to bottom. When ``use_thumbnail`` is set and more than one tile
    was produced, the original image resized to one tile is appended.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    blocks, target_width, target_height = calculate_skyworkr1v_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    # resize the image
    resized_img = image.resize((target_width, target_height))

    per_row = target_width // image_size

    def _tile_box(index):
        # Crop box (left, upper, right, lower) of the index-th tile.
        x0 = (index % per_row) * image_size
        y0 = (index // per_row) * image_size
        return (x0, y0, x0 + image_size, y0 + image_size)

    # split the image
    processed_images = [resized_img.crop(_tile_box(i)) for i in range(blocks)]

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)

    return processed_images
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and convert the tiles into one stacked tensor.

    Args:
        image: Image to process.
        input_size: Side length of one tile.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        use_thumbnail: Forwarded to ``dynamic_preprocess_skyworkr1v``.

    Returns:
        The transformed tiles stacked along a new first dimension.
    """
    target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=target_ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    tile_tensors = list(map(transform, tiles))
    return torch.stack(tile_tensors)
class
SkyworkR1VProcessor
:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
"""
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> None:
    """Initialise the processor.

    Args:
        config: Model config. ``vision_config.image_size``,
            ``vision_config.patch_size``, ``downsample_ratio`` and
            ``use_thumbnail`` are always read from it.
        tokenizer: Tokenizer for the text side.
        min_dynamic_patch: Overrides ``config.min_dynamic_patch``.
        max_dynamic_patch: Overrides ``config.max_dynamic_patch``.
        dynamic_image_size: Overrides ``config.dynamic_image_size``.
    """
    super().__init__()

    self.config = config
    self.tokenizer = tokenizer

    image_size: int = config.vision_config.image_size
    patch_size: int = config.vision_config.patch_size

    # Explicit arguments win; otherwise the config provides the value.
    min_dynamic_patch = (
        config.min_dynamic_patch
        if min_dynamic_patch is None
        else min_dynamic_patch
    )
    assert isinstance(min_dynamic_patch, int)

    max_dynamic_patch = (
        config.max_dynamic_patch
        if max_dynamic_patch is None
        else max_dynamic_patch
    )
    assert isinstance(max_dynamic_patch, int)

    dynamic_image_size = (
        config.dynamic_image_size
        if dynamic_image_size is None
        else dynamic_image_size
    )
    assert isinstance(dynamic_image_size, bool)

    patches_per_side = image_size // patch_size
    self.num_image_token = int(
        patches_per_side**2 * (config.downsample_ratio**2)
    )
    self.image_size = image_size
    self.min_dynamic_patch = min_dynamic_patch
    self.max_dynamic_patch = max_dynamic_patch
    self.dynamic_image_size = dynamic_image_size
    self.use_thumbnail: bool = config.use_thumbnail
@
property
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
return
PromptUpdateDetails
.
select_text
(
repl_full
,
IMG_CONTEXT
)
def
resolve_min_max_num
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
is
None
else
dynamic_image_size
)
use_thumbnail
=
self
.
use_thumbnail
if
use_thumbnail
is
None
else
use_thumbnail
return
resolve_skyworkr1v_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
use_thumbnail
:
bool
|
None
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
return
get_skyworkr1v_target_ratios
(
min_num
,
max_num
)
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
)
->
int
:
target_ratios
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
)
num_patches
,
_
,
_
=
calculate_skyworkr1v_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
image_size
=
self
.
image_size
,
target_ratios
=
target_ratios
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
)
->
list
[
torch
.
Tensor
]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
)
return
[
image_to_pixel_values_skyworkr1v
(
image
,
input_size
=
self
.
image_size
,
min_num
=
min_num
,
max_num
=
max_num
,
use_thumbnail
=
self
.
use_thumbnail
,
)
for
image
in
images
]
def
__call__
(
self
,
text
:
str
|
list
[
str
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
]
|
None
=
None
,
min_dynamic_patch
:
int
|
None
=
None
,
max_dynamic_patch
:
int
|
None
=
None
,
dynamic_image_size
:
bool
|
None
=
None
,
return_tensors
:
str
|
TensorType
|
None
=
None
,
)
->
BatchFeature
:
if
text
is
None
:
text
=
[]
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
if
images
is
None
:
images
=
[]
if
not
isinstance
(
images
,
list
):
images
=
[
images
]
if
len
(
images
)
==
0
:
image_inputs
=
{}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
(
[
len
(
item
)
for
item
in
pixel_values_lst
]
),
}
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
text
=
[
t
.
replace
(
"<image>"
,
image_repl
.
full
,
1
)
for
t
in
text
]
text_inputs
=
self
.
tokenizer
(
text
)
combined_outputs
=
{
**
text_inputs
,
**
image_inputs
}
return
BatchFeature
(
combined_outputs
,
tensor_type
=
return_tensors
)
class
SkyworkR1VProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
SkyworkR1VProcessor
:
return
self
.
ctx
.
init_processor
(
...
...
vllm/transformers_utils/processors/__init__.py
View file @
f3403243
...
...
@@ -13,35 +13,53 @@ import importlib
# Names this package declares as its public API.
__all__ = [
    "BagelProcessor",
    "DeepseekVLV2Processor",
    "Eagle2_5_VLProcessor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
    "H2OVLProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "InternVLProcessor",
    "KimiAudioProcessor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
    "NanoNemotronVLProcessor",
    "NemotronParseProcessor",
    "NemotronVLProcessor",
    "LlamaNemotronVLEmbedProcessor",
    "NVLMProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
    "SkyworkR1VProcessor",
]

# Maps each exported class name to the dotted path of the module defining it.
# Several names may share one module (e.g. both Nemotron VL processors).
# NOTE(review): presumably consumed by a lazy importer elsewhere in this
# module -- that code is outside this view; confirm before relying on it.
_CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
    "Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
    "H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
    "NemotronParseProcessor": "vllm.transformers_utils.processors.nemotron_parse",
    "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    "NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
    "OvisProcessor": "vllm.transformers_utils.processors.ovis",
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
    "SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
}
...
...
vllm/transformers_utils/processors/eagle2_5_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Builds on BaseInternVLProcessor but initialises its own attributes and
    resolves the image token id from the config before the tokenizer.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Set every attribute directly from ``config`` and the overrides.

        ``super().__init__()`` is intentionally not called; optional config
        fields fall back to built-in defaults instead of being required.
        """
        self.config = config
        self.tokenizer = tokenizer

        # A truthy ``force_image_size`` on the config overrides the vision
        # config's image size.
        image_size: int = config.vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Tokens per tile: squared patch-grid side scaled by the squared
        # downsample ratio, truncated to an int.
        grid_side = image_size // patch_size
        self.num_image_token = int(grid_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Explicit overrides win; otherwise use the config value, and if the
        # config lacks the field, the literal default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Return the image token id, preferring ``config.image_token_index``.

        Raises:
            ValueError: If the config has no ``image_token_index`` and the
                tokenizer vocabulary lacks ``IMG_CONTEXT``.
        """
        try:
            return self.config.image_token_index
        except AttributeError:
            pass

        # Config does not carry the id: look IMG_CONTEXT up in the vocabulary.
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image placeholder.

        The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``. ``num_patches`` is accepted
        but not used here.
        """
        replacement = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        return PromptUpdateDetails.select_text(replacement, IMG_CONTEXT)
vllm/transformers_utils/processors/h2ovl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import
torch
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
BaseInternVLProcessor
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) patch counts.

    With ``dynamic_image_size`` off both bounds are 1. Otherwise the inputs
    are kept, and the maximum is raised by one when ``use_thumbnail`` is set
    and the maximum is not 1.
    """
    if not dynamic_image_size:
        # A maximum of 1 never receives the extra thumbnail slot.
        return 1, 1

    if use_thumbnail and max_dynamic_patch != 1:
        return min_dynamic_patch, max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return the InternVL target ratios, optionally filtered by a prior ratio.

    When ``prior_aspect_ratio`` is given, a candidate is kept only if neither
    component of the prior ratio is divisible by the corresponding component
    of the candidate.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)

    if prior_aspect_ratio is None:
        return candidates

    kept = []
    for ratio in candidates:
        if prior_aspect_ratio[0] % ratio[0] == 0:
            continue
        if prior_aspect_ratio[1] % ratio[1] == 0:
            continue
        kept.append(ratio)
    return kept
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the closest target ratio and derive the tiling geometry.

    Returns:
        ``(blocks, target_width, target_height, target_aspect_ratio)``.
        ``blocks`` is the product of the chosen ratio's two components, plus
        one when ``use_thumbnail`` is set and that product is not 1. The
        width and height are ``image_size`` times the respective component.
    """
    chosen_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    cols = chosen_ratio[0]
    rows = chosen_ratio[1]

    target_width = image_size * cols
    target_height = image_size * rows
    blocks = cols * rows

    # A single-block layout never gets an extra thumbnail block.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height, chosen_ratio
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Resize ``image`` to the chosen grid and cut it into square tiles.

    Returns:
        The tiles in row-major order, followed by a thumbnail of the whole
        image when ``use_thumbnail`` is set and there is more than one tile,
        together with the chosen aspect ratio.
    """
    src_width, src_height = image.size

    # Ask for the tile count without the thumbnail; it is appended below.
    (
        blocks,
        target_width,
        target_height,
        target_aspect_ratio,
    ) = calculate_h2ovl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))

    tiles = []
    for index in range(blocks):
        # Row-major position of this tile within the resized image.
        row, col = divmod(index, target_width // image_size)
        left = col * image_size
        top = row * image_size
        tiles.append(
            resized.crop((left, top, left + image_size, top + image_size))
        )

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, target_aspect_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Run one tiling pass over ``image``.

    Returns:
        The stacked, transformed tiles and the aspect ratio chosen by
        ``dynamic_preprocess_h2ovl`` for this pass.
    """
    candidate_ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=candidate_ratios,
    )

    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, chosen_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert one image into its pixel-value tensor, optionally with MSAC.

    Without MSAC a single pass is run with the given ``min_num`` and
    ``use_thumbnail``. With MSAC two passes are run, both with the thumbnail
    enabled: the first with ``min_num=1``, the second with ``min_num=3`` and
    the first pass's aspect ratio as ``prior_aspect_ratio``. The caller's
    ``min_num`` and ``use_thumbnail`` are not used in the MSAC case.
    """
    if not use_msac:
        single_pass, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return single_pass

    first_pass, first_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )

    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_ratio,
    )

    # Order: second pass minus its last entry, first pass minus its last
    # entry, then the last entry of the second pass.
    return torch.cat([second_pass[:-1], first_pass[:-1], second_pass[-1:]], 0)
class H2OVLProcessor(BaseInternVLProcessor):
    """InternVL-style processor for H2OVL with an optional MSAC two-pass mode."""

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        """Initialise the base processor, then resolve ``use_msac``.

        ``use_msac`` falls back to ``config.use_msac`` when left as ``None``.
        """
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        resolved_msac = config.use_msac if use_msac is None else use_msac
        assert isinstance(resolved_msac, bool)

        self.use_msac = resolved_msac

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the ``IMG_CONTEXT`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image placeholder.

        The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``. ``num_patches`` is accepted
        but not used here.
        """
        replacement = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        return PromptUpdateDetails.select_text(replacement, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Fill ``None`` arguments from the instance, then delegate to
        ``resolve_h2ovl_min_max_num``."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Resolve (min, max) and return the H2OVL target ratios.

        ``override_min_num``, when given, replaces the resolved minimum;
        ``prior_aspect_ratio`` is forwarded to ``get_h2ovl_target_ratios``.
        """
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        if override_min_num is not None:
            lower = override_min_num

        return get_h2ovl_target_ratios(
            lower,
            upper,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        ``use_msac`` defaults to ``self.use_msac``. In MSAC mode the block
        counts of both passes are added and reduced by one.
        """
        msac_enabled = self.use_msac if use_msac is None else use_msac
        use_thumbnail = self.use_thumbnail

        if not msac_enabled:
            # The thumbnail is accounted for by calculate_h2ovl_targets.
            ratios = self.resolve_target_ratios(use_thumbnail=False)
            num_patches, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=ratios,
                use_thumbnail=use_thumbnail,
            )
            return num_patches * self.num_image_token

        # First pass: minimum forced to 1, thumbnail always counted.
        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,
            override_min_num=1,
        )
        first_count, _, _, first_aspect = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=first_ratios,
            use_thumbnail=True,
        )

        # Second pass: minimum forced to 3, filtered by the first pass's ratio.
        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,
            prior_aspect_ratio=first_aspect,
            override_min_num=3,
        )
        second_count, _, _, _ = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=second_ratios,
            use_thumbnail=True,
        )

        # Mirrors image_to_pixel_values_h2ovl, which drops the last entry of
        # the first pass when concatenating.
        num_patches = first_count + second_count - 1
        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Return one pixel-value tensor per input image, in input order.

        MSAC is only applied when exactly one image is passed.
        """
        msac_enabled = len(images) == 1 and self.use_msac

        # The thumbnail is applied by image_to_pixel_values_h2ovl,
        # so it is switched off while resolving (min, max).
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        tensors = []
        for img in images:
            tensors.append(
                image_to_pixel_values_h2ovl(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=msac_enabled,
                )
            )
        return tensors
vllm/transformers_utils/processors/internvl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
abc
import
ABC
,
abstractmethod
from
typing
import
Any
,
TypeVar
import
numpy.typing
as
npt
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
# Generic element type used by BaseInternVLProcessor._make_batch_input.
_T = TypeVar("_T")

# Marker strings used when building image replacement text:
# IMG_START and IMG_END wrap a run of repeated IMG_CONTEXT tokens.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Per-channel mean/std passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the torchvision pipeline applied to every tile.

    Steps, in order: convert to RGB, bicubic resize to
    ``(input_size, input_size)``, convert to a tensor, normalise with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def _to_rgb(img):
        return convert_image_mode(img, "RGB")

    steps = [
        T.Lambda(_to_rgb),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the candidate ratio closest to ``aspect_ratio``.

    Candidates are scanned in order. A strictly smaller distance always wins.
    On an exact tie, the later candidate replaces the current one only if
    ``width * height`` exceeds half the pixel area that candidate's grid
    would cover. With no candidates, ``(1, 1)`` is returned.
    """
    chosen = (1, 1)
    smallest_gap = float("inf")
    source_area = width * height

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif (
            gap == smallest_gap
            and source_area
            > 0.5 * image_size * image_size * candidate[0] * candidate[1]
        ):
            # Same shape, larger grid: only worth it for big enough images.
            chosen = candidate

    return chosen
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) patch counts.

    With ``dynamic_image_size`` off both bounds are 1. Otherwise the inputs
    are kept, and the maximum is raised by one when ``use_thumbnail`` is set
    and the maximum is not 1.
    """
    if dynamic_image_size:
        lower, upper = min_dynamic_patch, max_dynamic_patch
    else:
        lower, upper = 1, 1

    # Reserve one extra slot for the thumbnail, unless only one patch is used.
    if use_thumbnail and upper != 1:
        upper += 1

    return lower, upper
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid whose tile count lies in the given range.

    Returns:
        The distinct pairs with ``min_num <= i * j <= max_num``, sorted by
        ``i * j``. The sort is stable, so pairs with an equal product keep
        the iteration order of the intermediate set.
    """
    grids = set()
    for limit in range(min_num, max_num + 1):
        for i in range(1, limit + 1):
            for j in range(1, limit + 1):
                if min_num <= i * j <= max_num:
                    grids.add((i, j))

    def _tile_count(grid):
        return grid[0] * grid[1]

    return sorted(grids, key=_tile_count)
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Pick the closest target ratio and derive the tiling geometry.

    Returns:
        ``(blocks, target_width, target_height)``. ``blocks`` is the product
        of the chosen ratio's two components, plus one when ``use_thumbnail``
        is set and that product is not 1. The width and height are
        ``image_size`` times the respective component.
    """
    chosen_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    cols = chosen_ratio[0]
    rows = chosen_ratio[1]

    target_width = image_size * cols
    target_height = image_size * rows
    blocks = cols * rows

    # A single-block layout never gets an extra thumbnail block.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Resize ``image`` to the chosen grid and cut it into square tiles.

    Returns:
        The tiles in row-major order, followed by a thumbnail of the whole
        image when ``use_thumbnail`` is set and there is more than one tile.
    """
    src_width, src_height = image.size

    # Ask for the tile count without the thumbnail; it is appended below.
    blocks, target_width, target_height = calculate_internvl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((target_width, target_height))

    tiles = []
    for index in range(blocks):
        # Row-major position of this tile within the resized image.
        row, col = divmod(index, target_width // image_size)
        left = col * image_size
        top = row * image_size
        tiles.append(
            resized.crop((left, top, left + image_size, top + image_size))
        )

    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert one PIL image into a stacked tensor of transformed tiles.

    Args:
        image: Source image to split.
        input_size: Tile side length; also the size the transform resizes to.
        min_num: Minimum tile count used to enumerate target ratios.
        max_num: Maximum tile count used to enumerate target ratios.
        use_thumbnail: Forwarded to ``dynamic_preprocess_internvl``.

    Returns:
        ``torch.stack`` of the transform applied to every tile, in the order
        returned by ``dynamic_preprocess_internvl``.
    """
    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=candidate_ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a video into a stacked tensor with one entry per frame.

    Each item yielded by iterating ``video`` is turned into an RGB PIL image
    and run through ``dynamic_preprocess_internvl``, which is asserted to
    return exactly one image per frame.
    """
    candidate_ratios = get_internvl_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    collected: list[Image.Image] = []
    for frame in video:
        frame_tiles = dynamic_preprocess_internvl(
            Image.fromarray(frame, mode="RGB"),
            target_ratios=candidate_ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )

        assert len(frame_tiles) == 1
        collected.extend(frame_tiles)

    return torch.stack([to_tensor(tile) for tile in collected])
class BaseInternVLProcessor(ABC):
    """
    Hand-written base processor for InternVL-style models, which do not ship
    an HF processor of their own.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store config/tokenizer and resolve the tiling settings.

        Every keyword override left as ``None`` is read from the attribute
        of the same name on ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        vision_config = config.vision_config
        image_size: int = vision_config.image_size
        patch_size: int = vision_config.patch_size

        min_dynamic_patch = (
            config.min_dynamic_patch
            if min_dynamic_patch is None
            else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch
            if max_dynamic_patch is None
            else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Tokens per tile: squared patch-grid side scaled by the squared
        # downsample ratio, truncated to an int.
        grid_side = image_size // patch_size
        self.num_image_token = int(grid_side**2 * (config.downsample_ratio**2))

        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Id of the image token; must be provided by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Replacement text for one ``<image>`` placeholder; must be provided
        by subclasses."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Fill ``None`` arguments from the instance, then delegate to
        ``resolve_internvl_min_max_num``."""
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Resolve the (min, max) pair and return the matching target ratios."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_internvl_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        This is the block count from ``calculate_internvl_targets`` multiplied
        by ``self.num_image_token``.
        """
        # The thumbnail is accounted for by calculate_internvl_targets,
        # so it is switched off while resolving the ratios.
        ratios = self.resolve_target_ratios(use_thumbnail=False)

        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Return one pixel-value tensor per input image, in input order."""
        # The thumbnail is applied by image_to_pixel_values_internvl,
        # so it is switched off while resolving (min, max).
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        tensors = []
        for img in images:
            tensors.append(
                image_to_pixel_values_internvl(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return tensors

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Expand ``<image>`` placeholders and build the image tensors.

        For every image, the first remaining ``<image>`` occurrence in each
        prompt is replaced by that image's replacement text.

        Returns:
            The updated prompts and a dict that is empty without images, and
            otherwise holds ``pixel_values_flat`` (all tiles concatenated)
            and ``image_num_patches`` (tile count per image).
        """
        if len(images) == 0:
            return text, {}

        per_image = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        image_inputs = {
            "pixel_values_flat": torch.cat(per_image),
            "image_num_patches": torch.tensor(
                [len(tiles) for tiles in per_image]
            ),
        }

        for tiles in per_image:
            tile_count = tiles.shape[0]
            replacement = self.get_image_repl(
                tile_count * self.num_image_token, tile_count
            )
            text = [
                prompt.replace("<image>", replacement.full, 1)
                for prompt in text
            ]

        return text, image_inputs

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalise ``None`` / a single item / a list into a list."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Expand image placeholders, tokenize, and merge both outputs.

        Extra ``**kwargs`` are accepted but not used by this method.
        """
        prompts = self._make_batch_input(text)
        image_list = self._make_batch_input(images)

        prompts, image_inputs = self._preprocess_image(
            text=prompts,
            images=image_list,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(prompts)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
class InternVLProcessor(BaseInternVLProcessor):
    """
    Processor for InternVLChatModel that adds video handling on top of the
    image handling inherited from ``BaseInternVLProcessor``.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        """Initialise the base processor and remember the video token.

        Args:
            config: Forwarded to the base class.
            tokenizer: Forwarded to the base class.
            min_dynamic_patch: Forwarded to the base class.
            max_dynamic_patch: Forwarded to the base class.
            dynamic_image_size: Forwarded to the base class.
            video_token: Context token used for video frames, or ``None``
                when the model has no such token.
        """
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # Extra token that marks video frame features in the prompt.
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        """Id of ``IMG_CONTEXT`` looked up in the tokenizer vocabulary."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        """Id of the video token, or ``None`` if unset or not in the vocab."""
        token = self.video_token
        if token is None:
            return None
        return self.tokenizer.get_vocab().get(token, None)

    @property
    def supports_video(self) -> bool:
        """Whether a video token id could be resolved."""
        token_id = self.video_token_id
        return token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each video array into a pixel-value tensor.

        The patch range is resolved with both the minimum and maximum dynamic
        patch count pinned to 1 and thumbnails disabled.
        """
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        per_video: list[torch.Tensor] = []
        for clip in videos:
            per_video.append(
                video_to_pixel_values_internvl(
                    clip,
                    input_size=self.image_size,
                    min_num=min_num,
                    max_num=max_num,
                    use_thumbnail=False,
                )
            )
        return per_video

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand ``<video>`` placeholders and collect the video tensors.

        Returns ``(text, {})`` untouched when there are no videos or video is
        not supported. Otherwise, for each video in order, the first
        remaining ``<video>`` occurrence in every prompt is replaced by that
        video's expansion.
        """
        if not (len(videos) > 0 and self.supports_video):
            return text, {}

        video_token = self.video_token
        assert video_token is not None

        per_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        video_inputs = {
            "pixel_values_flat_video": torch.cat(per_video),
            "video_num_patches": torch.tensor([len(frames) for frames in per_video]),
        }

        for frames in per_video:
            expansion = self.get_video_repl(
                self.num_image_token, frames.shape[0], video_token
            )
            text = [prompt.replace("<video>", expansion.full, 1) for prompt in text]

        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Expand image and video placeholders, tokenize, and bundle.

        Images are processed before videos. ``**kwargs`` is accepted but not
        used. In the returned ``BatchFeature`` later sources win on a key
        clash, in the order tokenizer output, image inputs, video inputs.
        """
        prompts = self._make_batch_input(text)
        image_list = self._make_batch_input(images)
        video_list = self._make_batch_input(videos)

        prompts, image_inputs = self._preprocess_image(
            text=prompts,
            images=image_list,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        prompts, video_inputs = self._preprocess_video(
            text=prompts,
            videos=video_list,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(prompts)

        return BatchFeature(
            {**text_inputs, **image_inputs, **video_inputs},
            tensor_type=return_tensors,
        )

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the image expansion: ``feature_size`` context tokens wrapped
        in ``IMG_START`` / ``IMG_END``. ``num_patches`` is not used here.
        """
        wrapped = IMG_START + IMG_CONTEXT * feature_size + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Build the video expansion, one labelled span per frame.

        Each frame contributes ``self.num_image_token`` copies of
        ``video_context_token`` wrapped in ``IMG_START`` / ``IMG_END``;
        ``feature_size`` is not consulted.

        Raises:
            NotImplementedError: If ``num_patches`` is ``None``.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        frame_span = IMG_START + video_context_token * self.num_image_token + IMG_END

        # num_patches is equal to num_frames
        pieces = []
        for i in range(num_patches):
            pieces.append(f"Frame{i + 1}: {frame_span}")

        return PromptUpdateDetails.select_text("".join(pieces), video_context_token)
vllm/transformers_utils/processors/nano_nemotron_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# --------------------------------------------------------
# Adapted from
# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py
# under Apache-2.0 License
# LICENSE is in root directory.
# --------------------------------------------------------
import
math
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Sequence
from
dataclasses
import
dataclass
from
typing
import
Any
,
TypeVar
import
einops
import
numpy
as
np
import
numpy.typing
as
npt
import
regex
as
re
import
torch
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.model_executor.models.parakeet
import
ParakeetExtractor
from
vllm.multimodal.evs
import
compute_retained_tokens_count
from
vllm.multimodal.inputs
import
AudioItem
from
vllm.multimodal.processing.processor
import
PromptUpdateDetails
,
_seq2tokens
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
calculate_internvl_targets
,
get_internvl_target_ratios
# Generic item type for helpers that accept either one item or a list of items
# (see ``_make_batch_input``).
_T = TypeVar("_T")

# Text markers for image placeholders. ``get_image_repl`` builds the expansion
# as IMG_START + IMG_CONTEXT * feature_size + IMG_END.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"

# Text markers for audio placeholders, assembled the same way by
# ``get_audio_repl``.
AUDIO_START = "<so_start>"
AUDIO_END = "<so_end>"
AUDIO_CONTEXT = "<so_embedding>"

# Profiling
# MAX_FRAMES = 16
# Fallback used for ``max_num_tiles`` when the processor is built without one.
DEFAULT_NUM_TILES = 12
def
calculate_timestamps
(
indices
:
list
[
int
]
|
torch
.
Tensor
,
frame_duration_ms
:
int
,
):
if
not
isinstance
(
indices
,
list
):
indices
=
indices
.
tolist
()
timestamps
=
[
int
(
i
)
*
frame_duration_ms
/
1000.0
for
i
in
indices
]
return
timestamps
def input_conditioner(
    x: torch.Tensor, norm_mean: torch.Tensor, norm_std: torch.Tensor
):
    """Normalize ``x`` by subtracting ``norm_mean`` and dividing by ``norm_std``.

    Plain elementwise arithmetic is used, so the usual broadcasting rules of
    the operands apply.
    """
    centered = x - norm_mean
    return centered / norm_std
def dynamic_preprocess(
    image,
    *,
    image_size=512,
    max_num_tiles=12,
    use_thumbnail=True,
    idx=0,
):
    """Resize an image to an InternVL target grid and cut it into square tiles.

    Args:
        image: PIL-style image exposing ``size``, ``mode`` and ``convert``.
        image_size: Side length of every tile in pixels.
        max_num_tiles: Upper bound used when enumerating target ratios.
        use_thumbnail: When true and more than one tile was produced, a
            whole-image thumbnail of ``image_size`` x ``image_size`` is
            appended as the last tile.
        idx: Not used by this function.

    Returns:
        A list of ``(C, image_size, image_size)`` tensors with values divided
        by 255.
    """
    src_width, src_height = image.size

    ratios = get_internvl_target_ratios(1, max_num_tiles)
    _num_blocks, dst_width, dst_height = calculate_internvl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    rgb = image if image.mode == "RGB" else image.convert("RGB")
    pixels = torch.from_numpy(np.asarray(rgb, dtype=np.uint8))
    # (H, W, 3) -> (1, H, W, 3) -> (1, 3, H, W)
    pixels = pixels.unsqueeze(0).permute(0, 3, 1, 2)

    def _resize(size):
        # Shared bicubic, anti-aliased resize used for tiles and thumbnail.
        return torch.nn.functional.interpolate(
            pixels,
            size=size,
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    resized = _resize((dst_height, dst_width))
    batch, channels, height, width = resized.shape
    rows, cols = height // image_size, width // image_size

    # Split H and W into (rows, image_size) / (cols, image_size) and bring the
    # grid axes to the front so that each tile becomes one batch entry.
    grid = resized.reshape(batch, channels, rows, image_size, cols, image_size)
    tiles = grid.permute(0, 2, 4, 1, 3, 5).reshape(
        batch * rows * cols, channels, image_size, image_size
    )
    tiles = tiles / 255.0

    if use_thumbnail and tiles.shape[0] > 1:
        thumbnail = _resize((image_size, image_size)) / 255.0
        tiles = torch.cat([tiles, thumbnail], dim=0)

    return list(tiles)
def image_to_pixel_values(
    image: Image.Image,
    *,
    input_size: int,
    max_num: int,
    use_thumbnail: bool,
    idx: int,
) -> torch.Tensor:
    """Tile one image with ``dynamic_preprocess`` and stack the tiles.

    Args:
        image: Image to tile.
        input_size: Tile side length, passed on as ``image_size``.
        max_num: Maximum number of tiles, passed on as ``max_num_tiles``.
        use_thumbnail: Passed through to ``dynamic_preprocess``.
        idx: Passed through to ``dynamic_preprocess``.

    Returns:
        The tiles stacked along a new leading dimension.
    """
    tiles = dynamic_preprocess(
        image,
        image_size=input_size,
        max_num_tiles=max_num,
        use_thumbnail=use_thumbnail,
        idx=idx,
    )
    return torch.stack(tiles)
def video_to_pixel_values(
    video: npt.NDArray,
    *,
    input_size: int,
    max_num_tiles: int = 1,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a channels-last video array into scaled, channels-first frames.

    Args:
        video: Array laid out as (num_frames, H, W, C).
        input_size: Target height and width; frames are resized only when
            they do not already have this size.
        max_num_tiles: Must be 1.
        use_thumbnail: Not used by this function.

    Returns:
        A (num_frames, C, input_size, input_size) tensor divided by 255.
    """
    assert max_num_tiles == 1, "Video modality always uses one tile"

    # (num_frames, H, W, C) -> (num_frames, C, H, W)
    frames = torch.from_numpy(video).permute(0, 3, 1, 2)

    height, width = frames.shape[2], frames.shape[3]
    already_sized = height == input_size and width == input_size
    if not already_sized:
        frames = torch.nn.functional.interpolate(
            frames,
            size=(input_size, input_size),
            mode="bicubic",
            align_corners=False,
            antialias=True,
        )

    return frames / 255.0
class
DynamicResolutionImageTiler
:
CONV_MERGING
=
False
PIXEL_SHUFFLE
=
True
USE_THUMBNAIL
=
False
def
__init__
(
self
,
*
,
max_model_len
:
int
,
patch_size
:
int
,
min_num_patches
:
int
,
max_num_patches
:
int
,
downsample_ratio
:
int
,
norm_mean
:
Sequence
[
float
],
norm_std
:
Sequence
[
float
],
factor_max
:
float
=
1.0
,
use_thumbnail
:
bool
=
False
,
)
->
None
:
assert
use_thumbnail
is
False
,
"use_thumbnail is not supported"
self
.
_patch_size
:
int
=
patch_size
self
.
_max_model_len
=
max_model_len
self
.
_min_num_patches
=
min_num_patches
self
.
_max_num_patches
=
max_num_patches
if
max_num_patches
>
0
else
float
(
"inf"
)
self
.
_factor_max
=
factor_max
self
.
norm_mean
=
torch
.
tensor
(
norm_mean
).
reshape
(
3
,
1
,
1
)
self
.
norm_std
=
torch
.
tensor
(
norm_std
).
reshape
(
3
,
1
,
1
)
assert
downsample_ratio
<
1
reduction_factor
=
1
/
downsample_ratio
assert
reduction_factor
==
2.0
self
.
_downsample_ratio
=
int
(
reduction_factor
)
**
(
self
.
PIXEL_SHUFFLE
+
self
.
CONV_MERGING
)
assert
self
.
_downsample_ratio
==
2
def
_get_num_embeddings
(
self
,
width
:
int
,
height
:
int
)
->
int
:
num_patches
=
(
width
//
self
.
_patch_size
)
*
(
height
//
self
.
_patch_size
)
num_tokens
=
num_patches
//
(
self
.
_downsample_ratio
**
2
)
return
num_tokens
def
width_and_height_for_max_num_tokens_available
(
self
,
target_num_tokens_post_shuffle
:
int
,
)
->
tuple
[
int
,
int
]:
"""
TODO: optimize this so it squeezes closer to target number of tokens.
Calculate image dimensions that produce approximately `target` tokens after
pixel_shuffle.
With pixel_shuffle enabled, each 2x2 patch grid becomes 1 token, so we
need 4*B patches to get B tokens.
Examples:
>>> PATCH_SIZE = 16
>>> DOWNSAMPLE_RATIO = 0.5
>>> tiler = DynamicResolutionImageTiler(
... max_model_len=16384,
... patch_size=PATCH_SIZE,
... downsample_ratio=DOWNSAMPLE_RATIO,
... min_num_patches=4,
... max_num_patches=0,
... )
>>> width, height = tiler.width_and_height_for_max_num_tokens_available(
... target_num_tokens_post_shuffle=8192,
... )
>>> assert width, height == (2880, 2880)
>>> assert (width // PATCH_SIZE) * (
... height // PATCH_SIZE
... ) // 2**2 == 8100 # tokens post-shuffle
>>> assert tiler._get_num_embeddings(width=width, height=height) == 8100
"""
side_pixels
=
(
math
.
isqrt
(
target_num_tokens_post_shuffle
)
*
self
.
_downsample_ratio
*
self
.
_patch_size
)
assert
isinstance
(
side_pixels
,
int
)
and
side_pixels
%
self
.
_patch_size
==
0
return
side_pixels
,
side_pixels
def
max_num_tokens_available
(
self
,
text_prompt_length
:
int
)
->
int
:
return
self
.
_max_model_len
-
text_prompt_length
-
4
def
_images_to_pixel_values_lst
(
self
,
text_prompt_length
:
int
,
images
:
list
[
Image
.
Image
],
)
->
tuple
[
list
[
torch
.
Tensor
],
list
[
int
]]:
num_tokens_available
=
self
.
max_num_tokens_available
(
text_prompt_length
)
params_per_image
=
self
.
compute_params
(
images
,
num_tokens_available
)
feature_sizes
=
[]
images
=
[]
for
param
in
params_per_image
:
for
t
in
self
.
apply_params
(
param
):
assert
t
.
ndim
==
3
,
f
"
{
t
.
ndim
=
}
: expected 3 dim tensor"
images
.
append
(
t
)
feature_sizes
.
append
(
param
.
num_embeddings
)
return
images
,
feature_sizes
feature_size_cache
:
dict
[
Image
.
Image
,
int
]
=
{}
@
classmethod
def
get_cached_feature_size
(
cls
,
image
:
Image
.
Image
)
->
int
:
feature_size
=
cls
.
feature_size_cache
[
id
(
image
)]
# hard assert that we only use the feature size once
del
cls
.
feature_size_cache
[
id
(
image
)]
return
feature_size
@
dataclass
class
DynamicResolutionParams
:
media
:
Image
.
Image
num_tiles
:
int
num_embeddings
:
int
patch_size
:
tuple
[
int
,
int
]
def
apply_params
(
self
,
params
:
DynamicResolutionParams
)
->
list
[
torch
.
Tensor
]:
target_size
=
(
params
.
patch_size
[
1
]
*
self
.
_patch_size
,
params
.
patch_size
[
0
]
*
self
.
_patch_size
,
)
image
=
np
.
asarray
(
params
.
media
.
convert
(
"RGB"
)
if
params
.
media
.
mode
!=
"RGB"
else
params
.
media
,
dtype
=
np
.
uint8
,
)
resized_img
=
(
torch
.
nn
.
functional
.
interpolate
(
torch
.
from_numpy
(
image
).
unsqueeze
(
0
).
permute
(
0
,
3
,
1
,
2
),
size
=
target_size
,
mode
=
"bicubic"
,
align_corners
=
False
,
antialias
=
True
,
)
/
255.0
)
return
list
(
resized_img
)
def
process_media
(
self
,
media
:
Image
.
Image
,
num_tokens_available
:
int
,
)
->
tuple
[
DynamicResolutionParams
,
int
]:
"""Process a single media item and return its parameters.
Args:
media: The media item to process
num_tokens_available: Number of tokens available for this media
Returns:
DynamicResolutionParams for the media
"""
current_num_tokens_available
=
num_tokens_available
assert
isinstance
(
media
,
Image
.
Image
),
(
"Dynamic resolution is only supported for image media"
)
orig_width
,
orig_height
=
media
.
width
,
media
.
height
closest_patch_height
=
round
(
orig_height
/
self
.
_patch_size
+
0.5
)
closest_patch_width
=
round
(
orig_width
/
self
.
_patch_size
+
0.5
)
patches
=
closest_patch_height
*
closest_patch_width
factor
=
min
(
math
.
sqrt
(
current_num_tokens_available
/
patches
),
self
.
_factor_max
)
target_patch_height
=
math
.
floor
(
factor
*
closest_patch_height
)
target_patch_width
=
math
.
floor
(
factor
*
closest_patch_width
)
# Consider self._min_num_patches if > current_num_tokens_available.
if
(
current_num_tokens_available
>
self
.
_min_num_patches
and
target_patch_height
*
target_patch_width
<
self
.
_min_num_patches
):
up_factor
=
math
.
sqrt
(
self
.
_min_num_patches
/
(
target_patch_height
*
target_patch_width
)
)
target_patch_height
=
math
.
ceil
(
up_factor
*
target_patch_height
)
target_patch_width
=
math
.
ceil
(
up_factor
*
target_patch_width
)
# Round patch grid to be divisible by 2 (pixel-shuffle OR conv-merging)
# or by 4 when BOTH are enabled (two successive 2x reductions)
if
self
.
PIXEL_SHUFFLE
or
self
.
CONV_MERGING
:
required_divisor
=
4
if
(
self
.
PIXEL_SHUFFLE
and
self
.
CONV_MERGING
)
else
2
rem_h
=
target_patch_height
%
required_divisor
if
rem_h
!=
0
:
inc_h
=
required_divisor
-
rem_h
if
(
target_patch_height
+
inc_h
)
*
target_patch_width
<=
current_num_tokens_available
:
target_patch_height
+=
inc_h
else
:
target_patch_height
=
max
(
required_divisor
,
target_patch_height
-
rem_h
)
rem_w
=
target_patch_width
%
required_divisor
if
rem_w
!=
0
:
inc_w
=
required_divisor
-
rem_w
if
(
target_patch_height
*
(
target_patch_width
+
inc_w
)
<=
current_num_tokens_available
):
target_patch_width
+=
inc_w
else
:
target_patch_width
=
max
(
required_divisor
,
target_patch_width
-
rem_w
)
# Calculate embeddings for the main dynamic resolution image
num_embeddings
=
self
.
_get_num_embeddings
(
target_patch_width
*
self
.
_patch_size
,
target_patch_height
*
self
.
_patch_size
,
)
token_count
=
target_patch_width
*
target_patch_height
# Add thumbnail embeddings if enabled and image area is below threshold
num_tiles
=
1
# Base dynamic resolution image
return
self
.
DynamicResolutionParams
(
media
=
media
,
num_tiles
=
num_tiles
,
num_embeddings
=
num_embeddings
,
patch_size
=
(
target_patch_width
,
target_patch_height
),
),
token_count
def
compute_params
(
self
,
media_list
:
list
[
Image
.
Image
],
num_tokens_available
:
int
,
)
->
list
[
DynamicResolutionParams
]:
"""Compute parameters for all media with iterative token budgeting.
Args:
media_list: List of media items to process
num_tokens_available: Total number of tokens available across all media
Returns:
List of ImageTilingParams for each media item
"""
num_tokens_available
=
(
num_tokens_available
*
(
4
if
self
.
PIXEL_SHUFFLE
else
1
)
*
(
4
if
self
.
CONV_MERGING
else
1
)
)
# When the number of available token is too small,
# allow self._min_num_patches per media and let the sample be truncated.
num_tokens_available
=
max
(
num_tokens_available
,
self
.
_min_num_patches
*
len
(
media_list
)
)
# Clip the number of tokens available per media to >min and <max patches.
num_tokens_available_per_media
=
[
int
(
max
(
min
(
num_tokens_available
,
self
.
_max_num_patches
),
self
.
_min_num_patches
,
)
)
for
_
in
range
(
len
(
media_list
))
]
# prevent infinite loop in any case
for
_
in
range
(
10
):
# Step 1: Process each media with current token budget
params
=
[]
token_counts
=
[]
for
media
,
tokens_for_media
in
zip
(
media_list
,
num_tokens_available_per_media
):
param
,
token_count
=
self
.
process_media
(
media
,
tokens_for_media
)
params
.
append
(
param
)
token_counts
.
append
(
token_count
)
self
.
feature_size_cache
[
id
(
param
.
media
)]
=
param
.
num_embeddings
# Step 2: Check if total tokens is within budget
total_tokens
=
sum
(
token_counts
)
if
total_tokens
<=
num_tokens_available
:
# We're within budget, return the params
return
params
# Step 3: We're over budget, need to scale down
# Calculate scaling factor to get under budget
scaling_factor
=
num_tokens_available
/
total_tokens
# Recalculate token budgets for each media based on scaling
# Each media gets a proportional share of the total budget
scaled_down_num_tokens_available_per_media
=
[
max
(
self
.
_min_num_patches
,
int
(
token_count
*
scaling_factor
))
for
token_count
in
token_counts
]
scaled_down
=
any
(
[
scaled_down_num_tokens_available_per_media
[
i
]
<
num_tokens_available_per_media
[
i
]
for
i
in
range
(
len
(
num_tokens_available_per_media
))
]
)
# If there wasn't scaling down, we're stuck with min_num_patches per media,
# else try with the scaled down num_tokens_available_per_media.
if
not
scaled_down
:
num_tokens_available_per_media
=
[
self
.
_min_num_patches
]
*
len
(
media_list
)
else
:
num_tokens_available_per_media
=
(
scaled_down_num_tokens_available_per_media
)
ctx
=
f
"
{
params
=
}
{
total_tokens
=
}
{
num_tokens_available
=
}
"
raise
ValueError
(
f
"Should be unreachable - `return params` above must be reached:
{
ctx
}
"
)
@
staticmethod
def
stack
(
images
:
list
[
torch
.
Tensor
],
patch_size
:
int
)
->
torch
.
Tensor
:
assert
len
(
images
)
>
0
,
"No images to stack"
def
rearrange_img
(
x
):
py
=
x
.
shape
[
-
2
]
//
patch_size
px
=
x
.
shape
[
-
1
]
//
patch_size
x
=
einops
.
rearrange
(
x
,
"c (py yy) (px xx) -> (py px) (c yy xx)"
,
py
=
py
,
yy
=
patch_size
,
px
=
px
,
xx
=
patch_size
,
)
return
x
imgs
=
[
rearrange_img
(
img
)
for
img
in
images
]
pixel_values_flat
=
torch
.
cat
(
imgs
,
dim
=
0
).
unsqueeze
(
0
)
return
pixel_values_flat
class
BaseNanoNemotronVLProcessor
(
ABC
):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
def
__init__
(
self
,
config
:
PretrainedConfig
,
tokenizer
:
TokenizerLike
,
*
args
,
max_model_len
:
int
,
max_num_tiles
:
int
|
None
=
None
,
**
kwargs
,
)
->
None
:
super
().
__init__
()
self
.
config
=
config
self
.
tokenizer
=
tokenizer
self
.
max_num_tiles
=
max_num_tiles
or
DEFAULT_NUM_TILES
image_size
:
int
=
config
.
force_image_size
patch_size
:
int
=
config
.
patch_size
downsample_ratio
:
int
=
config
.
downsample_ratio
self
.
num_image_token
=
int
(
(
image_size
//
patch_size
)
**
2
*
(
downsample_ratio
**
2
)
)
self
.
image_size
=
image_size
self
.
use_thumbnail
:
bool
=
config
.
use_thumbnail
self
.
norm_mean
=
torch
.
Tensor
(
config
.
norm_mean
).
reshape
(
1
,
3
,
1
,
1
)
self
.
norm_std
=
torch
.
Tensor
(
config
.
norm_std
).
reshape
(
1
,
3
,
1
,
1
)
self
.
dynamic_tiler
:
DynamicResolutionImageTiler
|
None
=
None
if
self
.
use_dynamic_resolution
(
config
):
self
.
dynamic_tiler
=
DynamicResolutionImageTiler
(
max_model_len
=
max_model_len
,
patch_size
=
patch_size
,
downsample_ratio
=
downsample_ratio
,
min_num_patches
=
config
.
vision_config
.
args
[
"min_num_patches"
],
max_num_patches
=
config
.
vision_config
.
args
[
"max_num_patches"
],
norm_mean
=
config
.
norm_mean
,
norm_std
=
config
.
norm_std
,
)
@
staticmethod
def
use_dynamic_resolution
(
config
:
PretrainedConfig
)
->
bool
:
return
"min_num_patches"
in
config
.
vision_config
.
args
@
property
@
abstractmethod
def
image_token_id
(
self
)
->
int
:
raise
NotImplementedError
@
abstractmethod
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
int
|
None
,
)
->
PromptUpdateDetails
[
str
]:
raise
NotImplementedError
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
max_num_tiles
:
int
,
)
->
int
:
target_ratios
=
get_internvl_target_ratios
(
1
,
max_num_tiles
)
num_patches
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
image_width
,
orig_height
=
image_height
,
target_ratios
=
target_ratios
,
image_size
=
self
.
image_size
,
use_thumbnail
=
self
.
use_thumbnail
,
)
return
num_patches
*
self
.
num_image_token
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
max_num_tiles
:
int
,
)
->
list
[
torch
.
Tensor
]:
return
[
image_to_pixel_values
(
image
,
input_size
=
self
.
image_size
,
max_num
=
max_num_tiles
,
use_thumbnail
=
self
.
use_thumbnail
,
idx
=
idx
,
)
for
idx
,
image
in
enumerate
(
images
)
]
def
_preprocess_image
(
self
,
text
:
list
[
str
],
images
:
list
[
Image
.
Image
],
max_num_tiles
:
int
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
Any
]]:
if
len
(
images
)
==
0
:
return
text
,
{}
image_inputs
:
dict
[
str
,
Any
]
if
tiler
:
=
self
.
dynamic_tiler
:
sans_images
=
text
[
0
].
replace
(
"<image>"
,
""
)
text_prompt_length
=
len
(
self
.
tokenizer
(
sans_images
,
add_special_tokens
=
False
).
input_ids
)
pixel_values_lst
,
num_tokens_per_image
=
tiler
.
_images_to_pixel_values_lst
(
text_prompt_length
=
text_prompt_length
,
images
=
images
,
)
imgs_sizes
=
[(
pv
.
shape
[
-
2
],
pv
.
shape
[
-
1
])
for
pv
in
pixel_values_lst
]
normalized
=
[
input_conditioner
(
img
,
tiler
.
norm_mean
,
tiler
.
norm_std
)
for
img
in
pixel_values_lst
]
image_num_patches
=
torch
.
tensor
([
1
]
*
len
(
num_tokens_per_image
))
image_inputs
=
{
"pixel_values_flat"
:
normalized
,
"imgs_sizes"
:
imgs_sizes
,
"num_tokens_per_image"
:
num_tokens_per_image
,
}
else
:
pixel_values_lst
=
self
.
_images_to_pixel_values_lst
(
images
,
max_num_tiles
)
image_num_patches
=
torch
.
tensor
([
len
(
item
)
for
item
in
pixel_values_lst
])
pixel_values_flat
=
input_conditioner
(
torch
.
cat
(
pixel_values_lst
),
self
.
norm_mean
,
self
.
norm_std
)
image_inputs
=
{
"pixel_values_flat"
:
pixel_values_flat
,
"image_num_patches"
:
image_num_patches
,
}
num_tokens_per_image
=
[
self
.
num_image_token
*
len
(
item
)
for
item
in
pixel_values_lst
]
assert
len
(
text
)
==
1
,
(
"hf_processor is called on the output of get_dummy_text, "
"which should be a single string"
)
parts
=
[
x
for
x
in
re
.
split
(
r
"(<image>)"
,
text
[
0
])
if
x
]
assert
parts
.
count
(
"<image>"
)
==
len
(
pixel_values_lst
),
(
"the number of <image> tokens in the text should be the "
"same as the number of images"
)
for
i
,
(
feature_size
,
num_patches
)
in
enumerate
(
zip
(
num_tokens_per_image
,
image_num_patches
,
strict
=
True
)
):
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
parts
[
i
]
=
parts
[
i
].
replace
(
"<image>"
,
image_repl
.
full
)
text
=
[
""
.
join
(
parts
)]
return
text
,
image_inputs
def
_make_batch_input
(
self
,
input_item
:
_T
|
list
[
_T
]
|
None
=
None
)
->
list
[
_T
]:
if
input_item
is
None
:
input_item
=
[]
if
not
isinstance
(
input_item
,
list
):
input_item
=
[
input_item
]
return
input_item
@
abstractmethod
def
__call__
(
self
,
text
:
str
|
list
[
str
]
|
None
=
None
,
images
:
Image
.
Image
|
list
[
Image
.
Image
]
|
None
=
None
,
*
,
return_tensors
:
str
|
TensorType
|
None
=
None
,
max_num_tiles
:
int
|
None
=
None
,
**
kwargs
,
)
->
BatchFeature
:
raise
NotImplementedError
class
NanoNemotronVLProcessor
(
BaseNanoNemotronVLProcessor
):
"""
HF Processor with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    *,
    max_model_len: int,
    max_num_tiles: int | None = None,
    video_token: str | None = None,
    video_pruning_rate: float | None = None,
) -> None:
    """Set up the image processor plus the video and audio extras.

    Args:
        config: Model config; an optional ``sound_config`` attribute enables
            audio support.
        tokenizer: Tokenizer; also used to pre-encode the image markers.
        max_model_len: Forwarded to the base class.
        max_num_tiles: Forwarded to the base class.
        video_token: Token that marks video content, or ``None``.
        video_pruning_rate: Pruning rate for video tokens, or ``None``.
    """
    super().__init__(
        config=config,
        tokenizer=tokenizer,
        max_model_len=max_model_len,
        max_num_tiles=max_num_tiles,
    )
    # add extra video token for video processing
    self.video_token = video_token
    self.video_pruning_rate = video_pruning_rate

    # The audio extractor only exists when the config carries a sound config.
    sound_config = getattr(config, "sound_config", None)
    self.audio_extractor: ParakeetExtractor | None = (
        ParakeetExtractor(sound_config) if sound_config is not None else None
    )

    # Pre-tokenize special tokens for video processing
    # to avoid repeated tokenization
    def _encode(marker: str):
        return tokenizer.encode(marker, add_special_tokens=False)

    self._img_start_token_ids = _encode(IMG_START)
    self._img_end_token_ids = _encode(IMG_END)
    self._img_context_token_ids = _encode(IMG_CONTEXT)
@
property
def
supports_video
(
self
)
->
bool
:
return
self
.
video_token_id
is
not
None
@
property
def
video_token_id
(
self
)
->
int
|
None
:
if
self
.
video_token
is
None
:
return
None
return
self
.
tokenizer
.
get_vocab
().
get
(
self
.
video_token
,
None
)
@property
def image_token_id(self) -> int:
    """Id the tokenizer assigns to ``IMG_CONTEXT``."""
    tokenizer = self.tokenizer
    return tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
def
_videos_to_pixel_values_lst
(
self
,
videos
:
list
[
npt
.
NDArray
],
max_num_tiles
:
int
,
)
->
list
[
torch
.
Tensor
]:
return
[
video_to_pixel_values
(
video
,
input_size
=
self
.
image_size
,
max_num_tiles
=
max_num_tiles
,
use_thumbnail
=
self
.
use_thumbnail
,
)
for
video
in
videos
]
def
_preprocess_video
(
self
,
text
:
list
[
str
],
videos
:
list
[
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]],
max_num_tiles
:
int
,
)
->
tuple
[
list
[
str
],
dict
[
str
,
Any
]]:
if
len
(
videos
)
==
0
or
not
self
.
supports_video
:
return
text
,
{}
videos_lst
=
[
v
[
0
]
for
v
in
videos
]
video_metadata_lst
=
[
v
[
1
]
for
v
in
videos
]
pixel_values_lst_video
=
self
.
_videos_to_pixel_values_lst
(
videos_lst
,
max_num_tiles
=
max_num_tiles
,
)
# We use frame duration in milliseconds (as integer) to ensure
# we have consistent timestamps calculation. At preprocessing
# fps parameter is given in fp32, while at inference it is bf16
# which leads to inaccurate timestamp calculation and causes
# timestamp values to differ.In rare cases this causes
# mismatching number of output tokens for tokenized frame prefixes
frame_duration_ms_lst
=
[
int
(
1000.0
/
metadata
[
"fps"
])
for
metadata
in
video_metadata_lst
]
frames_indices_lst
=
[
metadata
[
"frames_indices"
]
for
metadata
in
video_metadata_lst
]
video_num_patches
=
torch
.
tensor
([
len
(
item
)
for
item
in
pixel_values_lst_video
])
video_inputs
=
{
"pixel_values_flat_video"
:
input_conditioner
(
torch
.
cat
(
pixel_values_lst_video
),
self
.
norm_mean
,
self
.
norm_std
),
"video_num_patches"
:
video_num_patches
,
"frames_indices"
:
frames_indices_lst
,
"frame_duration_ms"
:
torch
.
tensor
(
frame_duration_ms_lst
),
}
image_size
:
int
=
self
.
config
.
force_image_size
patch_size
:
int
=
self
.
config
.
patch_size
downsample_ratio
=
self
.
config
.
downsample_ratio
tokens_in_single_frame
=
int
(
(
image_size
*
image_size
//
patch_size
**
2
)
*
(
downsample_ratio
**
2
)
)
for
pixel_values
,
video_metadata
,
frames_indices
,
frame_duration_ms
in
zip
(
pixel_values_lst_video
,
video_metadata_lst
,
frames_indices_lst
,
frame_duration_ms_lst
,
):
num_frames
=
pixel_values
.
shape
[
0
]
if
self
.
video_pruning_rate
is
not
None
and
self
.
video_pruning_rate
>
0.0
:
# Start of EVS-specific code
num_tokens
=
compute_retained_tokens_count
(
tokens_per_frame
=
tokens_in_single_frame
,
num_frames
=
num_frames
,
q
=
self
.
video_pruning_rate
,
)
# Here we just need placeholders that won't actually be replaced -
# we just need to make sure the total number of tokens is correct
# assign all tokens to the first frame
tokens_per_frame
=
[
num_tokens
]
+
[
0
]
*
(
num_frames
-
1
)
# End of EVS-specific code
else
:
tokens_per_frame
=
[
tokens_in_single_frame
]
*
num_frames
video_repl
=
self
.
get_video_repl
(
tokens_per_frame
=
tokens_per_frame
,
frames_indices
=
frames_indices
,
frame_duration_ms
=
frame_duration_ms
,
tokenizer
=
self
.
tokenizer
,
img_start_token_ids
=
self
.
_img_start_token_ids
,
img_end_token_ids
=
self
.
_img_end_token_ids
,
img_context_token_ids
=
self
.
_img_context_token_ids
,
)
# video_repl.full is a list of token IDs
# Convert token IDs back to text for the HF processor flow
video_repl_text
=
self
.
tokenizer
.
decode
(
video_repl
.
full
,
skip_special_tokens
=
False
)
text
=
[
t
.
replace
(
"<video>"
,
video_repl_text
,
1
)
for
t
in
text
]
return
text
,
video_inputs
def
_preprocess_audio
(
self
,
text
:
list
[
str
],
audios
:
list
[
npt
.
NDArray
],
)
->
tuple
[
list
[
str
],
dict
[
str
,
Any
]]:
if
len
(
audios
)
==
0
:
return
text
,
{}
assert
self
.
audio_extractor
is
not
None
extractor
=
self
.
audio_extractor
parts
=
[
x
for
x
in
re
.
split
(
f
"(
{
re
.
escape
(
AUDIO_CONTEXT
)
}
)"
,
text
[
0
])
if
x
]
token_count
=
parts
.
count
(
AUDIO_CONTEXT
)
if
token_count
!=
len
(
audios
):
raise
ValueError
(
"Number of audio tokens in text does not match the number "
f
"of audios (tokens=
{
token_count
}
, audios=
{
len
(
audios
)
}
)."
)
audio_index
=
0
for
idx
,
part
in
enumerate
(
parts
):
if
part
==
AUDIO_CONTEXT
:
audio_repl
=
self
.
get_audio_repl
(
audios
[
audio_index
])
parts
[
idx
]
=
audio_repl
.
full
audio_index
+=
1
text
=
[
""
.
join
(
parts
)]
audio_inputs
=
extractor
(
audios
,
sampling_rate
=
extractor
.
sampling_rate
,
return_tensors
=
"pt"
,
)
input_audio_features
=
audio_inputs
.
input_features
feature_attention_mask
=
audio_inputs
.
attention_mask
audio_feature_lengths
=
feature_attention_mask
.
sum
(
dim
=
1
)
audio_inputs
=
{
"input_audio_features"
:
input_audio_features
,
"feature_attention_mask"
:
feature_attention_mask
,
"audio_feature_lengths"
:
audio_feature_lengths
,
}
return
text
,
audio_inputs
def __call__(
    self,
    text: str | list[str] | None = None,
    images: Image.Image | list[Image.Image] | None = None,
    videos: tuple[npt.NDArray, dict[str, Any]]
    | list[tuple[npt.NDArray, dict[str, Any]]]
    | None = None,
    audios: AudioItem | list[AudioItem] | None = None,
    *,
    return_tensors: str | TensorType | None = None,
    max_num_tiles: int | None = None,
    **kwargs,
) -> BatchFeature:
    """Expand all media placeholders, tokenize, and bundle the inputs.

    Media are processed in the order images, videos, audios; videos always
    use a tile limit of 1. The prompts are tokenized without special tokens.
    ``**kwargs`` is accepted but not used.

    Args:
        text: One prompt or a list of prompts.
        images: One image or a list of images.
        videos: One ``(frames, metadata)`` pair or a list of them.
        audios: One audio item or a list of them.
        return_tensors: Passed to ``BatchFeature`` as ``tensor_type``.
        max_num_tiles: Tile limit for images; ``None`` selects
            ``self.max_num_tiles``.
    """
    # Use default if not provided
    tile_limit = self.max_num_tiles if max_num_tiles is None else max_num_tiles

    prompts = self._make_batch_input(text)
    image_list = self._make_batch_input(images)
    video_list = self._make_batch_input(videos)
    audio_list = self._make_batch_input(audios)

    prompts, image_inputs = self._preprocess_image(
        text=prompts,
        images=image_list,
        max_num_tiles=tile_limit,
    )
    prompts, video_inputs = self._preprocess_video(
        text=prompts,
        videos=video_list,
        max_num_tiles=1,
    )
    prompts, audio_inputs = self._preprocess_audio(
        text=prompts,
        audios=audio_list,
    )

    text_inputs = self.tokenizer(prompts, add_special_tokens=False)
    combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}

    if self.dynamic_tiler is None:
        return BatchFeature(
            {**combined_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    batch = BatchFeature(combined_inputs, tensor_type=return_tensors)
    # allow images to be exempt from the BatchFeature validation:
    # We will .stack() them in _parse_and_validate_image_input
    batch.update(image_inputs)
    return batch
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the text that replaces one image placeholder.

    The result is ``feature_size`` copies of ``IMG_CONTEXT`` wrapped in
    ``IMG_START`` / ``IMG_END``, with ``IMG_CONTEXT`` passed as the text to
    select. ``num_patches`` is not used here.
    """
    context_run = IMG_CONTEXT * feature_size
    wrapped = "".join((IMG_START, context_run, IMG_END))
    return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)
def get_audio_repl(
    self,
    audio: npt.NDArray,
) -> PromptUpdateDetails[str]:
    """Build the text that replaces one audio placeholder.

    The number of ``AUDIO_CONTEXT`` copies comes from the extractor's
    ``audio_token_count`` for ``len(audio)``; the run is wrapped in
    ``AUDIO_START`` / ``AUDIO_END``. Requires an audio extractor to be set.
    """
    extractor = self.audio_extractor
    assert extractor is not None
    num_tokens = extractor.audio_token_count(len(audio))
    context_run = AUDIO_CONTEXT * num_tokens
    wrapped = f"{AUDIO_START}{context_run}{AUDIO_END}"
    return PromptUpdateDetails.select_text(wrapped, AUDIO_CONTEXT)
@classmethod
def get_video_repl(
    cls,
    *,
    tokens_per_frame: list[int],
    frames_indices: list[int],
    frame_duration_ms: int,
    tokenizer: TokenizerLike,
    img_start_token_ids: list[int],
    img_end_token_ids: list[int],
    img_context_token_ids: list[int],
) -> PromptUpdateDetails[list[int]]:
    """
    Assemble the token-id sequence that stands in for one video.

    The sequence returned here only reserves the right number of token
    positions; it is not what finally replaces the placeholder tokens.
    The real replacement happens in embed_multimodal of
    NemotronH_Nano_VL_V2
    (specifically in _process_video_input -> _create_final_video_embeddings),
    where the final embeddings are built from text embeddings for the
    indicator tokens and video embeddings for the video tokens.

    A single function covers the non-EVS, EVS-dummy and EVS-real cases; they
    differ only in what is passed as tokens_per_frame:
    - non EVS: the same constant value for every frame
    - EVS dummy: how tokens are split between frames is irrelevant, only the
      total number of tokens has to be correct
    - EVS real (called from get_real_video_repl_for_evs): a different value
      per frame

    Args:
        tokens_per_frame (list[int]): number of tokens per frame
        frames_indices (list[int]): frame indices
        frame_duration_ms (int): duration of each frame in milliseconds
        tokenizer (TokenizerLike): tokenizer used for the frame separators
        img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
        img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
        img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
    """
    # TODO: Add support of frame_duration_ms to be None
    # At preprocessing step we should allow absent / metadata without
    # frames_indices field.
    if frame_duration_ms is not None:
        timestamps = calculate_timestamps(frames_indices, frame_duration_ms)
        assert len(timestamps) == len(tokens_per_frame), (
            "timestamps and tokens_per_frame must have the same length"
        )
        labels = [
            f"Frame {number} sampled at {seconds:.2f} seconds: "
            for number, seconds in enumerate(timestamps, start=1)
        ]
    else:
        labels = [
            f"Frame {number}: "
            for number in range(1, len(tokens_per_frame) + 1)
        ]

    # Every label is tokenized on its own, and every component below is
    # appended as separate pre-tokenized ids, so the tokenizer never gets a
    # chance to merge tokens across component boundaries. The result is
    # therefore independent of the per-frame token counts.
    label_token_ids = [_seq2tokens(tokenizer, label) for label in labels]

    video_token_ids = []
    for position, frame_token_count in enumerate(tokens_per_frame):
        video_token_ids += label_token_ids[position]
        video_token_ids += img_start_token_ids
        video_token_ids += img_context_token_ids * frame_token_count
        video_token_ids += img_end_token_ids

    return PromptUpdateDetails.from_seq(video_token_ids)
vllm/transformers_utils/processors/nemotron_parse.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
from
typing
import
TypeVar
import
numpy
as
np
import
torch
from
PIL
import
Image
from
timm.data.constants
import
OPENAI_CLIP_MEAN
,
OPENAI_CLIP_STD
from
torchvision
import
transforms
as
T
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.tokenizers
import
TokenizerLike
_T
=
TypeVar
(
"_T"
)
DEFAULT_FINAL_IMAGE_SIZE
=
(
2048
,
1648
)
class NemotronParseImageProcessor:
    """
    NemotronParse Image Processor

    For every input image: shrink it (keeping the aspect ratio) so it fits
    inside ``final_size``, pad it with white up to ``final_size``, turn it
    into a tensor with ``T.ToTensor`` and normalize it with the OpenAI CLIP
    mean/std.
    """

    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        """
        Args:
            final_size: Target ``(height, width)``. A list/tuple with at
                least two entries uses the first two; a single number is
                used for both sides; anything else falls back to
                ``DEFAULT_FINAL_IMAGE_SIZE``.
            **kwargs: Accepted and ignored.

        Raises:
            ImportError: If ``albumentations`` is not installed.
        """
        # Ensure final_size is properly formatted
        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            self.final_size = (int(final_size), int(final_size))
        else:
            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback

        # Shaped (1, 3, 1, 1) so they broadcast over a (N, 3, H, W) batch.
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        # Create transforms
        self._create_transforms()

    def _create_transforms(self):
        """Create transform objects.

        Sets ``target_height`` / ``target_width`` from ``final_size`` and
        builds the albumentations padding transform plus the torchvision
        tensor conversion.
        """
        try:
            import albumentations as A
        except ImportError as err:
            raise ImportError(
                "The package `albumentations` is required to use "
                "NemotronParse model. Please install it with `pip install "
                "albumentations`."
            ) from err

        # Ensure final_size is a tuple of integers
        if isinstance(self.final_size, (list, tuple)):
            self.target_height, self.target_width = (
                int(self.final_size[0]),
                int(self.final_size[1]),
            )
        else:
            self.target_height = self.target_width = int(self.final_size)

        import cv2

        self.transform = A.Compose(
            [
                A.PadIfNeeded(
                    min_height=self.target_height,
                    min_width=self.target_width,
                    border_mode=cv2.BORDER_CONSTANT,
                    fill=[255, 255, 255],
                    p=1.0,
                ),
            ]
        )

        self.torch_transform = T.Compose(
            [
                T.ToTensor(),
            ]
        )

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Resize image maintaining aspect ratio (replica of the original
        LongestMaxSizeHW).

        The image is only ever scaled down: first so its height fits
        ``target_height``, then so its width fits ``target_width``.
        """
        height, width = image.shape[:2]
        max_size_height = self.target_height
        max_size_width = self.target_width

        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
        aspect_ratio = width / height
        new_height = height
        new_width = width

        # If height too big then scale image down
        if height > max_size_height:
            new_height = max_size_height
            # int() truncates; clamp so a very tall, thin image never ends
            # up with a zero-pixel side, which cv2.resize rejects.
            new_width = max(1, int(new_height * aspect_ratio))

        # If width too big, scale image down further
        if new_width > max_size_width:
            new_width = max_size_width
            # Same clamp for very wide, flat images.
            new_height = max(1, int(new_width / aspect_ratio))

        # Use cv2.INTER_LINEAR like the original
        import cv2

        return cv2.resize(
            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
        )

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad image to target size with white padding.

        Padding is added at the bottom and on the right only.

        NOTE(review): albumentations' ``PadIfNeeded`` appears to default to
        centered padding, so this fallback may not place the image where the
        main path does -- confirm before treating the two as interchangeable.
        """
        h, w = image.shape[:2]
        min_height, min_width = self.target_height, self.target_width

        # Only pad if image is smaller than target
        pad_h = max(0, min_height - h)
        pad_w = max(0, min_width - w)

        if pad_h == 0 and pad_w == 0:
            return image

        if len(image.shape) == 3:
            # Color image - pad bottom and right with white (255, 255, 255)
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w), (0, 0)),
                mode="constant",
                constant_values=255,
            )
        else:
            # Grayscale image - pad with white (255)
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w)),
                mode="constant",
                constant_values=255,
            )

        return padded

    def preprocess(
        self,
        images: Image.Image | list[Image.Image],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s). PIL images are converted to numpy
                arrays; anything else is passed through as-is and is
                expected to already be an array.
            **kwargs: Accepted and ignored.

        Returns:
            ``{"pixel_values": tensor}`` holding the stacked, normalized
            images.
        """
        # Ensure images is a list
        if not isinstance(images, list):
            images = [images]

        # Convert PIL images to numpy arrays if needed
        processed_images = []
        for image in images:
            if isinstance(image, Image.Image):
                # RGB and single-channel "L" images are handled below as-is.
                # Any other mode (RGBA, P, CMYK, LA, 1, ...) is converted to
                # RGB first: e.g. a 4-channel RGBA array cannot be
                # broadcast against the 3-channel mean/std when normalizing.
                if image.mode not in ("RGB", "L"):
                    image = image.convert("RGB")
                image = np.asarray(image)
            processed_images.append(image)

        # Apply NemotronParse-specific transforms
        pixel_values = []
        for image in processed_images:
            # Manual resize with aspect ratio preservation
            # (replaces LongestMaxSizeHW)
            processed_image = self._resize_with_aspect_ratio(image)

            # Apply remaining albumentations transforms if available.
            # _create_transforms always sets self.transform (or raises), so
            # the fallback only runs if an instance clears it afterwards.
            if self.transform is not None:
                transformed = self.transform(image=processed_image)
                processed_image = transformed["image"]
            else:
                # Fallback: just pad to target size
                processed_image = self._pad_to_size(processed_image)

            # Convert to tensor
            pixel_values_tensor = self.torch_transform(processed_image)

            # Handle grayscale images: repeat the single channel three times
            if pixel_values_tensor.shape[0] == 1:
                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)

            pixel_values.append(pixel_values_tensor)

        # Stack into batch
        pixel_values = torch.stack(pixel_values)

        # Normalize pixel values
        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
        return {"pixel_values": normalized_values}

    def __call__(
        self, images: Image.Image | list[Image.Image], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(images, **kwargs)
class NemotronParseProcessor:
    """
    NemotronParse Processor

    Pairs a tokenizer with a NemotronParseImageProcessor sized from
    ``config.image_size`` and returns both outputs in one BatchFeature.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        """Store config and tokenizer and build the image processor.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = NemotronParseImageProcessor(
            final_size=config.image_size,
        )

    def _make_batch_input(
        self,
        input_item: _T | list[_T] | None = None,
    ) -> list[_T]:
        """Normalize an optional single item or list into a list.

        ``None`` becomes an empty list, a list is returned unchanged, and
        any other value is wrapped in a one-element list.
        """
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one BatchFeature.

        Extra keyword arguments are forwarded to the tokenizer, which is
        always called with ``add_special_tokens=False``. The image
        processor is skipped when no images are given.
        """
        text_batch = self._make_batch_input(text)
        image_batch = self._make_batch_input(images)

        if len(image_batch) == 0:
            image_inputs = {}
        else:
            image_inputs = self.image_processor(image_batch)

        text_inputs = self.tokenizer(
            text_batch,
            add_special_tokens=False,
            **kwargs,
        )

        # Image entries are merged last, so they win on a key clash.
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
vllm/transformers_utils/processors/nemotron_vl.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
transformers.image_processing_utils_fast
import
BaseImageProcessorFast
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
from
.internvl
import
InternVLProcessor
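# Configure PIL to accept arbitrarily large images.
# NOTE: this is a process-wide setting. Setting the limit to None disables
# both DecompressionBombWarning and DecompressionBombError for every user of
# PIL in this process, not only for this module.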
Image
.
MAX_IMAGE_PIXELS
=
None
# Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000 # ~300M pixels
def build_transform(input_size: int):
    """Return the torchvision pipeline applied to every image tile.

    The pipeline converts the image to RGB via ``convert_image_mode``,
    resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation and converts it with ``T.ToTensor``. No normalization
    step is included.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    to_square = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    return T.Compose([to_rgb, to_square, T.ToTensor()])
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tile grid that best matches an image.

    Every candidate ``(cols, rows)`` gets a score that is the product of
    - how much of the image area the grid covers, capped at 0.6, and
    - how close the grid's aspect ratio is to ``aspect_ratio`` (1.0 is a
      perfect match).
    The first candidate with the highest score wins; ``(1, 1)`` is returned
    when ``target_ratios`` is empty.
    """
    image_area = width * height
    winner = (1, 1)
    winning_score = float("-inf")

    for cols, rows in target_ratios:
        grid_aspect = cols / rows
        coverage = min(
            (cols * rows * image_size * image_size) / image_area,
            0.6,
        )
        similarity = min(
            grid_aspect / aspect_ratio,
            aspect_ratio / grid_aspect,
        )
        score = coverage * similarity
        # Strict comparison: on a tie the earlier candidate is kept.
        if score > winning_score:
            winning_score, winner = score, (cols, rows)

    return winner
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Work out how an image will be tiled.

    Returns ``(blocks, target_width, target_height)``: the number of tiles
    (plus one for the thumbnail when ``use_thumbnail`` is set and there is
    more than one tile) and the pixel size the image is resized to before
    being cut into ``image_size`` squares.
    """
    # Choose the tile grid for this image.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid[0], grid[1]

    resized_width = image_size * cols
    resized_height = image_size * rows

    tile_count = cols * rows
    # A thumbnail is only added when the image is actually split.
    if use_thumbnail and tile_count != 1:
        tile_count += 1

    return tile_count, resized_width, resized_height
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into ``image_size`` x ``image_size`` tiles.

    The image is resized to the grid chosen by
    ``calculate_nemotron_vl_targets`` and cropped row by row, left to right.
    When ``use_thumbnail`` is set and more than one tile was produced, a
    copy of the whole image resized to a single tile is appended last.
    """
    source_width, source_height = image.size

    # The thumbnail is handled below, so it is excluded from this count.
    tile_count, resized_width, resized_height = calculate_nemotron_vl_targets(
        orig_width=source_width,
        orig_height=source_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((resized_width, resized_height))
    tiles_per_row = resized_width // image_size

    tiles = []
    for tile_index in range(tile_count):
        row, col = divmod(tile_index, tiles_per_row)
        left = col * image_size
        upper = row * image_size
        crop_box = (left, upper, left + image_size, upper + image_size)
        tiles.append(resized.crop(crop_box))

    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid whose tile count lies in a range.

    A grid qualifies when ``min_num <= i * j <= max_num``. The result is
    ordered by tile count, smallest first.
    """
    grids = set()
    # Same enumeration order as a nested comprehension over n, i, j.
    for side_limit in range(min_num, max_num + 1):
        for i in range(1, side_limit + 1):
            for j in range(1, side_limit + 1):
                if min_num <= i * j <= max_num:
                    grids.add((i, j))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor.

    ``transform`` is applied to every tile; when it is ``None`` the default
    from ``build_transform(input_size=input_size)`` is used.
    """
    candidate_grids = get_nemotron_vl_target_ratios(min_num, max_num)

    tile_transform = transform
    if tile_transform is None:
        tile_transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    tile_tensors = [tile_transform(tile) for tile in tiles]
    return torch.stack(tile_tensors)
class NemotronVLProcessor(InternVLProcessor):
    """Processor for Nemotron VL models, built on InternVLProcessor.

    Uses the Nemotron tiling helpers of this module and ``<image>`` as the
    image context token.
    """

    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast | None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """
        Args:
            config: Model config; ``force_image_size``, ``patch_size`` and
                ``downsample_ratio`` are read from it.
            tokenizer: Tokenizer used for the text part of the prompt.
            image_processor: Source of the ``max_num_tiles`` and
                ``use_thumbnail`` defaults. May be ``None``, in which case
                the defaults are read from ``config`` instead.
            min_dynamic_patch: Minimum number of tiles; defaults to 1.
            max_dynamic_patch: Maximum number of tiles; defaults to
                ``image_processor.max_num_tiles``, or to
                ``config.max_dynamic_patch`` (1 if absent) when there is no
                image processor.
            dynamic_image_size: Whether tiling is enabled; defaults to True.
        """
        # NOTE(review): ABC.__init__ is called directly instead of
        # super().__init__(), presumably to skip the parent initializer --
        # confirm against InternVLProcessor.
        ABC.__init__(self)

        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        image_size: int = config.force_image_size
        patch_size: int = config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            if image_processor is not None:
                max_dynamic_patch = image_processor.max_num_tiles
            else:
                # No image processor to ask: use the config value instead
                # of failing on ``None.max_num_tiles``.
                max_dynamic_patch = getattr(config, "max_dynamic_patch", 1)
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)

        # Tokens per tile: patches per side squared, scaled by the squared
        # downsample ratio.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``IMG_CONTEXT``."""
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        """Return the per-tile transform; subclasses may override it."""
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return how many image tokens an image of this size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Tile and transform each image; returns one tensor per image."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens.

        For each entry of ``pixel_values_lst`` the first remaining
        ``<image>`` in every prompt is replaced.
        """
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just
            # inserted
            NVL_IMAGE_CONTEXT = image_repl.full.replace(
                "<image>", "<NVL_IMG_CONTEXT>"
            )
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        # Swap the temporary placeholder back to the real context token.
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Expand image placeholders in ``text`` and build the image inputs.

        Returns the updated prompts together with a dict that is empty when
        there are no images and otherwise holds ``pixel_values_flat`` (all
        tiles concatenated) and ``image_num_patches`` (tiles per image).
        """
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        ``IMG_CONTEXT`` repeated ``feature_size`` times, wrapped in
        ``IMG_START`` / ``IMG_END``. ``num_patches`` is not used here.
        """
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END

        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
# SigLIP normalization constants
SIGLIP_MEAN
=
(
0.5
,
0.5
,
0.5
)
SIGLIP_STD
=
(
0.5
,
0.5
,
0.5
)
def build_siglip_transform(input_size: int):
    """Build the image transform used with the SigLIP vision encoder.

    This is the base transform from ``build_transform`` followed by a
    ``T.Normalize`` step using ``SIGLIP_MEAN`` and ``SIGLIP_STD``.
    """
    base_steps = build_transform(input_size=input_size)
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base_steps, siglip_normalize])
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for the LlamaNemotronVL embedding model.

    A NemotronVLProcessor specialised for embedding tasks:
    - the tile transform adds SigLIP normalization to the base transform
    - the image context token is <IMG_CONTEXT> rather than <image>
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Resolve the tiling settings and initialise the parent.

        Each setting left as ``None`` is looked up in ``processor_config``
        first, then on ``config``, then falls back to a built-in default.
        The parent is always initialised without an image processor.
        """

        def pick(explicit, processor_key, config_attr, fallback):
            # An explicit argument always wins over both config sources.
            if explicit is not None:
                return explicit
            return processor_config.get(
                processor_key,
                getattr(config, config_attr, fallback),
            )

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=pick(
                min_dynamic_patch, "min_input_tiles", "min_dynamic_patch", 1
            ),
            max_dynamic_patch=pick(
                max_dynamic_patch, "max_input_tiles", "max_dynamic_patch", 1
            ),
            dynamic_image_size=pick(
                dynamic_image_size,
                "dynamic_image_size",
                "dynamic_image_size",
                True,
            ),
        )

    def _get_transform(self) -> T.Compose:
        """Return the tile transform with SigLIP normalization added."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand <image> placeholders directly, without a temporary marker.

        The inserted text is made of <IMG_CONTEXT> tokens, so it can never
        be mistaken for a remaining <image> placeholder.
        """
        prompts = text
        for tiles in pixel_values_lst:
            tile_count = tiles.shape[0]
            replacement = self.get_image_repl(
                tile_count * self.num_image_token,
                tile_count,
            )
            # Consume the first remaining placeholder in every prompt.
            prompts = [
                prompt.replace("<image>", replacement.full, 1)
                for prompt in prompts
            ]
        return prompts
vllm/transformers_utils/processors/nvlm_d.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
# --------------------------------------------------------
# NVLM-D
# Copyright (c) 2024 NVIDIA
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
.internvl
import
BaseInternVLProcessor
IMG_PAD
=
"<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style processor that emits NVLM-D tile-tagged image text."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the ``IMG_PAD`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image.

        Each tile contributes a position tag followed by
        ``feature_size // num_patches`` copies of ``IMG_PAD``. Tags are
        ``<tile_1>`` up to ``<tile_{num_patches - 1}>``, plus
        ``<tile_global_thumbnail>`` when ``use_thumbnail`` is set.

        Raises:
            NotImplementedError: If ``num_patches`` is ``None``.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tile_tags = [f"<tile_{index}>" for index in range(1, num_patches)]
        if self.use_thumbnail:
            tile_tags.append("<tile_global_thumbnail>")

        pads_per_tile = IMG_PAD * (feature_size // num_patches)
        tagged_tiles = "".join(tag + pads_per_tile for tag in tile_tags)

        # The start and end markers are part of the replacement because
        # "<Image><tile" is tokenized as ["<Image", "><", "tile"], which
        # would trigger an assertion error when looking for "<tile" as a
        # subsequence of "<Image><tile".
        replacement = "<Image>" + tagged_tiles + "</Image>"

        return PromptUpdateDetails.select_text(replacement, IMG_PAD)
vllm/transformers_utils/processors/skyworkr1v.py
0 → 100644
View file @
f3403243
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
torch
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.processing
import
PromptUpdateDetails
from
vllm.tokenizers
import
TokenizerLike
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
IMAGENET_MEAN
=
(
0.485
,
0.456
,
0.406
)
IMAGENET_STD
=
(
0.229
,
0.224
,
0.225
)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """Return the torchvision pipeline applied to every image tile.

    Steps: convert to RGB via ``convert_image_mode``, resize to
    ``input_size`` x ``input_size`` with bicubic interpolation, convert
    with ``T.ToTensor`` and normalize with the ImageNet mean/std.
    """
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the tile grid whose aspect ratio is nearest to the image's.

    Candidates are scanned in order. A strictly smaller aspect-ratio
    difference always wins. On an exact tie the later candidate replaces
    the current one only when the image area exceeds half of the area that
    candidate's grid would cover. ``(1, 1)`` is returned when
    ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
        elif gap == smallest_gap and image_area > (
            0.5 * image_size * image_size * candidate[0] * candidate[1]
        ):
            # Equally good shape: take this grid only if the image is
            # large enough for it.
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective minimum and maximum number of tiles.

    Without dynamic image sizing both bounds are 1. Otherwise the given
    bounds are used, and the maximum grows by one for the thumbnail when
    ``use_thumbnail`` is set and the maximum is not 1.
    """
    if not dynamic_image_size:
        # Both bounds collapse to 1, so no thumbnail slot is added either.
        return 1, 1

    upper = max_dynamic_patch
    if use_thumbnail and upper != 1:
        upper += 1

    return min_dynamic_patch, upper
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """List every ``(i, j)`` grid whose tile count lies in a range.

    A grid qualifies when ``min_num <= i * j <= max_num``. The result is
    ordered by tile count, smallest first.
    """
    grids = set()
    # Same enumeration order as a nested comprehension over n, i, j.
    for side_limit in range(min_num, max_num + 1):
        for i in range(1, side_limit + 1):
            for j in range(1, side_limit + 1):
                if min_num <= i * j <= max_num:
                    grids.add((i, j))

    return sorted(grids, key=lambda grid: grid[0] * grid[1])
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Work out how an image will be tiled.

    Returns ``(blocks, target_width, target_height)``: the number of tiles
    (plus one for the thumbnail when ``use_thumbnail`` is set and there is
    more than one tile) and the pixel size the image is resized to before
    being cut into ``image_size`` squares.
    """
    # Choose the tile grid for this image.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = grid[0], grid[1]

    resized_width = image_size * cols
    resized_height = image_size * rows

    tile_count = cols * rows
    # A thumbnail is only added when the image is actually split.
    if use_thumbnail and tile_count != 1:
        tile_count += 1

    return tile_count, resized_width, resized_height
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into ``image_size`` x ``image_size`` tiles.

    The image is resized to the grid chosen by
    ``calculate_skyworkr1v_targets`` and cropped row by row, left to right.
    When ``use_thumbnail`` is set and more than one tile was produced, a
    copy of the whole image resized to a single tile is appended last.
    """
    source_width, source_height = image.size

    # The thumbnail is handled below, so it is excluded from this count.
    tile_count, resized_width, resized_height = calculate_skyworkr1v_targets(
        orig_width=source_width,
        orig_height=source_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((resized_width, resized_height))
    tiles_per_row = resized_width // image_size

    tiles = []
    for tile_index in range(tile_count):
        row, col = divmod(tile_index, tiles_per_row)
        left = col * image_size
        upper = row * image_size
        crop_box = (left, upper, left + image_size, upper + image_size)
        tiles.append(resized.crop(crop_box))

    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and stack the transformed tiles into one tensor.

    Every tile goes through ``build_transform(input_size=input_size)``.
    """
    candidate_grids = get_skyworkr1v_target_ratios(min_num, max_num)
    tile_transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=candidate_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    tile_tensors = [tile_transform(tile) for tile in tiles]
    return torch.stack(tile_tensors)
class SkyworkR1VProcessor:
    """
    Hand-written processor for Skywork-R1V, since the model does not
    provide an HF processor of its own.

    The image-token insertion logic follows:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Store the config/tokenizer and resolve the tiling defaults.

        Any of ``min_dynamic_patch``, ``max_dynamic_patch`` and
        ``dynamic_image_size`` left as ``None`` is taken from the attribute
        of the same name on ``config``.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Explicit arguments win; otherwise fall back to the model config.
        min_dynamic_patch = (
            config.min_dynamic_patch
            if min_dynamic_patch is None
            else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch
            if max_dynamic_patch is None
            else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Number of IMG_CONTEXT tokens contributed by a single tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * config.downsample_ratio**2
        )

        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the ``IMG_CONTEXT`` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the replacement text for one image placeholder.

        The text is ``IMG_START``, then ``feature_size`` copies of
        ``IMG_CONTEXT``, then ``IMG_END``; ``IMG_CONTEXT`` is the text passed
        to ``PromptUpdateDetails.select_text`` as the selected part.
        ``num_patches`` is accepted but not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = IMG_START + context_run + IMG_END
        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve per-call overrides and delegate to
        ``resolve_skyworkr1v_min_max_num``.

        Every argument left as ``None`` falls back to the value stored on
        this processor.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the target ratios for the resolved min/max bounds.

        The bounds come from ``resolve_min_max_num`` and are passed on to
        ``get_skyworkr1v_target_ratios``.
        """
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Return how many image tokens an image of the given size yields.

        This is the patch count reported by ``calculate_skyworkr1v_targets``
        multiplied by ``self.num_image_token``.
        """
        # The thumbnail is disabled here because it is accounted for by
        # calculate_skyworkr1v_targets below.
        ratios = self.resolve_target_ratios(use_thumbnail=False)

        tile_count, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return tile_count * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its own pixel-value tensor.

        Returns one tensor per input image, in input order, each produced by
        ``image_to_pixel_values_skyworkr1v``.
        """
        # The thumbnail is disabled when resolving the bounds because it is
        # applied inside image_to_pixel_values_skyworkr1v instead.
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        per_image: list[torch.Tensor] = []
        for img in images:
            per_image.append(
                image_to_pixel_values_skyworkr1v(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return per_image

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Expand image placeholders, tokenize, and bundle the model inputs.

        ``text`` and ``images`` may each be a single item, a list, or
        ``None`` (treated as empty). For every image, the first remaining
        ``"<image>"`` in each prompt is replaced by that image's expanded
        token text. When images are given, the result also carries
        ``"pixel_values_flat"`` (all per-image tensors concatenated) and
        ``"image_num_patches"`` (the length of each per-image tensor).
        """
        prompts = [] if text is None else text
        if not isinstance(prompts, list):
            prompts = [prompts]

        image_list = [] if images is None else images
        if not isinstance(image_list, list):
            image_list = [image_list]

        image_inputs = {}
        if len(image_list) != 0:
            pixel_values_lst = self._images_to_pixel_values_lst(
                image_list,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    list(map(len, pixel_values_lst))
                ),
            }

            for tiles in pixel_values_lst:
                tile_count = tiles.shape[0]
                repl = self.get_image_repl(
                    tile_count * self.num_image_token,
                    tile_count,
                )
                # Only the first remaining placeholder of each prompt is
                # consumed by this image.
                prompts = [
                    prompt.replace("<image>", repl.full, 1)
                    for prompt in prompts
                ]

        text_inputs = self.tokenizer(prompts)

        # Image entries are merged last, so they win on any key collision.
        merged = {**text_inputs, **image_inputs}
        return BatchFeature(merged, tensor_type=return_tensors)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment