Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
657855ab
Unverified
Commit
657855ab
authored
Mar 19, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 19, 2026
Browse files
[Misc] Cleanup more configs and processors (#37560)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
e27b8ba3
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
985 additions
and
4 deletions
+985
-4
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+1
-1
vllm/transformers_utils/configs/speculators/__init__.py
vllm/transformers_utils/configs/speculators/__init__.py
+3
-0
vllm/transformers_utils/configs/speculators/base.py
vllm/transformers_utils/configs/speculators/base.py
+0
-3
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/__init__.py
+6
-0
vllm/transformers_utils/processors/isaac.py
vllm/transformers_utils/processors/isaac.py
+461
-0
vllm/transformers_utils/processors/kimi_k25.py
vllm/transformers_utils/processors/kimi_k25.py
+73
-0
vllm/transformers_utils/processors/step3_vl.py
vllm/transformers_utils/processors/step3_vl.py
+441
-0
No files found.
vllm/transformers_utils/configs/__init__.py
View file @
657855ab
...
...
@@ -55,7 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"OvisConfig"
:
"vllm.transformers_utils.configs.ovis"
,
"PixelShuffleSiglip2VisionConfig"
:
"vllm.transformers_utils.configs.isaac"
,
"RadioConfig"
:
"vllm.transformers_utils.configs.radio"
,
"SpeculatorsConfig"
:
"vllm.transformers_utils.configs.speculators
.base
"
,
"SpeculatorsConfig"
:
"vllm.transformers_utils.configs.speculators"
,
"UltravoxConfig"
:
"vllm.transformers_utils.configs.ultravox"
,
"Step3VLConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3VisionEncoderConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
...
...
vllm/transformers_utils/configs/speculators/__init__.py
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
.base
import
SpeculatorsConfig
# Public API of the ``speculators`` config package: ``SpeculatorsConfig`` is
# imported from ``.base`` above and re-exported here so it can be resolved
# from the package path itself.
__all__ = ["SpeculatorsConfig"]
vllm/transformers_utils/configs/speculators/base.py
View file @
657855ab
...
...
@@ -8,9 +8,6 @@ from transformers import PretrainedConfig
from
vllm.transformers_utils.configs.speculators.algos
import
(
SUPPORTED_SPECULATORS_TYPES
,
)
__all__
=
[
"SpeculatorsConfig"
]
from
vllm.transformers_utils.utils
import
without_trust_remote_code
...
...
vllm/transformers_utils/processors/__init__.py
View file @
657855ab
...
...
@@ -21,7 +21,9 @@ __all__ = [
"HunYuanVLProcessor"
,
"HunYuanVLImageProcessor"
,
"InternVLProcessor"
,
"IsaacProcessor"
,
"KimiAudioProcessor"
,
"KimiK25Processor"
,
"MistralCommonPixtralProcessor"
,
"MistralCommonVoxtralProcessor"
,
"NanoNemotronVLProcessor"
,
...
...
@@ -32,6 +34,7 @@ __all__ = [
"Ovis2_5Processor"
,
"QwenVLProcessor"
,
"Qwen3ASRProcessor"
,
"Step3VLProcessor"
,
]
_CLASS_TO_MODULE
:
dict
[
str
,
str
]
=
{
...
...
@@ -45,7 +48,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
"HunYuanVLProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl"
,
"HunYuanVLImageProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl_image"
,
"InternVLProcessor"
:
"vllm.transformers_utils.processors.internvl"
,
"IsaacProcessor"
:
"vllm.transformers_utils.processors.isaac"
,
"KimiAudioProcessor"
:
"vllm.transformers_utils.processors.kimi_audio"
,
"KimiK25Processor"
:
"vllm.transformers_utils.processors.kimi_k25"
,
"MistralCommonPixtralProcessor"
:
"vllm.transformers_utils.processors.pixtral"
,
"MistralCommonVoxtralProcessor"
:
"vllm.transformers_utils.processors.voxtral"
,
"NanoNemotronVLProcessor"
:
"vllm.transformers_utils.processors.nano_nemotron_vl"
,
...
...
@@ -56,6 +61,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor"
:
"vllm.transformers_utils.processors.ovis2_5"
,
"QwenVLProcessor"
:
"vllm.transformers_utils.processors.qwen_vl"
,
"Qwen3ASRProcessor"
:
"vllm.transformers_utils.processors.qwen3_asr"
,
"Step3VLProcessor"
:
"vllm.transformers_utils.processors.step3_vl"
,
}
...
...
vllm/transformers_utils/processors/isaac.py
0 → 100644
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
math
from
typing
import
Any
import
numpy
as
np
import
PIL.Image
import
torch
import
torch.nn.functional
as
F
from
transformers
import
BatchFeature
,
ProcessorMixin
,
TensorType
from
typing_extensions
import
TypedDict
,
Unpack
# Largest accepted image area (width * height, in pixels); larger images are
# rejected by `extract_image_pil`.
MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px

# Vision preprocessing constants
# Per-channel mean and standard deviation used for normalization, and the
# scalar applied to pixel values before that normalization.
VISION_MEAN = (0.5, 0.5, 0.5)
VISION_STD = (0.5, 0.5, 0.5)
VISION_SCALE = 1 / 255
def
_make_writeable
(
arr
:
np
.
ndarray
)
->
np
.
ndarray
:
"""Return *arr* itself if it is already writeable, otherwise try to flip the
write flag in-place and finally fall back to `arr.copy()`.
This guarantees the buffer handed to `torch.from_numpy()` is always
writeable, silencing the PyTorch warning about undefined behaviour.
"""
if
arr
.
flags
.
writeable
:
return
arr
# First, try the cheap path — in-place flag toggle (works for mmap'd arrays
# and some shared memory buffers):
try
:
arr
.
setflags
(
write
=
True
)
return
arr
# success: no data copy
except
ValueError
:
# Buffer is inherently read-only (e.g. backed by PyAV / PIL): make copy
return
arr
.
copy
()
def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
    """Convert a PIL image into a tensor built from its RGB pixel array.

    Args:
        image: Source image; any mode other than ``"RGB"`` is converted to
            RGB first.

    Returns:
        The tensor produced by `torch.from_numpy` on the image's NumPy array
        (made writeable beforehand via `_make_writeable`).

    Raises:
        ValueError: If ``width * height`` exceeds `MAX_PIXELS`.
    """
    if image.width * image.height > MAX_PIXELS:
        raise ValueError(
            f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
        )

    if image.mode != "RGB":
        image = image.convert("RGB")

    # torch.from_numpy needs a writeable buffer to stay warning-free.
    pixels = _make_writeable(np.asarray(image))
    return torch.from_numpy(pixels)
def
get_image_size_for_max_num_patches
(
image_height
:
int
,
image_width
:
int
,
patch_size
:
int
,
max_num_patches
:
int
,
min_num_patches
:
int
|
None
=
None
,
eps
:
float
=
1e-5
,
pixel_shuffle_scale
:
int
=
1
,
)
->
tuple
[
int
,
int
]:
r
"""Compute a target resolution whose patch grid satisfies patching parametrization.
Args:
image_height (`int`):
Height in pixels of the source image prior to any resizing.
image_width (`int`):
Width in pixels of the source image prior to any resizing.
patch_size (`int`):
Size of the square patch used by the vision encoder.
max_num_patches (`int`):
Upper bound on `(height / patch_size) * (width / patch_size)` after
resizing.
min_num_patches (`int`, *optional*):
Lower bound on the number of patches. When provided the image will
be scaled up if necessary.
eps (`float`, *optional*, defaults to 1e-5):
Convergence tolerance for the internal binary search to determine
the target dimensions.
pixel_shuffle_scale (`int`, *optional*, defaults to 1):
Additional stride multiplier applied when pixel shuffle later
reduces spatial resolution.
Returns:
`tuple[int, int]`: Height and width (in pixels) that are multiples of
`patch_size * pixel_shuffle_scale` and respect both the maximum and
optional minimum patch-count constraints.
"""
def
get_scaled_image_size
(
scale
,
original_size
,
patch_size
,
pixel_shuffle_scale
):
scaled_size
=
scale
*
original_size
divisor
=
patch_size
*
pixel_shuffle_scale
scaled_size
=
math
.
ceil
(
scaled_size
/
divisor
)
*
divisor
scaled_size
=
max
(
divisor
,
scaled_size
)
return
int
(
scaled_size
)
# Ensure divisibility
divisor
=
patch_size
*
pixel_shuffle_scale
adjusted_height
=
math
.
ceil
(
image_height
/
divisor
)
*
divisor
adjusted_height
=
max
(
divisor
,
adjusted_height
)
adjusted_width
=
math
.
ceil
(
image_width
/
divisor
)
*
divisor
adjusted_width
=
max
(
divisor
,
adjusted_width
)
num_patches
=
(
adjusted_height
/
patch_size
)
*
(
adjusted_width
/
patch_size
)
if
min_num_patches
is
not
None
and
num_patches
<
min_num_patches
:
# Scale up
scale_min
,
scale_max
=
1.0
,
100.0
while
(
scale_max
-
scale_min
)
>=
eps
:
scale
=
(
scale_min
+
scale_max
)
/
2
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
num_patches
=
(
target_height
/
patch_size
)
*
(
target_width
/
patch_size
)
if
num_patches
>=
min_num_patches
:
scale_max
=
scale
else
:
scale_min
=
scale
scale
=
scale_max
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
return
target_height
,
target_width
elif
num_patches
<=
max_num_patches
:
return
adjusted_height
,
adjusted_width
else
:
# Scale down
scale_min
,
scale_max
=
eps
/
10
,
1.0
while
(
scale_max
-
scale_min
)
>=
eps
:
scale
=
(
scale_min
+
scale_max
)
/
2
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
num_patches
=
(
target_height
/
patch_size
)
*
(
target_width
/
patch_size
)
if
num_patches
<=
max_num_patches
:
scale_min
=
scale
else
:
scale_max
=
scale
scale
=
scale_min
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
return
target_height
,
target_width
# Normalization statistics precomputed once at import time as float32 tensors.
# They are viewed as (1, 1, 1, C) so they broadcast against tensors whose last
# dimension is the channel dimension.
_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
def prepare_image_tensor(
    image: torch.Tensor,
    scale: float = VISION_SCALE,
) -> torch.Tensor:
    """Rescale and normalize RGB pixel data ahead of patch extraction.

    Args:
        image (`torch.Tensor`):
            Tensor whose last dimension holds the RGB channels, i.e. shape
            `(..., height, width, 3)`.  Non-floating-point input is converted
            with `.float()`.
        scale (`float`, *optional*, defaults to `VISION_SCALE`):
            Factor multiplied into the pixel values before normalization.

    Returns:
        `torch.Tensor`: `(image * scale - mean) / std`, a floating-point
        tensor with the same shape as the input.
    """
    pixels = image if torch.is_floating_point(image) else image.float()

    # The precomputed statistics are moved to the input's device on demand.
    target_device = pixels.device
    centered = pixels * scale - _MEAN_TENSOR.to(target_device)
    return centered / _STD_TENSOR.to(target_device)
def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor:
    """Cut channel-last images into a grid of flattened square patches.

    Args:
        image (`torch.Tensor`):
            Tensor of shape `(num_images, height, width, channels)`.
        patch_size (`int`):
            Edge length of the square patches.

    Returns:
        `torch.Tensor`:
            Tensor of shape `(num_images, height // patch_size,
            width // patch_size, channels * patch_size * patch_size)`; each
            grid cell stores its patch's pixels flattened in
            (row, column, channel) order.

    Raises:
        ValueError: If `height` or `width` is not divisible by `patch_size`.
    """
    batch, rows, cols, depth = image.shape
    if rows % patch_size != 0 or cols % patch_size != 0:
        raise ValueError(
            "Dimensions of images "
            f"{image.shape} are not divisible by patch_size={patch_size}."
        )

    grid_rows = rows // patch_size
    grid_cols = cols // patch_size

    # Split each spatial axis into (grid index, offset inside the patch) ...
    tiles = image.reshape(
        batch, grid_rows, patch_size, grid_cols, patch_size, depth
    )
    # ... bring the two grid indices next to each other ...
    tiles = tiles.permute(0, 1, 3, 2, 4, 5)
    # ... and flatten everything belonging to a single patch.
    return tiles.reshape(
        batch, grid_rows, grid_cols, depth * patch_size * patch_size
    )
def process_vision_for_patches(
    images: torch.Tensor,
    patch_size: int,
    max_num_patches: int,
    min_num_patches: int | None = None,
    pixel_shuffle_scale: int = 1,
) -> tuple[torch.Tensor, list[int]]:
    """Resize, normalize and patchify RGB images for the vision encoder.

    Args:
        images (`torch.Tensor`):
            Channel-last pixel data, either `(height, width, channels)` for a
            single image or `(num_images, height, width, channels)`.
        patch_size (`int`):
            Edge length of the square patches.
        max_num_patches (`int`):
            Maximum number of patches allowed after resizing.
        min_num_patches (`int`, *optional*):
            Minimum number of patches; forwarded to the size computation,
            which scales the image up when the bound is not met.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Pixel shuffle factor; constrains the target size and divides the
            grid reported in `dims_virtual`.

    Returns:
        `tuple[torch.Tensor, list[int]]`: `(patches, dims_virtual)`.
        `patches` is the output of `patchify_vision` on the resized and
        normalized images; `dims_virtual` is `[1, h_patches, w_patches]`
        with both grid sides floor-divided by `pixel_shuffle_scale` when
        that factor is not 1.
    """
    # A lone image gets a leading batch dimension.
    batch = images.unsqueeze(0) if images.dim() == 3 else images

    # F.interpolate expects channel-first input.
    channel_first = batch.permute(0, 3, 1, 2)
    _, _, source_height, source_width = channel_first.shape

    resize_to = get_image_size_for_max_num_patches(
        source_height,
        source_width,
        patch_size,
        max_num_patches,
        min_num_patches=min_num_patches,
        pixel_shuffle_scale=pixel_shuffle_scale,
    )
    target_height, target_width = resize_to

    resized = F.interpolate(
        channel_first,
        size=(target_height, target_width),
        mode="bilinear",
        align_corners=False,
    )

    # Back to channel-last, then normalize and cut into patches.
    channel_last = resized.permute(0, 2, 3, 1)
    patches = patchify_vision(
        prepare_image_tensor(channel_last), patch_size=patch_size
    )

    _, h_patches, w_patches, _ = patches.shape
    if pixel_shuffle_scale == 1:
        dims_virtual = [1, h_patches, w_patches]
    else:
        dims_virtual = [
            1,
            h_patches // pixel_shuffle_scale,
            w_patches // pixel_shuffle_scale,
        ]

    return patches, dims_virtual
class IsaacImageProcessorKwargs(TypedDict, total=False):
    # Typed keyword arguments for `IsaacImageProcessor.preprocess` (used there
    # through `Unpack[...]`).  `total=False` makes every key optional.

    # Edge length of the square vision patches.
    patch_size: int
    # Upper bound on the number of patches per image.
    max_num_patches: int
    # Lower bound on the number of patches per image.
    min_num_patches: int
    # Pixel shuffle factor applied to the patch grid.
    pixel_shuffle_scale: int
class IsaacImageProcessor:
    """Turn PIL images into flattened patch tensors plus their patch grids.

    The class attributes are the defaults used when the configuration dict
    handed to `__init__` does not override them.
    """

    patch_size = 16
    max_num_patches = 6144
    min_num_patches = 256
    pixel_shuffle_scale = 2

    valid_kwargs = IsaacImageProcessorKwargs
    model_input_names = ["pixel_values", "image_grid_thw"]

    def __init__(self, kwargs):
        """Read the processor settings out of *kwargs*.

        Args:
            kwargs: Mapping of settings.  The keys `patch_size`,
                `vision_max_num_patches`, `vision_min_num_patches` and
                `pixel_shuffle_scale` are popped from it (the mapping is
                modified in place); missing keys fall back to the class
                defaults.
        """
        self.patch_size = kwargs.pop("patch_size", self.patch_size)
        self.vision_max_num_patches = kwargs.pop(
            "vision_max_num_patches", self.max_num_patches
        )
        self.vision_min_num_patches = kwargs.pop(
            "vision_min_num_patches", self.min_num_patches
        )
        # Fall back to the class-level default like the other settings do,
        # instead of repeating the literal value here.
        self.pixel_shuffle_scale = kwargs.pop(
            "pixel_shuffle_scale", self.pixel_shuffle_scale
        )

    def preprocess(
        self,
        images: list[PIL.Image.Image],
        return_tensors: str | TensorType | None = None,
        **kwargs: Unpack[IsaacImageProcessorKwargs],
    ) -> BatchFeature:
        """Preprocess images into format compatible with vLLM input processing.

        Args:
            images: PIL images; each one is converted with
                `extract_image_pil` and processed on its own.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.
                Defaults to `None` so that callers which only forward
                arbitrary keyword arguments need not supply it.
            **kwargs: Accepted for interface compatibility; the values
                configured in `__init__` are the ones actually used.

        Returns:
            `BatchFeature` with `pixel_values` (all images' patches
            concatenated along dim 0, one row per patch) and
            `image_grid_thw` (one `[1, h_patches, w_patches]` row per image).
        """
        all_pixel_values: list[torch.Tensor] = []
        all_image_grids: list[torch.Tensor] = []

        for image in images:
            image_tensor = extract_image_pil(image)

            # The second return value (pixel-shuffled "virtual" dims) is not
            # needed: the grid reported below uses the real patch grid.
            patches, _ = process_vision_for_patches(
                image_tensor,
                patch_size=self.patch_size,
                max_num_patches=self.vision_max_num_patches,
                min_num_patches=self.vision_min_num_patches,
                pixel_shuffle_scale=self.pixel_shuffle_scale,
            )

            # patches is [1, Hp, Wp, D] for the single image processed here.
            hp, wp, dim = patches.shape[-3:]
            pixel_values = patches.reshape(hp * wp, dim)  # [N_tokens, D]

            # Use real patch dimensions for image_grid_thw, not virtual
            # dimensions, so the vision model receives correct grid info for
            # pixel shuffle.  The leading 1 is the dummy temporal dimension
            # Isaac packs for still images.
            image_grid_thw = torch.tensor([[1, hp, wp]])

            all_pixel_values.append(pixel_values)
            all_image_grids.append(image_grid_thw)

        if all_pixel_values:
            final_pixel_values = torch.cat(all_pixel_values, dim=0)
            final_image_grids = torch.cat(all_image_grids, dim=0)
        else:
            # No images: return correctly ranked but empty tensors.
            final_pixel_values = torch.empty(0, 0)
            final_image_grids = torch.empty(0, 3)

        return BatchFeature(
            data={
                "pixel_values": final_pixel_values,
                "image_grid_thw": final_image_grids,
            },
            tensor_type=return_tensors,
        )
class IsaacProcessor(ProcessorMixin):
    """Combine an Isaac image processor and a tokenizer into one processor.

    Each occurrence of `image_token` in the text is expanded into as many
    `<|image_pad|>` tokens as the corresponding image occupies after pixel
    shuffling.
    """

    attributes = ["image_processor", "tokenizer"]

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Store the components.

        Args:
            image_processor: Object exposing `preprocess(images, **kwargs)`
                and a `pixel_shuffle_scale` attribute.
            tokenizer: Callable tokenizer that also provides
                `apply_chat_template`.
            **kwargs: `image_token` (default `"<image>"`) is the text marker
                standing for one image; other keys are ignored.
        """
        self.image_token = kwargs.pop("image_token", "<image>")
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
        """Preprocess images and/or tokenize text.

        Args:
            text: A string or list of strings, or `None` to skip tokenizing.
            images: Images for `image_processor.preprocess`, or `None`.
            **kwargs: Forwarded unchanged to both the image processor and the
                tokenizer.

        Returns:
            `BatchFeature` merging the image processor outputs and the
            tokenizer outputs.

        Raises:
            ValueError: If the text contains `image_token` but no images were
                supplied.
        """
        result = {}
        image_grid_thw = None

        if images is not None:
            image_inputs = self.image_processor.preprocess(images, **kwargs)
            image_grid_thw = image_inputs["image_grid_thw"]
            result.update(image_inputs)

        if text is not None:
            # Work on a private list so the caller's list is left untouched.
            prompts = list(text) if isinstance(text, list) else [text]

            merge_length = self.image_processor.pixel_shuffle_scale**2
            image_index = 0
            for i, prompt in enumerate(prompts):
                while self.image_token in prompt:
                    if image_grid_thw is None:
                        raise ValueError(
                            f"The text contains {self.image_token!r} but no "
                            "images were provided."
                        )
                    num_image_tokens = int(
                        image_grid_thw[image_index].prod() // merge_length
                    )
                    # Expand into a temporary marker first so the inserted
                    # text can never be mistaken for another image token.
                    prompt = prompt.replace(
                        self.image_token,
                        "<|placeholder|>" * num_image_tokens,
                        1,
                    )
                    image_index += 1
                prompts[i] = prompt.replace("<|placeholder|>", "<|image_pad|>")

            result.update(self.tokenizer(prompts, **kwargs))

        return BatchFeature(result)

    def apply_chat_template(
        self,
        messages: list[dict[str, Any]],
        tokenize: bool = False,
        add_generation_prompt: bool = False,
        **kwargs,
    ) -> Any:
        """Flatten mixed-content messages and delegate to the tokenizer.

        Messages whose `content` is a list are collapsed to one string: text
        items contribute their `text`, image items contribute `image_token`,
        and items of any other type are dropped.  Only `role` (default
        `"user"`) and the flattened `content` are kept for such messages;
        all other messages are passed through unchanged.  `return_dict` is
        always forced to `False`.
        """
        processed_messages = []

        for message in messages:
            content = message.get("content")
            if not isinstance(content, list):
                # Regular text message
                processed_messages.append(message)
                continue

            # Mixed content (text + image)
            text_parts = []
            for content_item in content:
                item_type = content_item.get("type")
                if item_type == "text":
                    text_parts.append(content_item.get("text", ""))
                elif item_type == "image":
                    # Replace image with vision token
                    text_parts.append(self.image_token)

            processed_messages.append(
                {
                    "role": message.get("role", "user"),
                    "content": "".join(text_parts),
                }
            )

        kwargs["return_dict"] = False

        return self.tokenizer.apply_chat_template(
            processed_messages,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )
vllm/transformers_utils/processors/kimi_k25.py
0 → 100644
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
from
transformers
import
BatchFeature
from
transformers.processing_utils
import
ProcessorMixin
from
vllm.multimodal.inputs
import
VisionChunk
class KimiK25Processor(ProcessorMixin):
    """Pair a tokenizer with a media processor for vision chunks."""

    attributes = ["tokenizer"]
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        media_processor=None,
        tokenizer=None,
        media_token_id: int | None = None,
    ):
        super().__init__(tokenizer)
        self.media_processor = media_processor
        self.media_token_id = media_token_id
        assert self.media_token_id is not None

    # `text` is keyword-only: either ready-made token ids, or a string that is
    # encoded with the tokenizer.
    def __call__(
        self,
        vision_chunks: list[VisionChunk] | None = None,
        *,
        text: list[int] | str,
        **kwargs,
    ) -> BatchFeature:
        """
        Args:
            vision_chunks: List of VisionChunk items to be processed.
                For image: VisionChunkImage with type='image', image=PIL.Image
                For video_chunk: VisionChunkVideo with type='video_chunk',
                video_chunk=list[PIL.Image]
            text: The token ids to be fed to a model (required); a string is
                encoded with the tokenizer first.
            **kwargs: Accepted but not used.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- token ids with every media token repeated once
              per media token of its chunk, wrapped in a batch of size one.
            - Every entry returned by `media_processor.preprocess`, present
              when `vision_chunks` is not `None`.
        """
        mm_inputs = {}
        if isinstance(text, str):
            input_ids = self.tokenizer.encode(text)
        else:
            input_ids = text

        if vision_chunks is not None:
            assert isinstance(vision_chunks, list)
            mm_inputs = self.media_processor.preprocess(vision_chunks)

            calculator = self.media_processor.media_tokens_calculator
            # Reversed so the next chunk's count sits at the end of the list
            # and can be popped from there.
            remaining_counts = [calculator(chunk) for chunk in vision_chunks]
            remaining_counts.reverse()

            expanded_ids = []
            for token in input_ids:
                if token != self.media_token_id:
                    expanded_ids.append(token)
                    continue
                expanded_ids.extend(
                    [self.media_token_id] * remaining_counts.pop()
                )
            input_ids = expanded_ids

        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
        features = {"input_ids": torch.tensor([input_ids])}
        features.update(mm_inputs)
        return BatchFeature(data=features)
vllm/transformers_utils/processors/step3_vl.py
0 → 100644
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
itertools
import
product
from
math
import
ceil
import
numpy
as
np
import
torch
from
PIL
import
Image
from
torchvision
import
transforms
from
torchvision.transforms.functional
import
InterpolationMode
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
vllm.tokenizers
import
TokenizerLike
# Longest side, in pixels, an image may keep; `ImagePatcher` scales larger
# images down to this size.
MAX_IMAGE_SIZE: int = 3024

# Result of `ImagePatcher.__call__`: the processed full image, its cropped
# patches, and a per-patch flag telling whether a newline follows that patch
# (`None` when there are no patches).
ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
class Step3VisionProcessor:
    """Tensor conversion, normalization and square resize for Step3 images.

    Two pipelines are kept: one resizing to `size` for full images and one
    resizing to `patch_size` for patches.
    """

    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
        """Build both transform pipelines.

        Args:
            size: Edge length full images are resized to.
            interpolation_mode: `"bicubic"` selects bicubic resizing; any
                other value selects bilinear.
            patch_size: Edge length patches are resized to; `size` is used
                when it is `None`.
        """
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]
        patch_size = patch_size if patch_size is not None else size

        if interpolation_mode == "bicubic":
            resample = InterpolationMode.BICUBIC
        else:
            resample = InterpolationMode.BILINEAR

        def build_pipeline(edge):
            # ToTensor -> Normalize -> Resize to an (edge, edge) square.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std),
                    transforms.Resize(
                        (edge, edge),
                        interpolation=resample,
                        antialias=True,
                    ),
                ]
            )

        self.transform = build_pipeline(size)
        self.patch_transform = (
            build_pipeline(patch_size) if patch_size is not None else None
        )

    def __call__(self, image, is_patch=False):
        """Return `{"pixel_values": tensor}` with a leading batch dimension.

        The patch pipeline is applied when `is_patch` is true, the full-image
        pipeline otherwise.
        """
        if is_patch:
            assert self.patch_transform is not None
            pipeline = self.patch_transform
        else:
            pipeline = self.transform
        return {"pixel_values": pipeline(image).unsqueeze(0)}
class ImagePatcher:
    """Pad, resize and tile an image into square sliding-window patches."""

    def __init__(self, enable_patch: bool = True) -> None:
        # When False, no patches are produced regardless of the image size.
        self.enable_patch = enable_patch

    def determine_window_size(self, long: int, short: int) -> int:
        """Choose the crop window edge from the long and short image sides.

        Returns 0 when no patches should be cut.
        """
        aspect = long / short
        if long < 728:
            if aspect > 1.5:
                return short
            return 0
        if aspect > 4:
            return min(short, 504)
        return 504

    def slide_window(
        self,
        width: int,
        height: int,
        sizes: list[tuple[int, int]],
        steps: list[tuple[int, int]],
        img_rate_thr: float = 0.6,
    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
        """Enumerate sliding windows over a `width` x `height` area.

        Args:
            width: Area width.
            height: Area height.
            sizes: `(window_w, window_h)` pairs, one per pass.
            steps: `(step_w, step_h)` pairs matching `sizes`.
            img_rate_thr: Only validated to lie in [0, 1].

        Returns:
            `(windows, (x_num, y_num))`: `windows` holds `(x, y, w, h)`
            tuples in row-major order; the counts are those of the last
            size/step pair.
        """
        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"

        def axis_starts(extent, window, stride):
            # Window count along one axis plus the start offsets; the last
            # window is pulled back so it ends exactly at the border.
            if extent <= window:
                count = 1
            else:
                count = ceil((extent - window) / stride + 1)
            offsets = [stride * k for k in range(count)]
            if len(offsets) > 1 and offsets[-1] + window > extent:
                offsets[-1] = extent - window
            return count, offsets

        boxes = []
        # Sliding windows.
        for (size_w, size_h), (step_w, step_h) in zip(sizes, steps):
            x_num, x_offsets = axis_starts(width, size_w, step_w)
            y_num, y_offsets = axis_starts(height, size_h, step_h)

            corners = np.array(
                [(x, y) for y in y_offsets for x in x_offsets], dtype=int
            )
            boxes.append(
                np.concatenate([corners, corners + (size_w, size_h)], axis=1)
            )

        merged = np.concatenate(boxes, axis=0)
        windows = [
            (int(x0), int(y0), int(x1 - x0), int(y1 - y0))
            for x0, y0, x1, y1 in merged
        ]
        return windows, (x_num, y_num)

    def square_pad(self, img: Image.Image) -> Image.Image:
        """Pad `img` to a square canvas, keeping it in the top-left corner."""
        w, h = img.size
        if w == h:
            return img
        edge = max(w, h)
        canvas = Image.new(img.mode, (edge, edge), 0)
        canvas.paste(img, (0, 0))
        return canvas

    def get_image_size_for_padding(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Return the size after optional square padding.

        Padding applies only when the shorter side is below 32 and the
        aspect ratio is beyond 4:1 in either direction.
        """
        ratio = img_width / img_height
        is_thin = min(img_height, img_width) < 32
        is_elongated = ratio > 4 or ratio < 1 / 4
        if is_thin and is_elongated:
            edge = max(img_height, img_width)
            return edge, edge
        return img_width, img_height

    def get_image_size_for_preprocess(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Scale the size down so the longer side is at most MAX_IMAGE_SIZE."""
        longest = max(img_height, img_width)
        if longest > MAX_IMAGE_SIZE:
            scale_factor = MAX_IMAGE_SIZE / longest
            img_width = int(img_width * scale_factor)
            img_height = int(img_height * scale_factor)
        return img_width, img_height

    def get_image_size_for_crop(
        self, img_width: int, img_height: int, window_size: int
    ):
        """Snap each side to a multiple of `window_size` for cropping.

        A side shorter than the window is kept as is.  Otherwise it is
        rounded up to the next multiple when the fractional part of
        side / window exceeds 0.2 and rounded down when it does not.
        """

        def snap(extent):
            ratio = extent / window_size
            if ratio < 1:
                return extent
            fraction = ratio - extent // window_size
            multiples = int(ratio) + 1 if fraction > 0.2 else int(ratio)
            return window_size * multiples

        width_new = snap(img_width)
        height_new = snap(img_height)
        return int(width_new), int(height_new)

    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
        """Crop the `tw` x `th` region whose top-left is column j, row i."""
        return img.crop((j, i, j + tw, i + th))

    def get_num_patches(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Predict `(patch count, newline count)` for an image size.

        Mirrors the size computations of `__call__` without touching pixels.
        """
        img_width, img_height = self.get_image_size_for_padding(
            img_width, img_height
        )
        img_width, img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return 0, 0

        img_width, img_height = self.get_image_size_for_crop(
            img_width, img_height, window_size
        )
        window = (window_size, window_size)
        center_list, (x_num, y_num) = self.slide_window(
            img_width, img_height, [window], [window]
        )
        total = len(center_list)
        full_rows = (total - 1) // x_num + 1
        # No newline is counted after a complete final row.
        if total > 0 and total % x_num == 0:
            full_rows -= 1
        return total, full_rows

    def __call__(
        self, img: Image.Image
    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
        """Prepare `img` and cut it into patches.

        Returns:
            `(image, patches, newline_mask)`: the padded/resized image, the
            cropped patches (empty when patching is disabled or the window
            size is 0) and a per-patch flag marking row ends, except after
            the very last patch (`None` when there are no patches).
        """
        img_width, img_height = img.size
        new_img_width, new_img_height = self.get_image_size_for_padding(
            img_width, img_height
        )
        if (new_img_width, new_img_height) != (img_width, img_height):
            img = self.square_pad(img)
            img_width, img_height = img.size

        new_img_width, new_img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        img = img.resize(
            (new_img_width, new_img_height), Image.Resampling.BILINEAR
        )
        window_size = self.determine_window_size(
            max(new_img_height, new_img_width),
            min(new_img_height, new_img_width),
        )
        if window_size == 0 or not self.enable_patch:
            return img, [], None

        new_img_width, new_img_height = self.get_image_size_for_crop(
            new_img_width, new_img_height, window_size
        )
        # NOTE(review): img_width/img_height still hold the size from before
        # the preprocess resize above, so this compares against that size.
        if (new_img_width, new_img_height) != (img_width, img_height):
            img_for_crop = img.resize(
                (new_img_width, new_img_height), Image.Resampling.BILINEAR
            )
        else:
            img_for_crop = img

        window = (window_size, window_size)
        center_list, (x_num, y_num) = self.slide_window(
            new_img_width, new_img_height, [window], [window]
        )

        patches = []
        row_ends = []
        for patch_id, (x, y, patch_w, patch_h) in enumerate(center_list):
            patches.append(
                self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
            )
            if (patch_id + 1) % x_num == 0:
                row_ends.append(patch_id)

        # The very last patch never gets a trailing newline.
        if row_ends and row_ends[-1] == len(patches) - 1:
            row_ends.pop()

        if len(patches) > 0:
            newline_mask = [False] * len(patches)
            for patch_id in row_ends:
                newline_mask[patch_id] = True
        else:
            newline_mask = None

        return img, patches, newline_mask
class Step3VLProcessor:
    """Multimodal processor for Step3-VL: image patching plus tokenization.

    Each `<im_patch>` placeholder in the text is expanded into the patch
    blocks of one image followed by the block of the full image.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        self.image_size = 728
        self.patch_size = 504
        self.image_preprocessor = Step3VisionProcessor(
            self.image_size, "bilinear", self.patch_size
        )

        # Number of `<im_patch>` tokens a full image / a patch expands to.
        self.num_image_feature_size = 169
        self.num_patch_feature_size = 81
        self.image_token = "<im_patch>"
        self.image_feature_placeholder = (
            self.image_token * self.num_image_feature_size
        )
        self.patch_feature_placeholder = (
            self.image_token * self.num_patch_feature_size
        )

        # Respect vision config switch to enable/disable patch extraction.
        # For video understanding, it's preferable to disable patch.
        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
        self.patcher = ImagePatcher(enable_patch=enable_patch)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of `image_token` (looked up via `get_vocab()`)."""
        return self.tokenizer.get_vocab()[self.image_token]

    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
        """Number of tokens one image of the given size expands to.

        Every patch and the full image contribute their feature tokens plus
        two delimiter tokens each, and every patch newline adds one token.
        """
        num_patches, num_newlines = self.patcher.get_num_patches(
            img_width, img_height
        )

        return (
            num_patches * (self.num_patch_feature_size + 2)
            + self.num_image_feature_size
            + 2
            + num_newlines
        )

    def _split_images(
        self, images: list[Image.Image]
    ) -> list[ImageWithPatches]:
        """Run the patcher on every image."""
        return [self.patcher(img) for img in images]

    def _convert_images_to_pixel_values(
        self,
        images: list[Image.Image],
        is_patch: bool = False,
    ) -> list[torch.Tensor]:
        """Run the vision preprocessor on every image, returning tensors."""
        return [
            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
            for img in images
        ]

    def _get_patch_repl(
        self,
        num_patches: int,
        patch_newline_mask: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Build the replacement text and token ids for an image's patches.

        Args:
            num_patches: Number of patch blocks to emit.
            patch_newline_mask: One flag per patch; a `<patch_newline>` is
                emitted after each flagged patch.  Must hold exactly
                `num_patches` entries when `num_patches` is positive.

        Returns:
            `(text, token_ids)` describing the same token sequence.
        """
        if num_patches <= 0:
            return "", []

        assert (
            patch_newline_mask is not None
            and len(patch_newline_mask) == num_patches
        )

        # The per-patch block is identical for every patch, so the token
        # lookups (including the get_vocab() call behind image_token_id) are
        # done once instead of once per patch.
        to_id = self.tokenizer.convert_tokens_to_ids
        patch_text = f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
        patch_ids = (
            [to_id("<patch_start>")]
            + [self.image_token_id] * self.num_patch_feature_size
            + [to_id("<patch_end>")]
        )

        pieces: list[str] = []
        token_ids: list[int] = []
        newline_id = None
        for i in range(num_patches):
            pieces.append(patch_text)
            token_ids.extend(patch_ids)
            if patch_newline_mask and patch_newline_mask[i]:
                if newline_id is None:
                    # Only looked up when a newline is actually needed.
                    newline_id = to_id("<patch_newline>")
                pieces.append("<patch_newline>")
                token_ids.append(newline_id)
        return "".join(pieces), token_ids

    def _get_image_repl(
        self,
        num_images: int,
    ) -> tuple[str, list[int]]:
        """Build the full-image block, repeated `num_images` times."""
        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
        token_ids = (
            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
            + [self.image_token_id] * self.num_image_feature_size
            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
        )
        return text * num_images, token_ids * num_images

    def _get_image_repl_features(
        self,
        num_images: int,
        num_patches: int,
        patch_new_line_idx: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Concatenate the patch blocks (if any) with the full-image block."""
        if num_patches > 0:
            patch_repl, patch_repl_ids = self._get_patch_repl(
                num_patches, patch_new_line_idx
            )
        else:
            patch_repl = ""
            patch_repl_ids = []
        image_repl, image_repl_ids = self._get_image_repl(num_images)
        return patch_repl + image_repl, patch_repl_ids + image_repl_ids

    def replace_placeholder(
        self, text: str, placeholder: str, repls: list[str]
    ) -> str:
        """Replace the n-th `placeholder` in `text` with `repls[n]`.

        Raises:
            ValueError: If the number of placeholders differs from
                `len(repls)`.
        """
        parts = text.split(placeholder)

        if len(parts) - 1 != len(repls):
            raise ValueError(
                "The number of placeholders does not match the number of replacements."
            )

        result = [parts[0]]
        for repl, tail in zip(repls, parts[1:]):
            result.append(repl)
            result.append(tail)

        return "".join(result)

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize `text` and preprocess `images`.

        Args:
            text: One prompt or a list of prompts; `None` means no prompts.
            images: One image or a list of images; `None` means no images.
            return_tensors: Forwarded to `BatchFeature` as `tensor_type`.

        Returns:
            `BatchFeature` with the tokenizer outputs and, when images are
            given, `pixel_values`, `num_patches`, `patch_pixel_values` and
            `patch_newline_mask`.

        Raises:
            ValueError: If images are given and a prompt does not contain
                exactly one `<im_patch>` placeholder per image.
        """
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        if len(images) == 0:
            image_inputs = {}
            text_inputs = self.tokenizer(text)
        else:
            split_images_data = self._split_images(images)
            pixel_values_lst = []
            patch_pixel_values_lst = []
            patch_newline_mask_lst = []
            image_repl_str_lst = []
            image_repl_ids_lst = []
            num_patches = []
            for raw_img, img_patches, patch_newline_mask in split_images_data:
                pixel_values_lst.extend(
                    self._convert_images_to_pixel_values([raw_img])
                )

                if len(img_patches) > 0:
                    patch_pixel_values_lst.extend(
                        self._convert_images_to_pixel_values(
                            img_patches, is_patch=True
                        )
                    )
                num_patches.append(len(img_patches))

                image_repl_str, image_repl_ids = self._get_image_repl_features(
                    1, len(img_patches), patch_newline_mask
                )
                image_repl_str_lst.append(image_repl_str)
                image_repl_ids_lst.extend(image_repl_ids)

                if patch_newline_mask is not None:
                    patch_newline_mask_lst.extend(patch_newline_mask)

            pixel_values = torch.cat(pixel_values_lst)
            patch_size = self.patch_size
            image_inputs = {
                "pixel_values": pixel_values,
                "num_patches": num_patches,
                "patch_pixel_values": (
                    torch.cat(patch_pixel_values_lst)
                    if patch_pixel_values_lst
                    # No patches at all: keep dtype/device, zero-length batch.
                    else pixel_values.new_empty((0, 3, patch_size, patch_size))
                ),
                "patch_newline_mask": torch.tensor(
                    patch_newline_mask_lst, dtype=torch.bool
                ),
            }

            # Every prompt receives the replacements of all images.
            text = [
                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
                for t in text
            ]
            text_inputs = self.tokenizer(text)

        return BatchFeature(
            {
                **text_inputs,
                **image_inputs,
            },
            tensor_type=return_tensors,
        )
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment