Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
657855ab
Unverified
Commit
657855ab
authored
Mar 19, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 19, 2026
Browse files
[Misc] Cleanup more configs and processors (#37560)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
e27b8ba3
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
77 additions
and
1137 deletions
+77
-1137
vllm/config/speculative.py
vllm/config/speculative.py
+3
-1
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+1
-1
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+1
-1
vllm/model_executor/models/flex_olmo.py
vllm/model_executor/models/flex_olmo.py
+1
-1
vllm/model_executor/models/hyperclovax_vision.py
vllm/model_executor/models/hyperclovax_vision.py
+2
-26
vllm/model_executor/models/isaac.py
vllm/model_executor/models/isaac.py
+23
-480
vllm/model_executor/models/jais.py
vllm/model_executor/models/jais.py
+1
-1
vllm/model_executor/models/kimi_k25.py
vllm/model_executor/models/kimi_k25.py
+14
-81
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/kimi_vl.py
+1
-1
vllm/model_executor/models/lfm2_moe.py
vllm/model_executor/models/lfm2_moe.py
+1
-1
vllm/model_executor/models/lightonocr.py
vllm/model_executor/models/lightonocr.py
+4
-17
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+13
-63
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+1
-1
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-1
vllm/model_executor/models/nemotron_h_mtp.py
vllm/model_executor/models/nemotron_h_mtp.py
+1
-1
vllm/model_executor/models/olmo2.py
vllm/model_executor/models/olmo2.py
+1
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+1
-1
vllm/model_executor/models/qwen3_next_mtp.py
vllm/model_executor/models/qwen3_next_mtp.py
+1
-1
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+4
-433
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+2
-24
No files found.
vllm/config/speculative.py
View file @
657855ab
...
...
@@ -520,8 +520,10 @@ class SpeculativeConfig:
# Replace hf_config for EAGLE draft_model
if
self
.
method
in
(
"eagle"
,
"eagle3"
):
from
vllm.transformers_utils.configs
import
SpeculatorsConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
from
vllm.transformers_utils.configs.speculators
import
(
SpeculatorsConfig
,
)
if
isinstance
(
self
.
draft_model_config
.
hf_config
,
...
...
vllm/model_executor/models/chatglm.py
View file @
657855ab
...
...
@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
vllm.transformers_utils.configs
.chatglm
import
ChatGLMConfig
from
.interfaces
import
SupportsLoRA
,
SupportsPP
,
SupportsQuant
from
.utils
import
(
...
...
vllm/model_executor/models/falcon.py
View file @
657855ab
...
...
@@ -54,7 +54,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
RWConfig
from
vllm.transformers_utils.configs
.falcon
import
RWConfig
from
.interfaces
import
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/flex_olmo.py
View file @
657855ab
...
...
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.models.olmoe
import
OlmoeAttention
,
OlmoeForCausalLM
from
vllm.transformers_utils.configs
import
FlexOlmoConfig
from
vllm.transformers_utils.configs
.flex_olmo
import
FlexOlmoConfig
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/hyperclovax_vision.py
View file @
657855ab
...
...
@@ -20,7 +20,6 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -31,7 +30,6 @@ from vllm.multimodal.processing import (
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
BaseProcessingInfo
,
InputProcessingContext
,
PromptReplacement
,
PromptUpdate
,
)
...
...
@@ -336,28 +334,6 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
return
fields
def
_build_hcxvision_hf_info
(
ctx
:
InputProcessingContext
,
)
->
HCXVisionProcessingInfo
:
return
HCXVisionProcessingInfo
(
ctx
)
def
_build_hcxvision_hf_processor
(
info
:
HCXVisionProcessingInfo
,
dummy_inputs
:
BaseDummyInputsBuilder
[
HCXVisionProcessingInfo
],
*
,
cache
:
BaseMultiModalProcessorCache
|
None
=
None
,
)
->
BaseMultiModalProcessor
:
if
isinstance
(
info
,
HCXVisionProcessingInfo
):
return
HCXVisionMultiModalProcessor
(
info
,
dummy_inputs
,
# type: ignore
cache
=
cache
,
)
raise
NotImplementedError
(
type
(
info
))
def
init_vision_tower_for_hcxvision
(
vision_config
,
quant_config
:
QuantizationConfig
|
None
,
...
...
@@ -587,8 +563,8 @@ class HCXVisionCAbstractor(nn.Module):
@
MULTIMODAL_REGISTRY
.
register_processor
(
_build_hcxvision_hf_p
rocessor
,
info
=
_build_hcxvision_hf_i
nfo
,
HCXVisionMultiModalP
rocessor
,
info
=
HCXVisionProcessingI
nfo
,
dummy_inputs
=
HCXVisionDummyInputsBuilder
,
)
class
HCXVisionForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
...
...
vllm/model_executor/models/isaac.py
View file @
657855ab
...
...
@@ -2,22 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
math
from
collections.abc
import
Iterable
,
Iterator
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Any
import
numpy
as
np
import
PIL.Image
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
einops
import
rearrange
from
transformers.image_processing_utils
import
BatchFeature
from
transformers.utils
import
TensorType
from
typing_extensions
import
TypedDict
,
Unpack
from
vllm.config
import
VllmConfig
from
vllm.config.model
import
ModelConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
...
...
@@ -64,13 +59,17 @@ from vllm.multimodal.processing import (
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers.hf
import
get_cached_tokenizer
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.transformers_utils.config
import
patch_rope_parameters
from
vllm.transformers_utils.configs
import
(
from
vllm.transformers_utils.configs
.isaac
import
(
IsaacConfig
,
PixelShuffleSiglip2VisionConfig
,
)
from
vllm.transformers_utils.processors.isaac
import
(
IsaacImageProcessor
,
IsaacProcessor
,
get_image_size_for_max_num_patches
,
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.vision
import
is_vit_use_data_parallel
...
...
@@ -307,467 +306,6 @@ def pixel_shuffle_varlen(
# Configuration
# ============================================================================
MAX_PIXELS
=
60_000_000
# 60-megapixel ceiling ≈ 8200 × 7300 px
# Vision preprocessing constants
VISION_MEAN
=
(
0.5
,
0.5
,
0.5
)
VISION_STD
=
(
0.5
,
0.5
,
0.5
)
VISION_SCALE
=
1
/
255
def
_make_writeable
(
arr
:
np
.
ndarray
)
->
np
.
ndarray
:
"""Return *arr* itself if it is already writeable, otherwise try to flip the
write flag in-place and finally fall back to `arr.copy()`.
This guarantees the buffer handed to `torch.from_numpy()` is always
writeable, silencing the PyTorch warning about undefined behaviour.
"""
if
arr
.
flags
.
writeable
:
return
arr
# First, try the cheap path — in-place flag toggle (works for mmap'd arrays
# and some shared memory buffers):
try
:
arr
.
setflags
(
write
=
True
)
return
arr
# success: no data copy
except
ValueError
:
# Buffer is inherently read-only (e.g. backed by PyAV / PIL): make copy
return
arr
.
copy
()
def
extract_image_pil
(
image
:
PIL
.
Image
.
Image
)
->
torch
.
Tensor
|
None
:
if
image
.
width
*
image
.
height
>
MAX_PIXELS
:
raise
ValueError
(
f
"Image (w=
{
image
.
width
}
, h=
{
image
.
height
}
) > MAX=`
{
MAX_PIXELS
}
`"
)
img
=
image
if
image
.
mode
==
"RGB"
else
image
.
convert
(
"RGB"
)
arr
=
np
.
asarray
(
img
)
arr
=
_make_writeable
(
arr
)
return
torch
.
from_numpy
(
arr
)
def
get_image_size_for_max_num_patches
(
image_height
:
int
,
image_width
:
int
,
patch_size
:
int
,
max_num_patches
:
int
,
min_num_patches
:
int
|
None
=
None
,
eps
:
float
=
1e-5
,
pixel_shuffle_scale
:
int
=
1
,
)
->
tuple
[
int
,
int
]:
r
"""Compute a target resolution whose patch grid satisfies patching parametrization.
Args:
image_height (`int`):
Height in pixels of the source image prior to any resizing.
image_width (`int`):
Width in pixels of the source image prior to any resizing.
patch_size (`int`):
Size of the square patch used by the vision encoder.
max_num_patches (`int`):
Upper bound on `(height / patch_size) * (width / patch_size)` after
resizing.
min_num_patches (`int`, *optional*):
Lower bound on the number of patches. When provided the image will
be scaled up if necessary.
eps (`float`, *optional*, defaults to 1e-5):
Convergence tolerance for the internal binary search to determine
the target dimensions.
pixel_shuffle_scale (`int`, *optional*, defaults to 1):
Additional stride multiplier applied when pixel shuffle later
reduces spatial resolution.
Returns:
`tuple[int, int]`: Height and width (in pixels) that are multiples of
`patch_size * pixel_shuffle_scale` and respect both the maximum and
optional minimum patch-count constraints.
"""
def
get_scaled_image_size
(
scale
,
original_size
,
patch_size
,
pixel_shuffle_scale
):
scaled_size
=
scale
*
original_size
divisor
=
patch_size
*
pixel_shuffle_scale
scaled_size
=
math
.
ceil
(
scaled_size
/
divisor
)
*
divisor
scaled_size
=
max
(
divisor
,
scaled_size
)
return
int
(
scaled_size
)
# Ensure divisibility
divisor
=
patch_size
*
pixel_shuffle_scale
adjusted_height
=
math
.
ceil
(
image_height
/
divisor
)
*
divisor
adjusted_height
=
max
(
divisor
,
adjusted_height
)
adjusted_width
=
math
.
ceil
(
image_width
/
divisor
)
*
divisor
adjusted_width
=
max
(
divisor
,
adjusted_width
)
num_patches
=
(
adjusted_height
/
patch_size
)
*
(
adjusted_width
/
patch_size
)
if
min_num_patches
is
not
None
and
num_patches
<
min_num_patches
:
# Scale up
scale_min
,
scale_max
=
1.0
,
100.0
while
(
scale_max
-
scale_min
)
>=
eps
:
scale
=
(
scale_min
+
scale_max
)
/
2
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
num_patches
=
(
target_height
/
patch_size
)
*
(
target_width
/
patch_size
)
if
num_patches
>=
min_num_patches
:
scale_max
=
scale
else
:
scale_min
=
scale
scale
=
scale_max
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
return
target_height
,
target_width
elif
num_patches
<=
max_num_patches
:
return
adjusted_height
,
adjusted_width
else
:
# Scale down
scale_min
,
scale_max
=
eps
/
10
,
1.0
while
(
scale_max
-
scale_min
)
>=
eps
:
scale
=
(
scale_min
+
scale_max
)
/
2
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
num_patches
=
(
target_height
/
patch_size
)
*
(
target_width
/
patch_size
)
if
num_patches
<=
max_num_patches
:
scale_min
=
scale
else
:
scale_max
=
scale
scale
=
scale_min
target_height
=
get_scaled_image_size
(
scale
,
image_height
,
patch_size
,
pixel_shuffle_scale
)
target_width
=
get_scaled_image_size
(
scale
,
image_width
,
patch_size
,
pixel_shuffle_scale
)
return
target_height
,
target_width
_MEAN_TENSOR
=
torch
.
tensor
(
VISION_MEAN
,
dtype
=
torch
.
float32
).
view
(
1
,
1
,
1
,
-
1
)
_STD_TENSOR
=
torch
.
tensor
(
VISION_STD
,
dtype
=
torch
.
float32
).
view
(
1
,
1
,
1
,
-
1
)
def
_resolve_vision_token_id
(
model_config
:
ModelConfig
,
vision_token
:
str
)
->
int
:
tokenizer_name
=
model_config
.
tokenizer
or
model_config
.
model
tokenizer
=
get_cached_tokenizer
(
get_tokenizer
(
tokenizer_name
,
tokenizer_mode
=
model_config
.
tokenizer_mode
,
trust_remote_code
=
model_config
.
trust_remote_code
,
revision
=
model_config
.
tokenizer_revision
or
model_config
.
revision
,
)
)
return
tokenizer
.
encode
(
vision_token
,
add_special_tokens
=
False
)[
0
]
def
prepare_image_tensor
(
image
:
torch
.
Tensor
,
scale
:
float
=
VISION_SCALE
,
)
->
torch
.
Tensor
:
r
"""Standardize RGB images prior to patch extraction via rescaling and whitening.
Args:
image (`torch.Tensor`):
Tensor with shape `(..., height, width, 3)` containing RGB values.
The tensor is converted to floating point if needed.
scale (`float`, *optional*, defaults to `VISION_SCALE`):
Scalar multiplier applied before normalization.
Returns:
`torch.Tensor`: Normalized tensor with the same shape as the input and
dtype `torch.float32`.
"""
if
not
torch
.
is_floating_point
(
image
):
image
=
image
.
float
()
rescaled
=
image
*
scale
# Use precomputed tensors and move to the correct device if needed
mean_tensor
=
_MEAN_TENSOR
.
to
(
image
.
device
)
std_tensor
=
_STD_TENSOR
.
to
(
image
.
device
)
normalized
=
(
rescaled
-
mean_tensor
)
/
std_tensor
return
normalized
def
patchify_vision
(
image
:
torch
.
Tensor
,
patch_size
:
int
)
->
torch
.
Tensor
:
r
"""Convert normalized images into flattened ViT-style patches.
Args:
image (`torch.Tensor`):
Tensor of shape `(num_images, height, width, channels)`.
patch_size (`int`):
Edge length of the square patches
Returns:
`torch.Tensor`:
Patch tensor where each position stores the flattened pixels
belonging to that patch.
Raises:
ValueError: If `height` or `width` is not divisible by `patch_size`.
"""
num_images
,
height
,
width
,
channels
=
image
.
shape
if
height
%
patch_size
or
width
%
patch_size
:
raise
ValueError
(
"Dimensions of images "
f
"
{
image
.
shape
}
are not divisible by patch_size=
{
patch_size
}
."
)
patches
=
image
.
reshape
(
num_images
,
height
//
patch_size
,
patch_size
,
width
//
patch_size
,
patch_size
,
channels
,
)
patches
=
patches
.
permute
(
0
,
1
,
3
,
2
,
4
,
5
)
patches
=
patches
.
reshape
(
num_images
,
height
//
patch_size
,
width
//
patch_size
,
channels
*
patch_size
*
patch_size
,
)
return
patches
def
process_vision_for_patches
(
images
:
torch
.
Tensor
,
patch_size
:
int
,
max_num_patches
:
int
,
min_num_patches
:
int
|
None
=
None
,
pixel_shuffle_scale
:
int
=
1
,
)
->
tuple
[
torch
.
Tensor
,
list
[
int
]]:
r
"""Resize, normalize, and patchify RGB images for the vision encoder.
Args:
images (`torch.Tensor`):
Either `(height, width, channels)` for a single image or
`(num_images, height, width, channels)` for a batch. Channels are
expected to be RGB.
patch_size (`int`):
Edge length of square patches; implicitly controls resize grid granularity.
max_num_patches (`int`):
Maximum number of patches allowed after resizing.
min_num_patches (`int`, *optional*):
Minimum number of patches. If provided, the routine upsamples images
as needed to satisfy the lower bound.
pixel_shuffle_scale (`int`, *optional*, defaults to 1):
Pixel shuffle scale factor; influences the target grid that the
function produces.
Returns:
`tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
where `patches` has shape `(num_images, target_h / patch_size, target_w
/ patch_size, channels * patch_size**2)` and `dims_virtual` encodes
effective `(images, height, width)` dimensions after optional pixel
shuffling.
"""
# Add batch dim if single image
if
images
.
dim
()
==
3
:
images
=
images
.
unsqueeze
(
0
)
# Permute to channel first for resize
images
=
images
.
permute
(
0
,
3
,
1
,
2
)
# Get target dimensions
_
,
_
,
orig_height
,
orig_width
=
images
.
shape
target_height
,
target_width
=
get_image_size_for_max_num_patches
(
orig_height
,
orig_width
,
patch_size
,
max_num_patches
,
min_num_patches
=
min_num_patches
,
pixel_shuffle_scale
=
pixel_shuffle_scale
,
)
# Resize
images
=
F
.
interpolate
(
images
,
size
=
(
target_height
,
target_width
),
mode
=
"bilinear"
,
align_corners
=
False
,
)
# Back to channel last
images
=
images
.
permute
(
0
,
2
,
3
,
1
)
# Normalize
images
=
prepare_image_tensor
(
images
)
# Patchify
patches
=
patchify_vision
(
images
,
patch_size
=
patch_size
)
# Calculate dimensions for the patches
n_images
,
h_patches
,
w_patches
,
_
=
patches
.
shape
dims_virtual
=
(
[
1
,
h_patches
,
w_patches
]
if
pixel_shuffle_scale
==
1
else
[
1
,
h_patches
//
pixel_shuffle_scale
,
w_patches
//
pixel_shuffle_scale
]
)
return
patches
,
dims_virtual
class
IsaacImageProcessorKwargs
(
TypedDict
,
total
=
False
):
patch_size
:
int
max_num_patches
:
int
min_num_patches
:
int
pixel_shuffle_scale
:
int
class
IsaacImageProcessor
:
patch_size
=
16
max_num_patches
=
6144
min_num_patches
=
256
pixel_shuffle_scale
=
2
valid_kwargs
=
IsaacImageProcessorKwargs
model_input_names
=
[
"pixel_values"
,
"image_grid_thw"
]
def
__init__
(
self
,
kwargs
):
self
.
patch_size
=
kwargs
.
pop
(
"patch_size"
,
self
.
patch_size
)
self
.
vision_max_num_patches
=
kwargs
.
pop
(
"vision_max_num_patches"
,
self
.
max_num_patches
)
self
.
vision_min_num_patches
=
kwargs
.
pop
(
"vision_min_num_patches"
,
self
.
min_num_patches
)
self
.
pixel_shuffle_scale
=
kwargs
.
pop
(
"pixel_shuffle_scale"
,
2
)
def
preprocess
(
self
,
images
:
list
[
torch
.
Tensor
],
return_tensors
:
str
|
TensorType
|
None
,
**
kwargs
:
Unpack
[
IsaacImageProcessorKwargs
],
)
->
BatchFeature
:
"""Preprocess images into format compatible with vLLM input processing."""
all_pixel_values
:
list
[
torch
.
Tensor
]
=
[]
all_image_grids
:
list
[
torch
.
Tensor
]
=
[]
for
image
in
images
:
image_tensor
=
extract_image_pil
(
image
)
patches
,
dims_virtual
=
process_vision_for_patches
(
image_tensor
,
patch_size
=
self
.
patch_size
,
max_num_patches
=
self
.
vision_max_num_patches
,
min_num_patches
=
self
.
vision_min_num_patches
,
pixel_shuffle_scale
=
self
.
pixel_shuffle_scale
,
)
# Isaac packs a dummy temporal dim for images
patches
=
patches
.
unsqueeze
(
1
)
# [N, T=1, Hp, Wp, D]
hp
,
wp
,
dim
=
patches
.
shape
[
-
3
],
patches
.
shape
[
-
2
],
patches
.
shape
[
-
1
]
current_num_patches
=
hp
*
wp
pixel_values
=
patches
.
reshape
(
current_num_patches
,
dim
)
# [N_tokens, D]
# Use real patch dimensions for image_grid_thw, not virtual dimensions
# This ensures the vision model receives correct grid info for pixel shuffle
dims_real
=
[
1
,
hp
,
wp
]
# Real patch dimensions
image_grid_thw
=
torch
.
tensor
(
dims_real
).
unsqueeze
(
0
)
all_pixel_values
.
append
(
pixel_values
)
all_image_grids
.
append
(
image_grid_thw
)
if
all_pixel_values
:
final_pixel_values
=
torch
.
cat
(
all_pixel_values
,
dim
=
0
)
final_image_grids
=
torch
.
cat
(
all_image_grids
,
dim
=
0
)
else
:
final_pixel_values
=
torch
.
empty
(
0
,
0
)
final_image_grids
=
torch
.
empty
(
0
,
3
)
return
BatchFeature
(
data
=
{
"pixel_values"
:
final_pixel_values
,
"image_grid_thw"
:
final_image_grids
,
},
tensor_type
=
return_tensors
,
)
class
IsaacProcessor
:
"""Processor wrapper (tokenizer + IsaacImageProcessor)."""
def
__init__
(
self
,
image_processor
=
None
,
tokenizer
=
None
,
**
kwargs
):
self
.
image_token
=
kwargs
.
pop
(
"image_token"
,
"<image>"
)
self
.
image_processor
=
image_processor
or
IsaacImageProcessor
(
kwargs
)
self
.
tokenizer
=
tokenizer
def
__call__
(
self
,
text
=
None
,
images
=
None
,
**
kwargs
)
->
BatchFeature
:
result
=
{}
if
images
is
not
None
:
image_inputs
=
self
.
image_processor
.
preprocess
(
images
,
**
kwargs
)
image_grid_thw
=
image_inputs
[
"image_grid_thw"
]
result
.
update
(
image_inputs
)
if
text
is
not
None
:
if
not
isinstance
(
text
,
list
):
text
=
[
text
]
text
=
text
.
copy
()
# below lines change text in-place
merge_length
=
self
.
image_processor
.
pixel_shuffle_scale
**
2
index
=
0
for
i
in
range
(
len
(
text
)):
while
self
.
image_token
in
text
[
i
]:
num_image_tokens
=
image_grid_thw
[
index
].
prod
()
//
merge_length
text
[
i
]
=
text
[
i
].
replace
(
self
.
image_token
,
"<|placeholder|>"
*
num_image_tokens
,
1
)
index
+=
1
text
[
i
]
=
text
[
i
].
replace
(
"<|placeholder|>"
,
"<|image_pad|>"
)
if
text
is
not
None
:
result
.
update
(
self
.
tokenizer
(
text
,
**
kwargs
))
return
BatchFeature
(
result
)
def
apply_chat_template
(
self
,
messages
:
list
[
dict
[
str
,
Any
]],
tokenize
:
bool
=
False
,
add_generation_prompt
:
bool
=
False
,
**
kwargs
,
)
->
Any
:
# Convert mixed content messages to simple text format
processed_messages
=
[]
for
message
in
messages
:
if
"content"
in
message
and
isinstance
(
message
[
"content"
],
list
):
# Handle mixed content (text + image)
text_parts
=
[]
for
content_item
in
message
[
"content"
]:
if
content_item
.
get
(
"type"
)
==
"text"
:
text_parts
.
append
(
content_item
.
get
(
"text"
,
""
))
elif
content_item
.
get
(
"type"
)
==
"image"
:
# Replace image with vision token
text_parts
.
append
(
self
.
image_token
)
processed_message
=
{
"role"
:
message
.
get
(
"role"
,
"user"
),
"content"
:
""
.
join
(
text_parts
),
}
processed_messages
.
append
(
processed_message
)
else
:
# Regular text message
processed_messages
.
append
(
message
)
kwargs
[
"return_dict"
]
=
False
return
self
.
tokenizer
.
apply_chat_template
(
processed_messages
,
tokenize
=
tokenize
,
add_generation_prompt
=
add_generation_prompt
,
**
kwargs
,
)
class
IsaacProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_config
(
self
)
->
IsaacConfig
:
...
...
@@ -795,16 +333,18 @@ class IsaacProcessingInfo(BaseProcessingInfo):
)
return
IsaacConfig
()
def
get_image_processor
(
self
,
**
kwargs
)
->
IsaacImageProcessor
:
return
IsaacImageProcessor
(
kwargs
)
def
get_hf_processor
(
self
,
**
kwargs
)
->
IsaacProcessor
:
hf_config
=
self
.
get_hf_config
()
processor_kwargs
=
{
"image_token"
:
hf_config
.
vision_token
,
}
processor_kwargs
.
update
(
kwargs
)
return
self
.
ctx
.
get_hf_processor
(
IsaacProcessor
,
**
processor_kwargs
)
def
get_tokenizer
(
self
):
return
self
.
ctx
.
tokenizer
return
self
.
ctx
.
init_processor
(
IsaacProcessor
,
tokenizer
=
self
.
get_tokenizer
(),
image_processor
=
self
.
get_image_processor
(),
image_token
=
hf_config
.
vision_token
,
)
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
hf_config
=
self
.
get_hf_config
()
...
...
@@ -819,9 +359,6 @@ class IsaacProcessingInfo(BaseProcessingInfo):
)
return
ImageSize
(
width
=
target_width
,
height
=
target_height
)
def
get_image_processor
(
self
,
**
kwargs
)
->
IsaacImageProcessor
:
return
self
.
get_hf_processor
(
**
kwargs
).
image_processor
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
}
...
...
@@ -1206,6 +743,12 @@ class Siglip2VisionTransformer(nn.Module):
return
loaded_params
def
_resolve_vision_token_id
(
model_config
:
ModelConfig
,
vision_token
:
str
)
->
int
:
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
assert
tokenizer
is
not
None
return
tokenizer
.
encode
(
vision_token
,
add_special_tokens
=
False
)[
0
]
class
IsaacVisionEmbedding
(
nn
.
Module
):
def
__init__
(
self
,
...
...
vllm/model_executor/models/jais.py
View file @
657855ab
...
...
@@ -49,7 +49,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
JAISConfig
from
vllm.transformers_utils.configs
.jais
import
JAISConfig
from
.interfaces
import
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/kimi_k25.py
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Kimi-K2.5 Model Implementation for vLLM.
Kimi-K2.5 extends Kimi-K2 with vision support
This module defines:
- KimiK25ProcessingInfo/KimiK25MultiModalProcessor: Processing logic
- KimiK25ForConditionalGeneration: Main model class
Kimi-K2.5 extends Kimi-K2 with vision support.
"""
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
...
...
@@ -18,14 +13,13 @@ from typing import Annotated, Any, Literal
import
torch
from
torch
import
nn
from
transformers
import
BatchFeature
from
transformers.processing_utils
import
ProcessorMixin
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.compressed_tensors
.compressed_tensors
import
(
C
ompressed
T
ensors
Config
,
from
vllm.model_executor.layers.quantization.compressed_tensors
import
(
c
ompressed
_t
ensors
,
)
from
vllm.model_executor.models.interfaces
import
(
SupportsEagle
,
...
...
@@ -45,7 +39,6 @@ from vllm.multimodal.inputs import (
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
,
VisionChunk
,
VisionChunkImage
,
VisionChunkVideo
,
)
...
...
@@ -60,8 +53,9 @@ from vllm.multimodal.processing import (
)
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
KimiK25Config
from
vllm.transformers_utils.configs
.kimi_k25
import
KimiK25Config
from
vllm.transformers_utils.processor
import
cached_get_image_processor
from
vllm.transformers_utils.processors.kimi_k25
import
KimiK25Processor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.utils
import
(
...
...
@@ -101,69 +95,6 @@ class KimiK25MediaPixelInputs(TensorSchema):
grid_thws
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"nm"
,
3
)]
class
MoonshotKimiVAutoProcessor
(
ProcessorMixin
):
attributes
=
[
"tokenizer"
]
tokenizer_class
=
"AutoTokenizer"
def
__init__
(
self
,
media_processor
=
None
,
tokenizer
=
None
,
media_token_id
:
int
|
None
=
None
):
super
().
__init__
(
tokenizer
)
self
.
media_processor
=
media_processor
self
.
media_token_id
=
media_token_id
assert
self
.
media_token_id
is
not
None
# We do not support str input for text here
def
__call__
(
self
,
vision_chunks
:
list
[
VisionChunk
]
|
None
=
None
,
*
,
text
:
list
[
int
]
|
str
,
**
kwargs
,
)
->
BatchFeature
:
"""
Args:
vision_chunks: List of VisionChunk items to be processed.
For image: VisionChunkImage with type='image', image=PIL.Image
For video_chunk: VisionChunkVideo with type='video_chunk', video_chunk=list[PIL.Image]
text: The token ids to be fed to a model (required).
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- list of token ids to be fed to a model.
- **pixel_values** -- Pixel values to be fed to a model. Returned when `vision_chunks` is not `None`.
- **grid_thws** -- list of image 3D grid in LLM. Returned when `vision_chunks` is not `None`.
"""
mm_inputs
=
{}
input_ids
=
self
.
tokenizer
.
encode
(
text
)
if
isinstance
(
text
,
str
)
else
text
if
vision_chunks
is
not
None
:
assert
isinstance
(
vision_chunks
,
list
)
mm_inputs
=
self
.
media_processor
.
preprocess
(
vision_chunks
)
num_tokens_per_chunk
=
[
self
.
media_processor
.
media_tokens_calculator
(
chunk
)
for
chunk
in
vision_chunks
]
new_input_ids
=
[]
for
token
in
input_ids
:
if
token
==
self
.
media_token_id
:
new_input_ids
.
extend
(
[
self
.
media_token_id
]
*
num_tokens_per_chunk
.
pop
(
0
)
)
else
:
new_input_ids
.
append
(
token
)
input_ids
=
new_input_ids
# XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
return
BatchFeature
(
data
=
{
"input_ids"
:
torch
.
tensor
([
input_ids
]),
**
mm_inputs
,
}
)
class
KimiK25ProcessingInfo
(
BaseProcessingInfo
):
"""Processing information for Kimi-K2.5 model.
...
...
@@ -180,7 +111,7 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
trust_remote_code
=
self
.
ctx
.
model_config
.
trust_remote_code
,
)
self
.
media_processor
=
media_processor
self
.
hf_processor
=
MoonshotKimiVAuto
Processor
(
self
.
hf_processor
=
KimiK25
Processor
(
media_processor
=
self
.
media_processor
,
tokenizer
=
self
.
get_tokenizer
(),
media_token_id
=
self
.
media_token_id
,
...
...
@@ -263,12 +194,14 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
"""Indicates how to slice media input into multiple items.
pixel_values: [N, 3, patch_size, patch_size], all patches collected from B medias
grid_thws: [B,3], each item: [N_t, N_h ,N_w], indicates the grid size in time/height/width direction
for current item.
pixel_values: [N, 3, patch_size, patch_size],
all patches collected from B medias
grid_thws: [B,3], each item: [N_t, N_h ,N_w],
indicates the grid size in time/height/width direction for current item.
by multiplying [N_t, N_h ,N_w], we get the number of patches for each media item, thus we can slice
pixel_values by pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.
by multiplying [N_t, N_h ,N_w], we get the number of patches
for each media item, thus we can slice pixel_values by
pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.
"""
grid_thws
=
hf_inputs
.
get
(
"grid_thws"
,
torch
.
empty
((
0
,
3
)))
...
...
@@ -403,7 +336,7 @@ class KimiK25ForConditionalGeneration(
self
.
media_placeholder
:
int
=
self
.
config
.
media_placeholder_token_id
def
_maybe_ignore_quant_config
(
self
,
quant_config
:
QuantizationConfig
):
if
isinstance
(
quant_config
,
CompressedTensorsConfig
):
if
isinstance
(
quant_config
,
compressed_tensors
.
CompressedTensorsConfig
):
return
None
return
quant_config
...
...
vllm/model_executor/models/kimi_vl.py
View file @
657855ab
...
...
@@ -77,7 +77,7 @@ from vllm.multimodal.processing import (
PromptUpdate
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
KimiVLConfig
,
MoonViTConfig
from
vllm.transformers_utils.configs
.kimi_vl
import
KimiVLConfig
,
MoonViTConfig
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.utils
import
AutoWeightsLoader
,
init_vllm_registered_model
,
maybe_prefix
...
...
vllm/model_executor/models/lfm2_moe.py
View file @
657855ab
...
...
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
Lfm2MoeConfig
from
vllm.transformers_utils.configs
.lfm2_moe
import
Lfm2MoeConfig
from
.interfaces
import
(
HasInnerState
,
...
...
vllm/model_executor/models/lightonocr.py
View file @
657855ab
...
...
@@ -16,8 +16,7 @@ from vllm.model_executor.models.mistral3 import (
Mistral3ForConditionalGeneration
,
Mistral3MultiModalProjector
,
Mistral3ProcessingInfo
,
_build_mistral3_info
,
init_vision_tower_for_llava
,
init_vision_tower_for_mistral3
,
)
from
vllm.model_executor.models.pixtral
import
PixtralHFEncoderInfo
from
vllm.model_executor.models.utils
import
(
...
...
@@ -27,11 +26,9 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargsItems
from
vllm.multimodal.parse
import
ImageProcessorItems
,
MultiModalDataItems
from
vllm.multimodal.processing
import
(
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
PromptReplacement
,
PromptUpdate
,
...
...
@@ -128,19 +125,9 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
]
def _build_LightOnOCR_processor(
    info: _I,
    dummy_inputs: BaseDummyInputsBuilder[_I],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
):
    """Build the LightOnOCR multimodal processor for a Mistral3 processing info.

    Args:
        info: Processing info; must be a ``Mistral3ProcessingInfo`` instance.
        dummy_inputs: Dummy-inputs builder forwarded to the processor.
        cache: Optional processor cache forwarded to the processor.

    Returns:
        A ``LightOnOCRMultiModalProcessor`` wrapping the given arguments.
    """
    # Internal invariant: this factory is only meant for Mistral3-style infos.
    assert isinstance(info, Mistral3ProcessingInfo)
    processor = LightOnOCRMultiModalProcessor(info, dummy_inputs, cache=cache)
    return processor
@
MULTIMODAL_REGISTRY
.
register_processor
(
_build_
LightOnOCR
_p
rocessor
,
info
=
_build_mistral3_i
nfo
,
LightOnOCR
MultiModalP
rocessor
,
info
=
Mistral3ProcessingI
nfo
,
dummy_inputs
=
Mistral3DummyInputsBuilder
,
)
class
LightOnOCRForConditionalGeneration
(
Mistral3ForConditionalGeneration
):
...
...
@@ -164,7 +151,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
self
.
multimodal_config
=
multimodal_config
with
self
.
_mark_tower_model
(
vllm_config
,
"image"
):
self
.
vision_tower
=
init_vision_tower_for_
llava
(
self
.
vision_tower
=
init_vision_tower_for_
mistral3
(
config
,
quant_config
=
quant_config
,
require_post_norm
=
False
,
...
...
vllm/model_executor/models/mistral3.py
View file @
657855ab
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Final
,
Literal
,
Protocol
,
TypeVar
from
typing
import
Annotated
,
Literal
import
torch
import
torch.nn
as
nn
from
transformers
import
(
BatchFeature
,
Mistral3Config
,
PixtralVisionConfig
,
PretrainedConfig
,
)
from
transformers
import
BatchFeature
,
Mistral3Config
,
PixtralVisionConfig
from
transformers.models.pixtral
import
PixtralProcessor
from
vllm.config
import
VllmConfig
...
...
@@ -23,7 +17,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -34,7 +27,6 @@ from vllm.multimodal.processing import (
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
BaseProcessingInfo
,
InputProcessingContext
,
PromptReplacement
,
PromptUpdate
,
PromptUpdateDetails
,
...
...
@@ -178,27 +170,15 @@ class Mistral3MultiModalProjector(nn.Module):
return
hidden_states
class LlavaLikeConfig(Protocol):
    """Structural (duck-typed) description of a LLaVA-style HF config.

    Any object exposing the four read-only attributes below satisfies this
    protocol; no inheritance is required.
    """

    # Config of the vision encoder.
    vision_config: Final[PretrainedConfig]
    # NOTE(review): presumably the token id used as the image placeholder —
    # confirm against the HF config class.
    image_token_index: Final[int]
    vision_feature_select_strategy: Final[str]
    # Either a single layer index or a list of layer indices.
    vision_feature_layer: Final[int | list[int]]
class LlavaLikeProcessor(Protocol):
    """Structural type for an HF processor exposing an ``image_token`` string."""

    # NOTE(review): presumably the textual image placeholder — confirm
    # against the concrete processor class.
    image_token: Final[str]
class
BaseLlavaProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_config
(
self
)
->
LlavaLikeConfig
:
class
Mistral3ProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_config
(
self
)
->
Mistral3Config
:
return
self
.
ctx
.
get_hf_config
(
Mistral3Config
)
def
get_vision_encoder_info
(
self
):
return
get_vision_encoder_info
(
self
.
get_hf_config
())
@
abstractmethod
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
LlavaLikeProcessor
:
raise
NotImplementedError
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
PixtralProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
}
...
...
@@ -221,10 +201,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return
ImageSize
(
width
=
width
,
height
=
height
)
_I
=
TypeVar
(
"_I"
,
bound
=
BaseLlavaProcessingInfo
)
class
Mistral3DummyInputsBuilder
(
BaseDummyInputsBuilder
[
_I
]):
class
Mistral3DummyInputsBuilder
(
BaseDummyInputsBuilder
[
Mistral3ProcessingInfo
]):
def
get_dummy_text
(
self
,
mm_counts
:
Mapping
[
str
,
int
])
->
str
:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
...
...
@@ -255,11 +232,6 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
}
class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
    """Processing info for Mistral3 that resolves its HF processor."""

    def get_hf_processor(self, **kwargs: object):
        """Return the ``PixtralProcessor`` obtained from the processing context.

        All keyword arguments are forwarded unchanged to
        ``self.ctx.get_hf_processor``.
        """
        hf_processor = self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
        return hf_processor
class
Mistral3MultiModalProcessor
(
BaseMultiModalProcessor
[
Mistral3ProcessingInfo
]):
def
_call_hf_processor
(
self
,
...
...
@@ -339,29 +311,7 @@ class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo
]
def _build_mistral3_info(
    ctx: InputProcessingContext,
) -> BaseLlavaProcessingInfo:
    """Create the processing info for a Mistral3 model.

    Loads the ``Mistral3Config`` from ``ctx`` and checks that its vision
    config is a ``PixtralVisionConfig`` before building the info object.
    """
    mistral3_config = ctx.get_hf_config(Mistral3Config)
    vision_config = mistral3_config.vision_config
    assert isinstance(vision_config, PixtralVisionConfig)

    return Mistral3ProcessingInfo(ctx)
def _build_mistral3_processor(
    info: _I,
    dummy_inputs: BaseDummyInputsBuilder[_I],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Build the Mistral3 multimodal processor.

    Args:
        info: Processing info; must be a ``Mistral3ProcessingInfo`` instance.
        dummy_inputs: Dummy-inputs builder forwarded to the processor.
        cache: Optional processor cache forwarded to the processor.

    Returns:
        A ``Mistral3MultiModalProcessor`` wrapping the given arguments.
    """
    # Internal invariant: only Mistral3 processing infos are supported here.
    assert isinstance(info, Mistral3ProcessingInfo)
    processor = Mistral3MultiModalProcessor(
        info,
        dummy_inputs,  # type: ignore
        cache=cache,
    )
    return processor
def
_get_num_hidden_layers
(
hf_config
:
LlavaLikeConfig
)
->
int
:
def
_get_num_hidden_layers
(
hf_config
:
Mistral3Config
)
->
int
:
"""Determine the number of hidden layers to initialize up to in the
visual encoder.
...
...
@@ -381,8 +331,8 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
)
def
init_vision_tower_for_
llava
(
hf_config
:
LlavaLike
Config
,
def
init_vision_tower_for_
mistral3
(
hf_config
:
Mistral3
Config
,
quant_config
:
QuantizationConfig
|
None
,
*
,
require_post_norm
:
bool
|
None
=
None
,
...
...
@@ -405,8 +355,8 @@ def init_vision_tower_for_llava(
@
MULTIMODAL_REGISTRY
.
register_processor
(
_build_mistral3_p
rocessor
,
info
=
_build_mistral3_i
nfo
,
Mistral3MultiModalP
rocessor
,
info
=
Mistral3ProcessingI
nfo
,
dummy_inputs
=
Mistral3DummyInputsBuilder
,
)
class
Mistral3ForConditionalGeneration
(
...
...
@@ -466,7 +416,7 @@ class Mistral3ForConditionalGeneration(
config
.
projector_hidden_act
=
"gelu"
with
self
.
_mark_tower_model
(
vllm_config
,
"image"
):
self
.
vision_tower
=
init_vision_tower_for_
llava
(
self
.
vision_tower
=
init_vision_tower_for_
mistral3
(
config
,
quant_config
=
quant_config
,
require_post_norm
=
False
,
...
...
vllm/model_executor/models/nemotron.py
View file @
657855ab
...
...
@@ -52,7 +52,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronConfig
from
vllm.transformers_utils.configs
.nemotron
import
NemotronConfig
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
...
...
vllm/model_executor/models/nemotron_h.py
View file @
657855ab
...
...
@@ -81,7 +81,7 @@ from vllm.model_executor.models.utils import (
sequence_parallel_chunk
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronHConfig
from
vllm.transformers_utils.configs
.nemotron_h
import
NemotronHConfig
class
NemotronHMLP
(
nn
.
Module
):
...
...
vllm/model_executor/models/nemotron_h_mtp.py
View file @
657855ab
...
...
@@ -26,7 +26,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronHConfig
from
vllm.transformers_utils.configs
.nemotron_h
import
NemotronHConfig
from
.interfaces
import
SupportsPP
from
.nemotron_h
import
(
...
...
vllm/model_executor/models/olmo2.py
View file @
657855ab
...
...
@@ -63,7 +63,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
Olmo3Config
from
vllm.transformers_utils.configs
.olmo3
import
Olmo3Config
class
Olmo2Attention
(
nn
.
Module
):
...
...
vllm/model_executor/models/qwen3_next.py
View file @
657855ab
...
...
@@ -80,7 +80,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
Qwen3NextConfig
from
vllm.transformers_utils.configs
.qwen3_next
import
Qwen3NextConfig
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.multi_stream_utils
import
maybe_execute_in_parallel
from
vllm.utils.torch_utils
import
(
...
...
vllm/model_executor/models/qwen3_next_mtp.py
View file @
657855ab
...
...
@@ -25,7 +25,7 @@ from vllm.model_executor.models.qwen3_next import (
QwenNextMixtureOfExperts
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
Qwen3NextConfig
from
vllm.transformers_utils.configs
.qwen3_next
import
Qwen3NextConfig
from
.utils
import
(
AutoWeightsLoader
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
657855ab
...
...
@@ -2,18 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
itertools
import
product
from
math
import
ceil
,
sqrt
from
math
import
sqrt
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
PIL
import
Image
from
torchvision
import
transforms
from
torchvision.transforms.functional
import
InterpolationMode
from
transformers
import
BatchFeature
,
PretrainedConfig
,
TensorType
from
transformers
import
BatchFeature
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
...
...
@@ -43,8 +38,8 @@ from vllm.multimodal.processing import (
PromptUpdateDetails
,
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.t
okenizers
import
TokenizerLike
from
vllm.transformers_utils.
configs
import
Step3VisionEncoderConfig
from
vllm.t
ransformers_utils.configs.step3_vl
import
Step3VisionEncoderConfig
from
vllm.transformers_utils.
processors.step3_vl
import
Step3VLProcessor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
...
...
@@ -89,430 +84,6 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
Step3VLImageInputs
:
TypeAlias
=
Step3VLImagePixelInputs
|
Step3VLImageEmbeddingInputs
# (image, patches cropped from it, per-patch boolean mask or None).
# The mask is None when no patches were produced.
ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]

# Longest allowed image edge in pixels; larger images are scaled down
# proportionally before patching.
MAX_IMAGE_SIZE: int = 3024
class Step3VisionProcessor:
    """Convert PIL images into normalized, fixed-size pixel tensors.

    Two pipelines are kept: ``transform`` for the full image (resized to
    ``size`` x ``size``) and ``patch_transform`` for patches (resized to
    ``patch_size`` x ``patch_size``).

    Args:
        size: Target edge length for the full-image pipeline.
        interpolation_mode: ``"bicubic"`` selects bicubic resizing; any other
            value selects bilinear resizing.
        patch_size: Target edge length for the patch pipeline. Defaults to
            ``size`` when ``None``.
    """

    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
        # Per-channel normalization statistics applied to every image.
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]
        # Patches fall back to the full-image edge when no size is given, so
        # a patch pipeline always exists.
        patch_size = patch_size if patch_size is not None else size

        interpolation = (
            InterpolationMode.BICUBIC
            if interpolation_mode == "bicubic"
            else InterpolationMode.BILINEAR
        )

        def build_pipeline(edge):
            # Order matters: tensors are normalized first and resized last.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std),
                    transforms.Resize(
                        (edge, edge),
                        interpolation=interpolation,
                        antialias=True,
                    ),
                ]
            )

        self.transform = build_pipeline(size)
        self.patch_transform = build_pipeline(patch_size)

    def __call__(self, image, is_patch=False):
        """Return ``{"pixel_values": tensor}`` with a leading batch dim of 1.

        Args:
            image: Image accepted by ``transforms.ToTensor``.
            is_patch: Use the patch pipeline instead of the full-image one.
        """
        pipeline = self.patch_transform if is_patch else self.transform
        return {"pixel_values": pipeline(image).unsqueeze(0)}
class ImagePatcher:
    """Split an image into a global view plus square sliding-window patches.

    Calling an instance returns the (possibly padded and resized) image, the
    list of cropped patches, and a per-patch boolean mask marking the patches
    that end a row (except the very last patch).

    Args:
        enable_patch: When ``False`` no patches are ever produced.
    """

    def __init__(self, enable_patch: bool = True) -> None:
        self.enable_patch = enable_patch

    def determine_window_size(self, long: int, short: int) -> int:
        """Pick the patch window edge from the long and short image edges.

        Returns ``0`` when the image should not be patched at all.
        """
        aspect = long / short
        if long < 728:
            return short if aspect > 1.5 else 0
        return min(short, 504) if aspect > 4 else 504

    @staticmethod
    def _axis_starts(extent: int, window: int, step: int) -> tuple[list[int], int]:
        """Return the window start offsets along one axis and their count."""
        count = 1 if extent <= window else ceil((extent - window) / step + 1)
        starts = [step * idx for idx in range(count)]
        # Pull the last window back so that it ends exactly at the border.
        if len(starts) > 1 and starts[-1] + window > extent:
            starts[-1] = extent - window
        return starts, count

    def slide_window(
        self,
        width: int,
        height: int,
        sizes: list[tuple[int, int]],
        steps: list[tuple[int, int]],
        img_rate_thr: float = 0.6,
    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
        """Enumerate sliding windows over a ``width`` x ``height`` area.

        Args:
            width: Area width.
            height: Area height.
            sizes: ``(window_w, window_h)`` pairs, one per window scale.
            steps: ``(step_w, step_h)`` pairs, zipped with ``sizes``.
            img_rate_thr: Must lie in ``[0, 1]``; otherwise unused.

        Returns:
            ``(boxes, (x_num, y_num))`` where each box is
            ``(x, y, box_w, box_h)`` in row-major order, and the counts are
            those of the last ``(size, step)`` pair processed.

        Raises:
            ValueError: If ``sizes`` or ``steps`` is empty.
        """
        assert 1 >= img_rate_thr >= 0, "The `img_rate_thr` should lie in 0~1"
        if not sizes or not steps:
            raise ValueError("`sizes` and `steps` must each contain at least one entry")

        boxes: list[tuple[int, int, int, int]] = []
        for (size_w, size_h), (step_w, step_h) in zip(sizes, steps):
            x_start, x_num = self._axis_starts(width, size_w, step_w)
            y_start, y_num = self._axis_starts(height, size_h, step_h)
            # Row-major: all x offsets of the first row, then the next row.
            boxes.extend(
                (int(x), int(y), int(size_w), int(size_h))
                for y in y_start
                for x in x_start
            )
        return boxes, (x_num, y_num)

    def square_pad(self, img: Image.Image) -> Image.Image:
        """Pad ``img`` with zeros to a square, keeping it at the top-left."""
        w, h = img.size
        if w == h:
            return img
        edge = max(w, h)
        padded = Image.new(img.mode, (edge, edge), 0)
        padded.paste(img, (0, 0))
        return padded

    def get_image_size_for_padding(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Return the size after optional square padding.

        Only images whose short edge is below 32 and whose aspect ratio
        exceeds 4:1 (either way) are padded to a square.
        """
        ratio = img_width / img_height
        is_thin = min(img_height, img_width) < 32
        if is_thin and (ratio > 4 or ratio < 1 / 4):
            edge = max(img_height, img_width)
            return edge, edge
        return img_width, img_height

    def get_image_size_for_preprocess(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Scale the size down so the long edge does not exceed the maximum."""
        longest = max(img_height, img_width)
        if longest > MAX_IMAGE_SIZE:
            scale_factor = MAX_IMAGE_SIZE / longest
            img_width = int(img_width * scale_factor)
            img_height = int(img_height * scale_factor)
        return img_width, img_height

    @staticmethod
    def _snap_to_window(extent: int, window_size: int) -> int:
        """Snap ``extent`` to a multiple of ``window_size``.

        Extents shorter than one window are kept. Otherwise the extent is
        rounded up when the fractional part exceeds 0.2 and down otherwise.
        """
        ratio = extent / window_size
        if ratio < 1:
            return extent
        fraction = ratio - extent // window_size
        multiples = int(ratio) + 1 if fraction > 0.2 else int(ratio)
        return window_size * multiples

    def get_image_size_for_crop(
        self, img_width: int, img_height: int, window_size: int
    ):
        """Return the ``(width, height)`` the image is resized to for cropping."""
        width_new = self._snap_to_window(img_width, window_size)
        height_new = self._snap_to_window(img_height, window_size)
        return int(width_new), int(height_new)

    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
        """Crop a ``tw`` x ``th`` box whose top-left is at row ``i``, column ``j``."""
        return img.crop((j, i, j + tw, i + th))

    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
        """Return ``(num_patches, num_newlines)`` for an image of this size.

        Mirrors the sizing steps of ``__call__`` without touching pixel data.
        """
        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
        img_width, img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return 0, 0

        img_width, img_height = self.get_image_size_for_crop(
            img_width, img_height, window_size
        )
        center_list, (x_num, y_num) = self.slide_window(
            img_width,
            img_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        full_rows = (len(center_list) - 1) // x_num + 1
        # A complete final row does not get a trailing newline.
        if len(center_list) > 0 and len(center_list) % x_num == 0:
            full_rows -= 1
        return len(center_list), full_rows

    def __call__(
        self, img: Image.Image
    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
        """Pad/resize ``img`` and cut it into window-sized patches.

        Returns:
            ``(image, patches, newline_mask)``. ``patches`` is empty and the
            mask is ``None`` when patching is disabled or not applicable.
        """
        img_width, img_height = img.size
        new_img_width, new_img_height = self.get_image_size_for_padding(
            img_width, img_height
        )
        if new_img_width != img_width or new_img_height != img_height:
            img = self.square_pad(img)
            img_width, img_height = img.size

        new_img_width, new_img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
        window_size = self.determine_window_size(
            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return img, [], None

        new_img_width, new_img_height = self.get_image_size_for_crop(
            new_img_width, new_img_height, window_size
        )
        if (new_img_width, new_img_height) != (img_width, img_height):
            img_for_crop = img.resize(
                (new_img_width, new_img_height), Image.Resampling.BILINEAR
            )
        else:
            img_for_crop = img

        center_list, (x_num, y_num) = self.slide_window(
            new_img_width,
            new_img_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        patches = [
            self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
            for x, y, patch_w, patch_h in center_list
        ]
        if not patches:
            return img, patches, None

        # A patch closes a row when its 1-based index is a multiple of the
        # row length; the very last patch never gets a newline.
        last = len(patches) - 1
        newline_mask = [
            (idx + 1) % x_num == 0 and idx != last for idx in range(len(patches))
        ]
        return img, patches, newline_mask
class Step3VLProcessor:
    """Turn text and images into tokenized prompts and pixel tensors.

    Each ``<im_patch>`` placeholder in the text is expanded into the patch
    blocks of one image followed by that image's global-view block.

    Args:
        config: Model config; ``config.vision_config.enable_patch`` (default
            ``True``) controls whether images are split into patches.
        tokenizer: Tokenizer used for the final text and for token-id lookup.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        self.image_size = 728
        self.patch_size = 504
        self.image_preprocessor = Step3VisionProcessor(
            self.image_size, "bilinear", self.patch_size
        )

        self.num_image_feature_size = 169
        self.num_patch_feature_size = 81
        self.image_token = "<im_patch>"
        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size

        # Respect vision config switch to enable/disable patch extraction.
        # For video understanding, it's preferable to disable patch.
        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
        self.patcher = ImagePatcher(enable_patch=enable_patch)

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of ``self.image_token``."""
        return self.tokenizer.get_vocab()[self.image_token]

    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
        """Return the number of prompt tokens one image of this size expands to."""
        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
        # Every patch block and the global block carry two delimiter tokens.
        return (
            num_patches * (self.num_patch_feature_size + 2)
            + self.num_image_feature_size
            + 2
            + num_newlines
        )

    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
        """Run the patcher over every image."""
        return [self.patcher(img) for img in images]

    def _convert_images_to_pixel_values(
        self,
        images: list[Image.Image],
        is_patch: bool = False,
    ) -> list[torch.Tensor]:
        """Preprocess each image and return its ``pixel_values`` tensor."""
        return [
            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
            for img in images
        ]

    def _get_patch_repl(
        self,
        num_patches: int,
        patch_newline_mask: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Build the text and token ids for ``num_patches`` patch blocks.

        A ``<patch_newline>`` is appended after every patch whose mask entry
        is true. A ``None`` mask means no newlines.
        """
        if num_patches <= 0:
            return "", []
        if patch_newline_mask is not None:
            assert len(patch_newline_mask) == num_patches

        # Resolve ids once: the vocab lookup behind ``image_token_id`` may be
        # costly and none of these values change between patches.
        to_id = self.tokenizer.convert_tokens_to_ids
        block_text = f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
        block_ids = (
            [to_id("<patch_start>")]
            + [self.image_token_id] * self.num_patch_feature_size
            + [to_id("<patch_end>")]
        )
        has_newline = bool(patch_newline_mask) and any(patch_newline_mask)
        newline_id = to_id("<patch_newline>") if has_newline else None

        text_parts: list[str] = []
        token_ids: list[int] = []
        for idx in range(num_patches):
            text_parts.append(block_text)
            token_ids.extend(block_ids)
            if has_newline and patch_newline_mask[idx]:
                text_parts.append("<patch_newline>")
                token_ids.append(newline_id)
        return "".join(text_parts), token_ids

    def _get_image_repl(
        self,
        num_images: int,
    ) -> tuple[str, list[int]]:
        """Build the global-view block, repeated ``num_images`` times."""
        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
        to_id = self.tokenizer.convert_tokens_to_ids
        token_ids = (
            [to_id("<im_start>")]
            + [self.image_token_id] * self.num_image_feature_size
            + [to_id("<im_end>")]
        )
        return text * num_images, token_ids * num_images

    def _get_image_repl_features(
        self,
        num_images: int,
        num_patches: int,
        patch_new_line_idx: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Return patch blocks followed by the global-view block(s)."""
        if num_patches > 0:
            patch_repl, patch_repl_ids = self._get_patch_repl(
                num_patches, patch_new_line_idx
            )
        else:
            patch_repl, patch_repl_ids = "", []
        image_repl, image_repl_ids = self._get_image_repl(num_images)
        return patch_repl + image_repl, patch_repl_ids + image_repl_ids

    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
        """Replace the i-th occurrence of ``placeholder`` with ``repls[i]``.

        Raises:
            ValueError: If the number of occurrences differs from
                ``len(repls)``.
        """
        parts = text.split(placeholder)
        if len(parts) - 1 != len(repls):
            raise ValueError(
                "The number of placeholders does not match the number of replacements."
            )

        pieces = [parts[0]]
        for repl, tail in zip(repls, parts[1:]):
            pieces.append(repl)
            pieces.append(tail)
        return "".join(pieces)

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one ``BatchFeature``.

        Args:
            text: One prompt or a list of prompts. With images, each prompt
                must contain one image placeholder per image.
            images: One image or a list of images.
            return_tensors: Forwarded as ``tensor_type`` to ``BatchFeature``.

        Returns:
            The tokenizer outputs. When images are given it also carries
            ``pixel_values``, ``num_patches``, ``patch_pixel_values`` and
            ``patch_newline_mask``.
        """
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        if len(images) == 0:
            image_inputs = {}
            text_inputs = self.tokenizer(text)
        else:
            pixel_values_lst = []
            patch_pixel_values_lst = []
            patch_newline_mask_lst = []
            image_repl_str_lst = []
            num_patches = []
            for raw_img, img_patches, patch_newline_mask in self._split_images(images):
                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))

                if len(img_patches) > 0:
                    patch_pixel_values_lst.extend(
                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
                    )
                num_patches.append(len(img_patches))

                image_repl_str, _ = self._get_image_repl_features(
                    1, len(img_patches), patch_newline_mask
                )
                image_repl_str_lst.append(image_repl_str)

                if patch_newline_mask is not None:
                    patch_newline_mask_lst.extend(patch_newline_mask)

            pixel_values = torch.cat(pixel_values_lst)
            patch_size = self.patch_size
            if patch_pixel_values_lst:
                patch_pixel_values = torch.cat(patch_pixel_values_lst)
            else:
                # Keep a well-formed empty tensor when no image has patches.
                patch_pixel_values = pixel_values.new_empty(
                    (0, 3, patch_size, patch_size)
                )
            image_inputs = {
                "pixel_values": pixel_values,
                "num_patches": num_patches,
                "patch_pixel_values": patch_pixel_values,
                "patch_newline_mask": torch.tensor(
                    patch_newline_mask_lst, dtype=torch.bool
                ),
            }

            text = [
                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
                for t in text
            ]
            text_inputs = self.tokenizer(text)

        return BatchFeature(
            {
                **text_inputs,
                **image_inputs,
            },
            tensor_type=return_tensors,
        )
class
Step3VLProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
)
->
Step3VLProcessor
:
...
...
vllm/model_executor/models/tarsier.py
View file @
657855ab
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.llava
import
LlavaDummyInputsBuilder
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
,
MultiModalKwargsItems
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
...
...
@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
MultiModalDataItems
,
)
from
vllm.multimodal.processing
import
(
BaseDummyInputsBuilder
,
BaseMultiModalProcessor
,
BaseProcessingInfo
,
InputProcessingContext
,
PromptReplacement
,
PromptUpdate
,
)
...
...
@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
]
def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
    """Create the Tarsier processing info for the given processing context."""
    info = TarsierProcessingInfo(ctx)
    return info
def _build_tarsier_hf_processor(
    info: _I_Tarsier,
    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Build the Tarsier multimodal processor.

    Args:
        info: Processing info; must be a ``TarsierProcessingInfo`` instance.
        dummy_inputs: Dummy-inputs builder forwarded to the processor.
        cache: Optional processor cache forwarded to the processor.

    Raises:
        NotImplementedError: If ``info`` is of an unsupported type; the
            offending type is passed as the exception argument.
    """
    # Guard clause: reject unsupported info types up front.
    if not isinstance(info, TarsierProcessingInfo):
        raise NotImplementedError(type(info))

    return TarsierMultiModalProcessor(
        info,
        dummy_inputs,
        cache=cache,
    )
def
init_vision_tower_for_tarsier
(
hf_config
:
TarsierHfConfig
,
# Use the Tarsier specific config protocol
quant_config
:
QuantizationConfig
|
None
,
...
...
@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(
@
MULTIMODAL_REGISTRY
.
register_processor
(
_build_tarsier_hf_p
rocessor
,
info
=
_build_tarsier_hf_i
nfo
,
TarsierMultiModalP
rocessor
,
info
=
TarsierProcessingI
nfo
,
dummy_inputs
=
TarsierDummyInputsBuilder
,
)
class
TarsierForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment