Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8c0b6267
Unverified
Commit
8c0b6267
authored
Mar 29, 2026
by
allgather
Committed by
GitHub
Mar 29, 2026
Browse files
[Transformers v5] fix missing pixtral/voxtral multimodal dispatch (#38410)
Signed-off-by:
allgather
<
all2allops@gmail.com
>
parent
43cc5138
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
40 additions
and
22 deletions
+40
-22
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+10
-5
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+18
-9
vllm/transformers_utils/processors/pixtral.py
vllm/transformers_utils/processors/pixtral.py
+6
-4
vllm/transformers_utils/processors/voxtral.py
vllm/transformers_utils/processors/voxtral.py
+6
-4
No files found.
vllm/model_executor/models/pixtral.py
View file @
8c0b6267
...
...
@@ -61,7 +61,10 @@ from vllm.platforms import current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.transformers_utils.processors.pixtral
import
MistralCommonPixtralProcessor
from
vllm.transformers_utils.processors.pixtral
import
(
MistralCommonImageProcessor
,
MistralCommonPixtralProcessor
,
)
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
...
...
@@ -128,18 +131,20 @@ class PixtralProcessingInfo(BaseProcessingInfo):
return
tokenizer
def
get_image_processor
(
self
)
->
MistralCommonImageProcessor
:
return
MistralCommonImageProcessor
(
self
.
get_tokenizer
().
instruct
.
mm_encoder
)
def
get_hf_processor
(
self
,
**
kwargs
)
->
MistralCommonPixtralProcessor
:
return
self
.
ctx
.
init_processor
(
MistralCommonPixtralProcessor
,
return
MistralCommonPixtralProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
image_processor
=
self
.
get_image_processor
()
,
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
}
def
get_image_size_with_most_features
(
self
)
->
ImageSize
:
image_processor
=
self
.
get_
hf_processor
().
image_processor
image_processor
=
self
.
get_image_processor
()
max_image_size
=
image_processor
.
mm_encoder
.
mm_config
.
max_image_size
return
ImageSize
(
width
=
max_image_size
,
height
=
max_image_size
)
...
...
vllm/model_executor/models/voxtral.py
View file @
8c0b6267
...
...
@@ -55,7 +55,10 @@ from vllm.multimodal.processing.processor import (
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.transformers_utils.processors.voxtral
import
MistralCommonVoxtralProcessor
from
vllm.transformers_utils.processors.voxtral
import
(
MistralCommonFeatureExtractor
,
MistralCommonVoxtralProcessor
,
)
from
vllm.utils.collection_utils
import
is_list_of
from
.interfaces
import
SupportsLoRA
,
SupportsMultiModal
,
SupportsTranscription
...
...
@@ -84,15 +87,19 @@ class VoxtralProcessingInfo(BaseProcessingInfo):
return
tokenizer
def
get_feature_extractor
(
self
)
->
MistralCommonFeatureExtractor
:
return
MistralCommonFeatureExtractor
(
self
.
get_tokenizer
().
instruct
.
audio_encoder
)
def
get_hf_processor
(
self
,
**
kwargs
)
->
MistralCommonVoxtralProcessor
:
return
self
.
ctx
.
init_processor
(
MistralCommonVoxtralProcessor
,
return
MistralCommonVoxtralProcessor
(
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
feature_extractor
=
self
.
get_feature_extractor
()
,
)
def
get_data_parser
(
self
):
feature_extractor
=
self
.
get_
hf_processor
().
feature_extractor
feature_extractor
=
self
.
get_feature_extractor
()
return
MultiModalDataParser
(
target_sr
=
feature_extractor
.
sampling_rate
,
...
...
@@ -114,7 +121,7 @@ class VoxtralProcessingInfo(BaseProcessingInfo):
return
self
.
ctx
.
model_config
.
max_model_len
def
get_max_audio_array_len
(
self
)
->
int
:
feature_extractor
=
self
.
get_
hf_processor
().
feature_extractor
feature_extractor
=
self
.
get_feature_extractor
()
return
self
.
get_max_audio_tokens
()
*
int
(
feature_extractor
.
sampling_rate
//
feature_extractor
.
frame_rate
...
...
@@ -153,7 +160,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
mm_data
:
MultiModalDataDict
|
None
=
None
,
)
->
ProcessorInputs
:
tokenizer
=
self
.
info
.
get_tokenizer
()
feature_extractor
=
self
.
info
.
get_
hf_processor
().
feature_extractor
feature_extractor
=
self
.
info
.
get_feature_extractor
()
dummy_text
=
self
.
get_dummy_text
(
mm_counts
)
dummy_mm_data
=
(
...
...
@@ -480,8 +487,10 @@ class VoxtralForConditionalGeneration(
This is used for estimating the amount of processing for this audio.
"""
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
adapter
=
MistralCommonVoxtralProcessor
(
tokenizer
)
return
adapter
.
feature_extractor
.
get_num_audio_tokens
(
feature_extractor
=
MistralCommonFeatureExtractor
(
tokenizer
.
instruct
.
audio_encoder
)
return
feature_extractor
.
get_num_audio_tokens
(
int
(
audio_duration_s
*
stt_config
.
sample_rate
)
)
...
...
vllm/transformers_utils/processors/pixtral.py
View file @
8c0b6267
...
...
@@ -50,16 +50,18 @@ class MistralCommonImageProcessor:
class
MistralCommonPixtralProcessor
(
ProcessorMixin
):
attributes
=
[
"image_processor"
,
"tokenizer"
]
def
__init__
(
self
,
tokenizer
:
MistralTokenizer
)
->
None
:
def
__init__
(
self
,
tokenizer
:
MistralTokenizer
,
image_processor
:
MistralCommonImageProcessor
,
)
->
None
:
self
.
tokenizer
=
tokenizer
.
transformers_tokenizer
# Back-compatibility for Transformers v4
if
not
hasattr
(
self
.
tokenizer
,
"init_kwargs"
):
self
.
tokenizer
.
init_kwargs
=
{}
self
.
image_processor
=
MistralCommonImageProcessor
(
tokenizer
.
instruct
.
mm_encoder
)
self
.
image_processor
=
image_processor
image_special_ids
=
self
.
image_processor
.
mm_encoder
.
special_ids
self
.
image_break_id
=
image_special_ids
.
img_break
...
...
vllm/transformers_utils/processors/voxtral.py
View file @
8c0b6267
...
...
@@ -57,16 +57,18 @@ class MistralCommonFeatureExtractor:
class
MistralCommonVoxtralProcessor
(
ProcessorMixin
):
attributes
=
[
"feature_extractor"
,
"tokenizer"
]
def
__init__
(
self
,
tokenizer
:
MistralTokenizer
)
->
None
:
def
__init__
(
self
,
tokenizer
:
MistralTokenizer
,
feature_extractor
:
MistralCommonFeatureExtractor
,
)
->
None
:
self
.
tokenizer
=
tokenizer
.
transformers_tokenizer
# Back-compatibility for Transformers v4
if
not
hasattr
(
self
.
tokenizer
,
"init_kwargs"
):
self
.
tokenizer
.
init_kwargs
=
{}
self
.
feature_extractor
=
MistralCommonFeatureExtractor
(
tokenizer
.
instruct
.
audio_encoder
)
self
.
feature_extractor
=
feature_extractor
audio_special_ids
=
self
.
feature_extractor
.
audio_encoder
.
special_ids
self
.
audio_token_id
=
audio_special_ids
.
audio
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment