[Voxtral models] Skip warm-up to skip confusing error message in warm-up (#33576)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[Voxtral models] Skip warm-up to skip confusing error message in warm-up (#33576)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
f0d52517 · Patrick von Platen · GitHub · 5c4f2dd6 · f0d52517 · f0d52517
Unverified Commit f0d52517 authored Feb 03, 2026 by Patrick von Platen Committed by GitHub Feb 03, 2026
3 changed files
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -138,6 +138,9 @@ class OpenAISpeechToText(OpenAIServing):
        if not supports_transcription(self.model_cls):
            return
+        if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
+            return
        try:
            warmup_start = time.perf_counter()
            logger.info("Warming up audio preprocessing libraries...")
@@ -150,9 +153,7 @@ class OpenAISpeechToText(OpenAIServing):
            _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)
            # Warm up mel-spectrogram computation with model-specific parameters
-            from vllm.transformers_utils.processor import (
+            from vllm.transformers_utils.processor import cached_processor_from_config
-                cached_processor_from_config,
-            )
            processor = cached_processor_from_config(self.model_config)
            feature_extractor = None

--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -335,6 +335,9 @@ class VoxtralForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
    supported_languages = ISO639_1_SUPPORTED_LANGS
+    # transformers currently has limited support for the MistralCommon
+    # backend and cached_get_processor, so skip warm-up until that is fixed.
+    skip_warmup_audio_preprocessing = True
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],

--- a/vllm/model_executor/models/voxtral_realtime.py
+++ b/vllm/model_executor/models/voxtral_realtime.py
@@ -218,6 +218,9 @@ class VoxtralRealtimeBuffer:
 @support_torch_compile
 class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
    requires_raw_input_tokens = True
+    # transformers currently has limited support for the MistralCommon
+    # backend and cached_get_processor, so skip warm-up until that is fixed.
+    skip_warmup_audio_preprocessing = True
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)