[Voxtral Realtime] Enable tests (#33803)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>

[Voxtral Realtime] Enable tests (#33803)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
1100a976 · Patrick von Platen · GitHub · 766e1678 · 1100a976 · 1100a976
Unverified Commit 1100a976 authored Feb 12, 2026 by Patrick von Platen Committed by GitHub Feb 12, 2026
5 changed files
--- a/tests/entrypoints/openai/test_realtime_validation.py
+++ b/tests/entrypoints/openai/test_realtime_validation.py
@@ -27,15 +27,6 @@ MISTRAL_FORMAT_ARGS = [
 MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
-def _audio_to_base64_pcm16(path: str, target_sr: int = 16000) -> str:
-    """Load audio file, convert to PCM16 @ target sample rate, base64 encode."""
-    audio, _ = librosa.load(path, sr=target_sr, mono=True)
-    # Convert float32 [-1, 1] to int16 [-32768, 32767]
-    audio_int16 = (audio * 32767).astype(np.int16)
-    audio_bytes = audio_int16.tobytes()
-    return base64.b64encode(audio_bytes).decode("utf-8")
 def _get_websocket_url(server: RemoteOpenAIServer) -> str:
    """Convert HTTP URL to WebSocket URL for realtime endpoint."""
    http_url = server.url_root
@@ -74,12 +65,11 @@ def mary_had_lamb_audio_chunks() -> list[str]:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.skip(reason="Voxtral streaming is not yet public")
 async def test_multi_chunk_streaming(
    model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
 ):
    """Test streaming multiple audio chunks before committing."""
-    server_args = ["--enforce-eager"]
+    server_args = ["--enforce-eager", "--max-model-len", "2048"]
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

--- a/tests/models/multimodal/generation/test_voxtral_realtime.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -73,7 +73,6 @@ def async_engine() -> AsyncLLM:
    return AsyncLLM.from_engine_args(engine_args)
-@pytest.mark.skip(reason="Voxtral streaming is not yet public")
 def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
    audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
@@ -218,7 +217,6 @@ class RealTimeAudioInput:
 @pytest.mark.asyncio
-@pytest.mark.skip(reason="Voxtral streaming is not yet public")
 async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
    sampling_params = SamplingParams(temperature=0.0, max_tokens=1)

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -441,6 +441,13 @@ def test_processing_correctness(
            "Qwen-VL tokenizer requires downloading a font file from "
            "servers that often refuse connections in CI"
        )
+    if model_id == "mistralai/Voxtral-Mini-4B-Realtime-2602":
+        pytest.skip(
+            "Voxtral Realtime doesn't make use of any place-holder"
+            "tokens and hence cannot pass the processing "
+            "correctness test as is. Let's revisit adapting this "
+            "test once more realtime models exist."
+        )
    if model_id == "internlm/Intern-S1-Pro":
        # FIXME(Isotr0py): Fix later.
        pytest.skip("Tokenization issue. Fix later")

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1031,13 +1031,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "VoxtralForConditionalGeneration": _HfExamplesInfo(
        "mistralai/Voxtral-Mini-3B-2507",
-        # disable this temporarily until we support HF format
+        tokenizer_mode="mistral",
-        is_available_online=False,
    ),
    "VoxtralRealtimeGeneration": _HfExamplesInfo(
-        "<place-holder>",
+        "mistralai/Voxtral-Mini-4B-Realtime-2602",
-        # disable this temporarily until we support HF format
+        enforce_eager=True,
-        is_available_online=False,
+        tokenizer_mode="mistral",
    ),
    # [Encoder-decoder]
    "NemotronParseForConditionalGeneration": _HfExamplesInfo(

--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -54,6 +54,7 @@ from vllm.multimodal.processing.processor import (
    BaseMultiModalProcessor,
    BaseProcessingInfo,
    MultiModalProcessingInfo,
+    PlaceholderFeaturesInfo,
    PromptReplacement,
    PromptUpdate,
 )
@@ -283,6 +284,15 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(audio_arrays=MultiModalFieldConfig.batched("audio"))
+    def _validate_mm_placeholders(
+        self,
+        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+        mm_item_counts: Mapping[str, int],
+    ) -> None:
+        # mistral_common's tokenizer does not follow HF's placeholder norms,
+        # so placeholder validation is intentionally skipped here.
+        ...
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,