Unverified Commit 1100a976 authored by Patrick von Platen, committed by GitHub
Browse files

[Voxtral Realtime] Enable tests (#33803)


Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent 766e1678
...@@ -27,15 +27,6 @@ MISTRAL_FORMAT_ARGS = [ ...@@ -27,15 +27,6 @@ MISTRAL_FORMAT_ARGS = [
MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602" MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
def _audio_to_base64_pcm16(path: str, target_sr: int = 16000) -> str:
    """Load an audio file and return it as base64-encoded PCM16 text.

    The file is loaded with ``librosa.load`` as mono audio at ``target_sr``,
    scaled to 16-bit integers, and the raw sample bytes are base64-encoded.

    Args:
        path: Path of the audio file to load.
        target_sr: Sample rate passed to ``librosa.load`` (default 16000).

    Returns:
        The base64 encoding, as a ``str``, of the int16 sample bytes.
    """
    audio, _ = librosa.load(path, sr=target_sr, mono=True)
    # Samples are expected in [-1, 1]; clamp first so that any value
    # outside that range saturates instead of overflowing int16.
    audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return base64.b64encode(audio_int16.tobytes()).decode("utf-8")
def _get_websocket_url(server: RemoteOpenAIServer) -> str: def _get_websocket_url(server: RemoteOpenAIServer) -> str:
"""Convert HTTP URL to WebSocket URL for realtime endpoint.""" """Convert HTTP URL to WebSocket URL for realtime endpoint."""
http_url = server.url_root http_url = server.url_root
...@@ -74,12 +65,11 @@ def mary_had_lamb_audio_chunks() -> list[str]: ...@@ -74,12 +65,11 @@ def mary_had_lamb_audio_chunks() -> list[str]:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
async def test_multi_chunk_streaming( async def test_multi_chunk_streaming(
model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
): ):
"""Test streaming multiple audio chunks before committing.""" """Test streaming multiple audio chunks before committing."""
server_args = ["--enforce-eager"] server_args = ["--enforce-eager", "--max-model-len", "2048"]
if model_name.startswith("mistralai"): if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS server_args += MISTRAL_FORMAT_ARGS
......
...@@ -73,7 +73,6 @@ def async_engine() -> AsyncLLM: ...@@ -73,7 +73,6 @@ def async_engine() -> AsyncLLM:
return AsyncLLM.from_engine_args(engine_args) return AsyncLLM.from_engine_args(engine_args)
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
def test_voxtral_realtime_forward(audio_assets, tokenizer, engine): def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
audio_config = tokenizer.instruct_tokenizer.tokenizer.audio audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
...@@ -218,7 +217,6 @@ class RealTimeAudioInput: ...@@ -218,7 +217,6 @@ class RealTimeAudioInput:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine): async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
sampling_params = SamplingParams(temperature=0.0, max_tokens=1) sampling_params = SamplingParams(temperature=0.0, max_tokens=1)
......
...@@ -441,6 +441,13 @@ def test_processing_correctness( ...@@ -441,6 +441,13 @@ def test_processing_correctness(
"Qwen-VL tokenizer requires downloading a font file from " "Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI" "servers that often refuse connections in CI"
) )
if model_id == "mistralai/Voxtral-Mini-4B-Realtime-2602":
    # Adjacent string literals are joined with no separator, so every
    # fragment must carry its own trailing space.
    pytest.skip(
        "Voxtral Realtime doesn't make use of any place-holder "
        "tokens and hence cannot pass the processing "
        "correctness test as is. Let's revisit adapting this "
        "test once more realtime models exist."
    )
if model_id == "internlm/Intern-S1-Pro": if model_id == "internlm/Intern-S1-Pro":
# FIXME(Isotr0py): Fix later. # FIXME(Isotr0py): Fix later.
pytest.skip("Tokenization issue. Fix later") pytest.skip("Tokenization issue. Fix later")
......
...@@ -1031,13 +1031,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1031,13 +1031,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
), ),
"VoxtralForConditionalGeneration": _HfExamplesInfo( "VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507", "mistralai/Voxtral-Mini-3B-2507",
# disable this temporarily until we support HF format tokenizer_mode="mistral",
is_available_online=False,
), ),
"VoxtralRealtimeGeneration": _HfExamplesInfo( "VoxtralRealtimeGeneration": _HfExamplesInfo(
"<place-holder>", "mistralai/Voxtral-Mini-4B-Realtime-2602",
# disable this temporarily until we support HF format enforce_eager=True,
is_available_online=False, tokenizer_mode="mistral",
), ),
# [Encoder-decoder] # [Encoder-decoder]
"NemotronParseForConditionalGeneration": _HfExamplesInfo( "NemotronParseForConditionalGeneration": _HfExamplesInfo(
......
...@@ -54,6 +54,7 @@ from vllm.multimodal.processing.processor import ( ...@@ -54,6 +54,7 @@ from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
MultiModalProcessingInfo, MultiModalProcessingInfo,
PlaceholderFeaturesInfo,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
) )
...@@ -283,6 +284,15 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]) ...@@ -283,6 +284,15 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
return dict(audio_arrays=MultiModalFieldConfig.batched("audio")) return dict(audio_arrays=MultiModalFieldConfig.batched("audio"))
def _validate_mm_placeholders(
    self,
    mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
    mm_item_counts: Mapping[str, int],
) -> None:
    """Skip multimodal placeholder validation (intentional no-op).

    Both arguments are accepted but ignored.
    """
    # mistral_common's tokenizer does not follow HF's placeholder norms,
    # so validation is skipped here.
    ...
def _get_prompt_updates( def _get_prompt_updates(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment