Unverified commit 40077ea3, authored by Andreas Karatzas and committed by GitHub
Browse files

[CI] fix flaky empty responses and add diagnostic assertions in vision chat tests (#36341)


Signed-off-by: Andreas Karatzas <akaratza@amd.com>
parent 5d6aae45
...@@ -6,7 +6,7 @@ import json ...@@ -6,7 +6,7 @@ import json
import pytest import pytest
from ...utils import RemoteOpenAIServer from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from .conftest import add_attention_backend from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [ MISTRAL_FORMAT_ARGS = [
...@@ -19,12 +19,55 @@ MISTRAL_FORMAT_ARGS = [ ...@@ -19,12 +19,55 @@ MISTRAL_FORMAT_ARGS = [
] ]
async def transcribe_and_check(
client,
model_name: str,
file,
*,
language: str,
expected_text: str,
expected_seconds: int | None = None,
case_sensitive: bool = False,
):
"""Run a transcription request and assert the output contains
*expected_text* and optionally that usage reports *expected_seconds*.
Provides detailed failure messages with the actual transcription output.
"""
transcription = await client.audio.transcriptions.create(
model=model_name,
file=file,
language=language,
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
if case_sensitive:
assert expected_text in out_text, (
f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
)
else:
assert expected_text.lower() in out_text.lower(), (
f"Expected {expected_text!r} (case-insensitive) in transcription "
f"output, got: {out_text!r}"
)
if expected_seconds is not None:
assert out_usage["seconds"] == expected_seconds, (
f"Expected {expected_seconds}s of audio, "
f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"] "model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
) )
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager"] server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
if model_name.startswith("mistralai"): if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS server_args += MISTRAL_FORMAT_ARGS
...@@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): ...@@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
add_attention_backend(server_args, rocm_aiter_fa_attention) add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create( await transcribe_and_check(
model=model_name, client,
file=mary_had_lamb, model_name,
mary_had_lamb,
language="en", language="en",
response_format="text", expected_text="Mary had a little lamb",
temperature=0.0, expected_seconds=16,
) )
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "Mary had a little lamb" in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): ...@@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
add_attention_backend(server_args, rocm_aiter_fa_attention) add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create( await transcribe_and_check(
model=lora_model_name, client,
file=mary_had_lamb, lora_model_name,
mary_had_lamb,
language="en", language="en",
response_format="text", expected_text="mary had a little lamb",
temperature=0.0, expected_seconds=16,
) )
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "mary had a little lamb" in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): ...@@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name): async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
# Gemma accuracy on some of the audio samples we use is particularly bad, # Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately. # hence we use a different one here. WER is evaluated separately.
server_args = ["--enforce-eager"] server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
add_attention_backend(server_args, rocm_aiter_fa_attention) add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer( with RemoteOpenAIServer(
model_name, server_args, max_wait_seconds=480 model_name,
server_args,
max_wait_seconds=480,
env_dict=ROCM_ENV_OVERRIDES,
) as remote_server: ) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create( await transcribe_and_check(
model=model_name, client,
file=foscolo, model_name,
foscolo,
language="it", language="it",
response_format="text", expected_text="ove il mio corpo fanciulletto giacque",
temperature=0.0,
) )
out = json.loads(transcription)["text"]
assert "ove il mio corpo fanciulletto giacque" in out
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment