Unverified commit 143e4dcc, authored by Isotr0py, committed by GitHub
Browse files

[Misc] Add online audio_in_video test (#36775)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 6590a3ec
...@@ -10,6 +10,7 @@ pytest-cov ...@@ -10,6 +10,7 @@ pytest-cov
# testing utils # testing utils
albumentations # required for Nemotron Parse in test_common.py albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test backoff # required for phi4mm test
blobfile # required for kimi-vl test blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl einops # required for MPT, qwen-vl
......
...@@ -62,6 +62,8 @@ attrs==24.2.0 ...@@ -62,6 +62,8 @@ attrs==24.2.0
# referencing # referencing
audioread==3.0.1 audioread==3.0.1
# via librosa # via librosa
av==16.1.0
# via -r requirements/test.in
backoff==2.2.1 backoff==2.2.1
# via # via
# -r requirements/test.in # -r requirements/test.in
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import openai
import pytest
import pytest_asyncio
from ...conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer
# Model launched by the `server` fixture and named in the chat completion
# request issued by the test in this file.
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
@pytest.fixture
def server():
    """Start a `RemoteOpenAIServer` for `MODEL_NAME` and yield it.

    The server is launched with an 8192-token max model length, eager
    execution enforced, and a multimodal limit of one audio and one video
    item per prompt. It lives for as long as the `with` block is open.
    """
    # The per-prompt multimodal limit is passed to the CLI as a JSON string.
    mm_limits = json.dumps({"audio": 1, "video": 1})
    cli_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        mm_limits,
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from the `server` fixture.

    The client is used as an async context manager, so it is released when
    the fixture is torn down.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Send a video chat request with `use_audio_in_video` enabled.

    The first video asset is inlined as a base64 `data:` URL and sent twice
    with the same messages; each response must contain exactly one choice
    that stopped because the 16-token limit was reached.
    """
    # The video is read from the local asset file rather than referenced by a
    # remote URL; per the original author's note, the URL-hosted videos lack
    # an audio stream.
    with open(video_assets[0].video_path, "rb") as video_file:
        encoded_video = base64.b64encode(video_file.read()).decode("utf-8")

    text_part = {"type": "text", "text": "What's in this video?"}
    video_part = {
        "type": "video_url",
        "video_url": {"url": f"data:video/mp4;base64,{encoded_video}"},
    }
    messages = [{"role": "user", "content": [text_part, video_part]}]
    request_options = {"mm_processor_kwargs": {"use_audio_in_video": True}}

    # Repeat the request so the multimodal processor cache is exercised too.
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body=request_options,
        )

        assert len(chat_completion.choices) == 1
        assert chat_completion.choices[0].finish_reason == "length"
...@@ -4,6 +4,7 @@ import base64 ...@@ -4,6 +4,7 @@ import base64
from pathlib import Path from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
import librosa
import numpy as np import numpy as np
import pytest import pytest
...@@ -71,3 +72,13 @@ def test_audio_media_io_encode_base64(dummy_audio): ...@@ -71,3 +72,13 @@ def test_audio_media_io_encode_base64(dummy_audio):
decoded = base64.b64decode(out) decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data" assert decoded == b"dummy_wav_data"
mock_write.assert_called_once() mock_write.assert_called_once()
def test_audio_media_io_from_video(video_assets):
    """Compare `AudioMediaIO.load_bytes` on a video file against librosa.

    The raw bytes of the first video asset are decoded through
    `AudioMediaIO`, and the result must match `librosa.load` on the same
    path (native sample rate, `sr=None`) in both sample rate and samples.
    """
    media_io = AudioMediaIO()
    video_path = video_assets[0].video_path

    with open(video_path, "rb") as video_file:
        raw_bytes = video_file.read()
    decoded_audio, decoded_sr = media_io.load_bytes(raw_bytes)

    expected_audio, expected_sr = librosa.load(video_path, sr=None)

    assert decoded_sr == expected_sr
    # Argument order is kept as (librosa result, AudioMediaIO result).
    np.testing.assert_allclose(expected_audio, decoded_audio, atol=1e-4)
...@@ -506,6 +506,7 @@ class OpenAIServingRender: ...@@ -506,6 +506,7 @@ class OpenAIServingRender:
(ResponsesRequest not supported here); TODO comment dropped accordingly. (ResponsesRequest not supported here); TODO comment dropped accordingly.
""" """
renderer = self.renderer renderer = self.renderer
mm_config = self.model_config.multimodal_config
default_template_kwargs = merge_kwargs( default_template_kwargs = merge_kwargs(
default_template_kwargs, default_template_kwargs,
...@@ -518,7 +519,11 @@ class OpenAIServingRender: ...@@ -518,7 +519,11 @@ class OpenAIServingRender:
tok_params = request.build_tok_params(self.model_config) tok_params = request.build_tok_params(self.model_config)
chat_params = request.build_chat_params( chat_params = request.build_chat_params(
default_template, default_template_content_format default_template, default_template_content_format
).with_defaults(default_template_kwargs) ).with_defaults(
default_template_kwargs,
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
)
(conversation,), (engine_prompt,) = await renderer.render_chat_async( (conversation,), (engine_prompt,) = await renderer.render_chat_async(
[messages], [messages],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment