Unverified commit 912fbe95, authored by Isotr0py, committed by GitHub
Browse files

[Bugfix] Fix Qwen2.5-Omni/Qwen3-Omni use_audio_in_video with multi-video inputs (#37147)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 52131f88
...@@ -18,10 +18,10 @@ MODEL_NAME = "Qwen/Qwen2.5-Omni-3B" ...@@ -18,10 +18,10 @@ MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
def server(): def server():
args = [ args = [
"--max-model-len", "--max-model-len",
"8192", "16384",
"--enforce-eager", "--enforce-eager",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
json.dumps({"audio": 1, "video": 1}), json.dumps({"audio": 3, "video": 3}),
] ]
with RemoteOpenAIServer( with RemoteOpenAIServer(
...@@ -78,3 +78,98 @@ async def test_online_audio_in_video( ...@@ -78,3 +78,98 @@ async def test_online_audio_in_video(
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_multi_videos(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Send two videos in one request with `use_audio_in_video=True`.

    The same request is issued twice; each response must contain exactly one
    choice whose finish reason is ``"length"``.
    """
    # The `video_urls` defined earlier in this file are not used here because
    # those videos have no audio stream; a local asset is inlined instead.
    with open(video_assets[0].video_path, "rb") as video_file:
        raw_video = video_file.read()
    video_base64 = base64.b64encode(raw_video).decode("utf-8")

    # Two identical (but distinct) video parts following the text prompt.
    video_parts = [
        {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
        }
        for _ in range(2)
    ]
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in these two videos?"},
                *video_parts,
            ],
        }
    ]

    request_kwargs = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 16,
        "extra_body": {
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            }
        },
    }

    # Repeat the request so the mm processor cache path is exercised as well.
    for _attempt in range(2):
        response = await client.chat.completions.create(**request_kwargs)

        assert len(response.choices) == 1
        first_choice = response.choices[0]
        assert first_choice.finish_reason == "length"
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_interleaved(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Check that video/audio/video input is rejected with `use_audio_in_video=True`.

    The request carries two video items and one audio item, and the server is
    expected to answer with a ``BadRequestError`` about the item-count mismatch.
    """
    # The `video_urls` defined earlier in this file are not used here because
    # those videos have no audio stream; a local asset is inlined instead.
    with open(video_assets[0].video_path, "rb") as video_file:
        raw_video = video_file.read()
    video_base64 = base64.b64encode(raw_video).decode("utf-8")

    # Content order matters: text, video, audio, video.
    prompt_part = {"type": "text", "text": "What's in these two videos?"}
    first_video_part = {
        "type": "video_url",
        "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
    }
    audio_part = {
        "type": "audio_url",
        "audio_url": {"url": f"data:audio/mp4;base64,{video_base64}"},
    }
    second_video_part = {
        "type": "video_url",
        "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
    }
    messages = [
        {
            "role": "user",
            "content": [
                prompt_part,
                first_video_part,
                audio_part,
                second_video_part,
            ],
        }
    ]

    request_kwargs = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 16,
        "extra_body": {
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            }
        },
    }

    expected_error = pytest.raises(
        openai.BadRequestError,
        match="use_audio_in_video requires equal number of audio and video items",
    )
    with expected_error:
        await client.chat.completions.create(**request_kwargs)
...@@ -34,8 +34,22 @@ MODELS = [ ...@@ -34,8 +34,22 @@ MODELS = [
] ]
def create_mm_data(num_videos: int) -> dict[str, list]:
    """Build `num_videos` random video clips, each paired with a random audio clip.

    Item `i` is generated from `np.random.RandomState(i)`, so the data is
    reproducible across calls. Returns a dict with a `"video"` list and an
    `"audio"` list of `(audio, sample_rate)` tuples, both of length
    `num_videos`.
    """
    videos: list = []
    audios: list = []
    for seed in range(num_videos):
        rng = np.random.RandomState(seed)
        # Small video (8 frames, 64×64) and ~0.5 s of audio at 16 kHz so the
        # test stays fast even without a GPU. The video is drawn before the
        # audio from the same generator; keep that order.
        clip = random_video(rng, min_frames=8, max_frames=9, min_wh=64, max_wh=65)
        waveform, sample_rate = random_audio(
            rng, min_len=8000, max_len=8001, sr=16000
        )
        videos.append(clip)
        audios.append((waveform, sample_rate))
    return {"video": videos, "audio": audios}
@pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("model_id", MODELS)
def test_audio_in_video_cache_correctness(model_id: str) -> None: @pytest.mark.parametrize("num_videos", [1, 2])
def test_audio_in_video_cache_correctness(model_id: str, num_videos: int) -> None:
""" """
Regression test for https://github.com/vllm-project/vllm/pull/36800 Regression test for https://github.com/vllm-project/vllm/pull/36800
...@@ -47,7 +61,7 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None: ...@@ -47,7 +61,7 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None:
""" """
ctx = build_model_context( ctx = build_model_context(
model_id, model_id,
limit_mm_per_prompt={"audio": 1, "image": 0, "video": 1}, limit_mm_per_prompt={"audio": num_videos, "image": 0, "video": num_videos},
mm_processor_cache_gb=1, mm_processor_cache_gb=1,
) )
...@@ -65,17 +79,12 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None: ...@@ -65,17 +79,12 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None:
video_token_id = baseline_processor.info.get_hf_config().video_token_id video_token_id = baseline_processor.info.get_hf_config().video_token_id
rng = np.random.RandomState(0) mm_data = create_mm_data(num_videos)
# Small video (8 frames, 64×64) and ~0.5 s of audio at 16 kHz so the test
# stays fast even without a GPU.
video = random_video(rng, min_frames=8, max_frames=9, min_wh=64, max_wh=65)
audio, sr = random_audio(rng, min_len=8000, max_len=8001, sr=16000)
mm_data = {"video": [video], "audio": [(audio, sr)]}
hf_processor_mm_kwargs = {"use_audio_in_video": True} hf_processor_mm_kwargs = {"use_audio_in_video": True}
def run(processor): def run(processor):
return processor( return processor(
[video_token_id], [video_token_id] * num_videos,
mm_items=baseline_processor.info.parse_mm_data(mm_data), mm_items=baseline_processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)["prompt_token_ids"] )["prompt_token_ids"]
......
...@@ -774,9 +774,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor( ...@@ -774,9 +774,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
def get_replacement_qwen2_use_audio_in_video(item_idx: int): def get_replacement_qwen2_use_audio_in_video(item_idx: int):
nonlocal audio_in_video_item_idx nonlocal audio_in_video_item_idx
audio_num_features = audio_output_lengths[ audio_num_features = audio_output_lengths[audio_in_video_item_idx]
audio_in_video_item_idx + item_idx
]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx] video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1 audio_in_video_item_idx += 1
......
...@@ -1489,9 +1489,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( ...@@ -1489,9 +1489,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
def get_replacement_qwen2_use_audio_in_video(item_idx: int): def get_replacement_qwen2_use_audio_in_video(item_idx: int):
nonlocal audio_in_video_item_idx nonlocal audio_in_video_item_idx
audio_num_features = audio_output_lengths[ audio_num_features = audio_output_lengths[audio_in_video_item_idx]
audio_in_video_item_idx + item_idx
]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx] video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1 audio_in_video_item_idx += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment