Commit 615e8033 (unverified), authored by andrii.pasternak and committed by GitHub
Browse files

[Bug Fix] Handle variable-length tensors in MultiModalFlatField batching (#31751)


Signed-off-by: Andrii Pasternak <andriipasternak31@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
parent d09135fb
...@@ -156,6 +156,51 @@ def test_models_with_multiple_audios( ...@@ -156,6 +156,51 @@ def test_models_with_multiple_audios(
) )
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
def test_variable_length_audio_batching(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
) -> None:
    """Generate for several single-audio prompts submitted as one batch.

    Each request carries exactly one clip taken from ``audio_assets``; the
    clips are expected to differ in duration, which is what drives the
    variable-length tensor handling in ``MultiModalFlatField._reduce_data()``
    (regression test for
    https://github.com/vllm-project/vllm/issues/31658).

    The test only checks that every request yields a non-empty result; it
    does not compare the generated text against a reference.
    """
    # Skip (rather than fail) when the model or a compatible transformers
    # version is unavailable in this environment.
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
    hf_info.check_available_online(on_fail="skip")
    hf_info.check_transformers_version(on_fail="skip")

    # Build two parallel lists: one prompt and one single-clip audio list
    # per request, pairing each asset with its question in order.
    prompts = []
    audios_per_prompt = []
    for asset, question in zip(audio_assets, AUDIO_PROMPTS):
        prompts.append(_get_prompt(1, question, VLLM_PLACEHOLDER))
        audios_per_prompt.append([asset.audio_and_sample_rate])

    runner_kwargs = {
        "dtype": dtype,
        "enforce_eager": True,
        "limit_mm_per_prompt": {"audio": 1},
    }
    with vllm_runner(MODEL_NAME, **runner_kwargs) as vllm_model:
        # Submitting every prompt in a single call is what exercises the
        # variable-length batching path.
        outputs = vllm_model.generate_greedy(
            prompts,
            max_tokens,
            audios=audios_per_prompt,
        )

    # One result per request, and each result must carry some output.
    assert len(outputs) == len(prompts)
    for result in outputs:
        assert len(result[1]) > 0, "Expected non-empty output"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets): async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled.""" """Exercises online serving with/without chunked prefill enabled."""
......
...@@ -604,6 +604,47 @@ class MultiModalFlatField(BaseMultiModalField): ...@@ -604,6 +604,47 @@ class MultiModalFlatField(BaseMultiModalField):
) )
return torch.concat(batch, dim=self.dim, out=out) return torch.concat(batch, dim=self.dim, out=out)
# Variable-length case: non-concat dimensions differ
# (e.g., Ultravox with different audio durations).
# Use slice-assign approach (more efficient than padding).
# See: https://github.com/vllm-project/vllm/issues/31658
ndim = batch[0].ndim
# Step 1: Compute output shape
# - Non-concat dims: take max across batch
# - Concat dim: sum across batch
max_sizes: list[int] = []
for d in range(ndim):
if d == dim:
max_sizes.append(sum(t.shape[d] for t in batch))
else:
max_sizes.append(max(t.shape[d] for t in batch))
# Step 2: Create zero-initialized output tensor
out = torch.zeros(
max_sizes,
dtype=batch[0].dtype,
device=batch[0].device,
pin_memory=pin_memory,
)
# Step 3: Slice-assign each tensor to its proper position
concat_offset = 0
for tensor in batch:
slices: list[slice] = []
for d in range(ndim):
if d == dim:
slices.append(
slice(concat_offset, concat_offset + tensor.shape[d])
)
else:
slices.append(slice(0, tensor.shape[d]))
out[tuple(slices)] = tensor
concat_offset += tensor.shape[dim]
return out
assert self.dim == 0, "dim == 0 is required for nested list" assert self.dim == 0, "dim == 0 is required for nested list"
return [e for elem in batch for e in elem] return [e for elem in batch for e in elem]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment