[Bug Fix] Handle variable-length tensors in MultiModalFlatField batching (#31751)

Signed-off-by: Andrii Pasternak <andriipasternak31@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

[Bug Fix] Handle variable-length tensors in MultiModalFlatField batching (#31751)
Signed-off-by: Andrii Pasternak <andriipasternak31@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
615e8033 · andrii.pasternak · GitHub · d09135fb · 615e8033 · 615e8033
Unverified commit 615e8033, authored Jan 29, 2026 by andrii.pasternak; committed by GitHub on Jan 29, 2026.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 86 additions and 0 deletions

tests/models/multimodal/generation/test_ultravox.py +45 -0

vllm/multimodal/inputs.py +41 -0

No files found.
--- a/tests/models/multimodal/generation/test_ultravox.py
+++ b/tests/models/multimodal/generation/test_ultravox.py
@@ -156,6 +156,51 @@ def test_models_with_multiple_audios(
    )
+@pytest.mark.core_model
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+def test_variable_length_audio_batching(
+    vllm_runner,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Test batching of requests with different audio durations.
+
+    This exercises the variable-length tensor handling in
+    MultiModalFlatField._reduce_data() which was buggy before
+    https://github.com/vllm-project/vllm/issues/31658 was fixed.
+
+    One single-audio prompt is built per (audio asset, question) pair and
+    all prompts are passed to a single ``generate_greedy`` call. The test
+    only checks that one non-empty output comes back per request; it does
+    not compare the generated content against any reference.
+    """
+    # Both checks are invoked with on_fail="skip", so an unavailable model or
+    # an unsupported transformers version skips the test instead of failing it.
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+    # Create prompts with single audio each (different durations)
+    # NOTE(review): this relies on the entries of audio_assets differing in
+    # duration; nothing here asserts it -- confirm against the fixture.
+    prompts_and_audios = []
+    # zip() stops at the shorter of audio_assets / AUDIO_PROMPTS, so any
+    # unmatched assets or questions are silently left out.
+    for audio, question in zip(audio_assets, AUDIO_PROMPTS):
+        prompt = _get_prompt(1, question, VLLM_PLACEHOLDER)
+        prompts_and_audios.append((prompt, [audio.audio_and_sample_rate]))
+    # limit_mm_per_prompt matches the one-element audio list built per prompt.
+    with vllm_runner(
+        MODEL_NAME,
+        dtype=dtype,
+        enforce_eager=True,
+        limit_mm_per_prompt={"audio": 1},
+    ) as vllm_model:
+        # Generate for all prompts in a single batch
+        # This triggers the variable-length batching code path
+        outputs = vllm_model.generate_greedy(
+            [prompt for prompt, _ in prompts_and_audios],
+            max_tokens,
+            audios=[audios for _, audios in prompts_and_audios],
+        )
+    # Verify outputs were generated for each request
+    assert len(outputs) == len(prompts_and_audios)
+    for output in outputs:
+        # output[1] is presumably the generated text of the request --
+        # confirm against the return type of generate_greedy.
+        assert len(output[1]) > 0, "Expected non-empty output"
 @pytest.mark.asyncio
 async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with/without chunked prefill enabled."""

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -604,6 +604,47 @@ class MultiModalFlatField(BaseMultiModalField):
                )
                return torch.concat(batch, dim=self.dim, out=out)
+            # Variable-length case: non-concat dimensions differ
+            # (e.g., Ultravox with different audio durations).
+            # Use slice-assign approach (more efficient than padding).
+            # See: https://github.com/vllm-project/vllm/issues/31658
+            ndim = batch[0].ndim
+            # Step 1: Compute output shape
+            # - Non-concat dims: take max across batch
+            # - Concat dim: sum across batch
+            max_sizes: list[int] = []
+            for d in range(ndim):
+                if d == dim:
+                    max_sizes.append(sum(t.shape[d] for t in batch))
+                else:
+                    max_sizes.append(max(t.shape[d] for t in batch))
+            # Step 2: Create zero-initialized output tensor
+            out = torch.zeros(
+                max_sizes,
+                dtype=batch[0].dtype,
+                device=batch[0].device,
+                pin_memory=pin_memory,
+            )
+            # Step 3: Slice-assign each tensor to its proper position
+            concat_offset = 0
+            for tensor in batch:
+                slices: list[slice] = []
+                for d in range(ndim):
+                    if d == dim:
+                        slices.append(
+                            slice(concat_offset, concat_offset + tensor.shape[d])
+                        )
+                    else:
+                        slices.append(slice(0, tensor.shape[d]))
+                out[tuple(slices)] = tensor
+                concat_offset += tensor.shape[dim]
+            return out
        assert self.dim == 0, "dim == 0 is required for nested list"
        return [e for elem in batch for e in elem]