Merge tag 'v0.6.0' into v0.6.0-dev

0640f227 · zhuwenwen · 82f1ffdf · 32e7db25 · 0640f227 · 0640f227
Commit 0640f227 authored Sep 09, 2024 by zhuwenwen
20 changed files
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
+from typing import Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
+    causal_conv1d_fn, causal_conv1d_update)
+
+
+def causal_conv1d_ref(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    initial_states: Optional[torch.Tensor] = None,
+    return_final_states: bool = False,
+    final_states_out: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+):
+    """
+    Pure-PyTorch reference for the causal depthwise 1D convolution.
+
+    x: (batch, dim, seqlen)
+    weight: (dim, width)
+    bias: (dim,)
+    initial_states: (batch, dim, width - 1)
+    final_states_out: (batch, dim, width - 1)
+
+    out: (batch, dim, seqlen)
+
+    Returns a tuple ``(out, final_states)``; the second element is None
+    unless ``return_final_states`` is set. When ``final_states_out`` is
+    given it is filled in place and returned as the second element.
+
+    Raises NotImplementedError if ``activation`` is not None, "silu" or
+    "swish".
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    # Compute in the weight dtype; results are cast back to dtype_in below.
+    dtype_in = x.dtype
+    x = x.to(weight.dtype)
+    seqlen = x.shape[-1]
+    dim, width = weight.shape
+    if initial_states is None:
+        # groups=dim makes the convolution depthwise (one filter per channel).
+        # padding=width - 1 pads both ends; the surplus tail is dropped by
+        # the [:seqlen] slice below.
+        out = F.conv1d(x,
+                       weight.unsqueeze(1),
+                       bias,
+                       padding=width - 1,
+                       groups=dim)
+    else:
+        # The supplied states take the place of the left padding.
+        x = torch.cat([initial_states, x], dim=-1)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
+    out = out[..., :seqlen]
+    if return_final_states:
+        # Left-pad amount is width - 1 - len(x): negative when x is longer
+        # (trims leading positions), positive when shorter (zero-fills), so
+        # the result has width - 1 positions. Note x includes initial_states
+        # here when they were provided.
+        final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
+            dtype_in)  # (batch, dim, width - 1)
+        if final_states_out is not None:
+            final_states_out.copy_(final_states)
+        else:
+            final_states_out = final_states
+    # Both "silu" and "swish" map to F.silu.
+    out = (out if activation is None else F.silu(out)).to(dtype=dtype_in)
+    return (out, None) if not return_final_states else (out, final_states_out)
+
+
+def causal_conv1d_update_ref(x: torch.Tensor,
+                             conv_state: torch.Tensor,
+                             weight: torch.Tensor,
+                             bias: Optional[torch.Tensor] = None,
+                             activation: Optional[str] = None):
+    """
+    Pure-PyTorch reference for a single-step causal conv1d state update.
+
+    x: (batch, dim)
+    conv_state: (batch, dim, width)
+    weight: (dim, width)
+    bias: (dim,)
+
+    out: (batch, dim)
+
+    ``conv_state`` is modified in place: its contents are shifted one
+    position along the last dimension and ``x`` is written into the last
+    slot.
+
+    Raises NotImplementedError if ``activation`` is not None, "silu" or
+    "swish".
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    dtype_in = x.dtype
+    batch, dim = x.shape
+    width = weight.shape[1]
+    assert conv_state.shape == (batch, dim, width)
+    assert weight.shape == (dim, width)
+    # roll wraps the oldest entry around to the last slot; it is overwritten
+    # with x on the next line.
+    conv_state.copy_(torch.roll(conv_state, shifts=-1,
+                                dims=-1))  # Update state (B D W)
+    conv_state[:, :, -1] = x
+    # Per-channel dot product of the updated window with the filter taps.
+    out = torch.sum(conv_state * weight, dim=-1)  # (B D)
+    if bias is not None:
+        out += bias
+    # Both "silu" and "swish" map to F.silu.
+    return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
+
+
+@pytest.mark.parametrize("return_final_states", [False, True])
+@pytest.mark.parametrize("has_initial_states", [False, True])
+@pytest.mark.parametrize("channel_last", [False, True])
+@pytest.mark.parametrize("itype", [torch.bfloat16])
+@pytest.mark.parametrize("silu_activation", [False, True])
+@pytest.mark.parametrize("has_bias", [False, True])
+@pytest.mark.parametrize("width", [4])
+@pytest.mark.parametrize("seqlen", [128, 512, 4096])
+@pytest.mark.parametrize('dim', [64, 4096 + 32])
+@pytest.mark.parametrize('batch', [1, 2])
+def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
+                       itype, channel_last, has_initial_states,
+                       return_final_states):
+    """Compare causal_conv1d_fn against causal_conv1d_ref.
+
+    Checks the convolution output and, when requested, the returned final
+    states, within dtype-dependent tolerances. Combinations that use
+    initial or final states without channel_last are skipped.
+    """
+    if not channel_last and (has_initial_states or return_final_states):
+        pytest.skip(
+            "Only channel_last support initial_states or return_final_states")
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+    # set seed
+    torch.random.manual_seed(0)
+    # x is carved out of a wider random tensor, so it is a strided
+    # (non-contiguous) slice of the channel dimension in both layouts.
+    if not channel_last:
+        x = torch.randn(batch,
+                        4096 + dim + 64,
+                        seqlen,
+                        device=device,
+                        dtype=itype)[:, 4096:4096 + dim, :]
+    else:
+        x = rearrange(
+            torch.randn(batch,
+                        seqlen,
+                        4096 + dim + 64,
+                        device=device,
+                        dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s")
+    weight = torch.randn(dim, width, device=device, dtype=itype)
+    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
+    if has_initial_states:
+        # Allocated as (batch, width - 1, dim) and transposed, so the
+        # (batch, dim, width - 1) view has the channel dim as the fastest one.
+        initial_states = torch.randn(batch,
+                                     width - 1,
+                                     dim,
+                                     device=device,
+                                     dtype=itype).transpose(1, 2)
+    else:
+        initial_states = None
+    # The reference gets its own copies so neither call can affect the other.
+    x_ref = x.detach().clone()
+    weight_ref = weight.detach().clone()
+    bias_ref = bias.detach().clone() if bias is not None else None
+    initial_states_ref = initial_states.detach().clone(
+    ) if initial_states is not None else None
+    activation = None if not silu_activation else "silu"
+    out, final_states = causal_conv1d_fn(
+        x,
+        weight,
+        bias,
+        initial_states=initial_states,
+        return_final_states=return_final_states,
+        activation=activation)
+    out_ref, final_states_ref = causal_conv1d_ref(
+        x_ref,
+        weight_ref,
+        bias_ref,
+        initial_states=initial_states_ref,
+        return_final_states=return_final_states,
+        activation=activation)
+    if return_final_states:
+        assert final_states is not None and final_states_ref is not None
+        assert torch.allclose(final_states,
+                              final_states_ref,
+                              rtol=rtol,
+                              atol=atol)
+
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype", [torch.bfloat16])
+@pytest.mark.parametrize("silu_activation", [False, True])
+@pytest.mark.parametrize("has_bias", [False, True])
+@pytest.mark.parametrize("width", [2, 3, 4])
+@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
+@pytest.mark.parametrize("batch", [1, 2])
+def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
+                              itype):
+    """Compare causal_conv1d_update against causal_conv1d_update_ref.
+
+    Both calls mutate their own copy of the convolution state; the states
+    must match exactly and the outputs within tolerance.
+    """
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+    # set seed
+    torch.random.manual_seed(0)
+    # Use the parametrized batch size as-is; overriding it here would make
+    # the batch=1 case a duplicate of batch=2.
+    x = torch.randn(batch, dim, device=device, dtype=itype)
+    conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)
+    weight = torch.randn(dim,
+                         width,
+                         device=device,
+                         dtype=itype,
+                         requires_grad=True)
+    if has_bias:
+        bias = torch.randn(dim, device=device, dtype=itype, requires_grad=True)
+    else:
+        bias = None
+    # Separate state copy for the reference, since both update it in place.
+    conv_state_ref = conv_state.detach().clone()
+    activation = None if not silu_activation else "silu"
+    out = causal_conv1d_update(x,
+                               conv_state,
+                               weight,
+                               bias,
+                               activation=activation)
+    out_ref = causal_conv1d_update_ref(x,
+                                       conv_state_ref,
+                                       weight,
+                                       bias,
+                                       activation=activation)
+
+    assert torch.equal(conv_state, conv_state_ref)
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -73,11 +73,14 @@ def ref_paged_attn(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
 @torch.inference_mode
-def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
-                                         num_heads: Tuple[int,
-                                                          int], head_size: int,
-                                         dtype: torch.dtype, block_size: int,
-                                         soft_cap: Optional[float]) -> None:
+def test_flashinfer_decode_with_paged_kv(
+    kv_lens: List[int],
+    num_heads: Tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: Optional[float],
+) -> None:
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
    num_seqs = len(kv_lens)
@@ -88,6 +91,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
    scale = head_size**-0.5

    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+
    key_value_cache = torch.randn(NUM_BLOCKS,
                                  2,
                                  block_size,
@@ -125,7 +129,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
    wrapper = flashinfer.\
        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
                use_tensor_cores=(
-                    (num_query_heads//num_kv_heads) not in (1, 2, 4, 8))
+                    (num_query_heads//num_kv_heads) > 4)
                )
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
@@ -249,3 +253,215 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                soft_cap=soft_cap)
    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
+
+
+@pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
+@pytest.mark.parametrize("num_heads", [(32, 8), (6, 1)])
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+def test_flashinfer_prefill_with_paged_fp8_kv(
+        seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int],
+        head_size: int, dtype: torch.dtype, block_size: int,
+        soft_cap: Optional[float]) -> None:
+    """Check flashinfer paged prefill with an FP8 (e4m3) KV cache.
+
+    The KV cache is scaled and cast to float8_e4m3fn for the kernel, while
+    ref_paged_attn is run on the unquantized cache; the two outputs must
+    agree within atol=rtol=1e-2.
+    """
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    kv_cache_dtype = torch.float8_e4m3fn
+
+    query = torch.randn(sum(query_lens),
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype)
+    NUM_BLOCKS_FP8 = 2048
+    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
+                                  2,
+                                  block_size,
+                                  num_kv_heads,
+                                  head_size,
+                                  dtype=dtype)
+    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    # Per-tensor scales: divide the largest value by 448.0 so the scaled
+    # cache tops out at 448 before the FP8 cast.
+    k_scale = key_cache.amax().item() / 448.0
+    v_scale = value_cache.amax().item() / 448.0
+
+    kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale],
+                             dim=1).to(kv_cache_dtype)
+
+    assert (kv_cache_fp8.shape == key_value_cache.shape)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 NUM_BLOCKS_FP8,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    # Build the ragged query offsets plus the page pointer, page index and
+    # last-page-length lists passed to begin_forward below.
+    qo_indptr = [0]
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        # A sequence that exactly fills its last page reports a full page.
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+        qo_indptr.append(qo_indptr[-1] + query_lens[i])
+
+    qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD")
+    wrapper.begin_forward(
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+    )
+
+    output = wrapper.forward(query,
+                             kv_cache_fp8,
+                             logits_soft_cap=soft_cap,
+                             k_scale=k_scale,
+                             v_scale=v_scale)
+
+    ref_output = ref_paged_attn(query=query,
+                                key_cache=key_cache.squeeze(1),
+                                value_cache=value_cache.squeeze(1),
+                                query_lens=query_lens,
+                                kv_lens=kv_lens,
+                                block_tables=block_tables,
+                                scale=scale,
+                                soft_cap=soft_cap)
+    del query
+    del block_tables
+    # verify prefill fp8
+    # assert_close raises with its own mismatch report (including the
+    # greatest absolute difference), so no extra message is attached.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
+@pytest.mark.parametrize("num_heads", [(32, 8), (64, 8), (6, 1)])
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+@torch.inference_mode
+def test_flashinfer_decode_with_paged_fp8_kv(
+    kv_lens: List[int],
+    num_heads: Tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: Optional[float],
+) -> None:
+    """Check flashinfer paged decode with an FP8 (e4m3) KV cache.
+
+    Each sequence contributes a single query token. The kernel reads the
+    scaled float8_e4m3fn cache, while ref_paged_attn is run on the
+    unquantized cache; outputs must agree within atol=2e-2, rtol=1e-2.
+    """
+    # test doesn't work for num_heads = (16,16)
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    num_seqs = len(kv_lens)
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+    use_tensor_cores = (num_query_heads // num_kv_heads) > 4
+    kv_cache_dtype = torch.float8_e4m3fn
+
+    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+    NUM_BLOCKS_FP8 = 2048
+    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
+                                  2,
+                                  block_size,
+                                  num_kv_heads,
+                                  head_size,
+                                  dtype=dtype)
+    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    # Per-tensor scales: divide the largest value by 448.0 so the scaled
+    # cache tops out at 448 before the FP8 cast.
+    k_scale = key_cache.amax().item() / 448.0
+    v_scale = value_cache.amax().item() / 448.0
+
+    key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype)
+    value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype)
+    assert (key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1)
+    kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 NUM_BLOCKS_FP8,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    # Build the page pointer, page index and last-page-length lists passed
+    # to begin_forward below.
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        # A sequence that exactly fills its last page reports a full page.
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.\
+        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
+                    use_tensor_cores=use_tensor_cores)
+    wrapper.begin_forward(kv_indptr,
+                          kv_indices,
+                          kv_last_page_lens,
+                          num_query_heads,
+                          num_kv_heads,
+                          head_size,
+                          block_size,
+                          "NONE",
+                          data_type=dtype)
+    output = wrapper.forward(query,
+                             kv_cache_fp8,
+                             logits_soft_cap=soft_cap,
+                             k_scale=k_scale,
+                             v_scale=v_scale)
+    # The reference reads the unquantized cache (already divided in place
+    # above through the chunk views).
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    ref_output = ref_paged_attn(query=query,
+                                key_cache=key_cache,
+                                value_cache=value_cache,
+                                query_lens=[1] * num_seqs,
+                                kv_lens=kv_lens,
+                                block_tables=block_tables,
+                                scale=scale,
+                                soft_cap=soft_cap)
+    # Temporary fix: Increasing the tolerance. Seems like a flashinfer issue
+    # assert_close raises with its own mismatch report (including the
+    # greatest absolute difference), so no extra message is attached.
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2)
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
+import pytest
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
+    selective_scan_fn, selective_state_update)
+
+
+def selective_state_update_ref(state,
+                               x,
+                               dt,
+                               A,
+                               B,
+                               C,
+                               D=None,
+                               z=None,
+                               dt_bias=None,
+                               dt_softplus=False):
+    """
+    Pure-PyTorch reference for a single-step selective state update.
+
+    ``state`` is updated in place; the function returns only the output.
+
+    Argument:
+        state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
+        x: (batch, dim) or (batch, nheads, dim)
+        dt: (batch, dim) or (batch, nheads, dim)
+        A: (dim, dstate) or (nheads, dim, dstate)
+        B: (batch, dstate) or (batch, ngroups, dstate)
+        C: (batch, dstate) or (batch, ngroups, dstate)
+        D: (dim,) or (nheads, dim)
+        z: (batch, dim) or (batch, nheads, dim)
+        dt_bias: (dim,) or (nheads, dim)
+        dt_softplus: apply softplus to dt (after adding dt_bias)
+    Return:
+        out: (batch, dim) or (batch, nheads, dim)
+    """
+    # Normalise every input to the multi-head layout by inserting a
+    # singleton head/group dimension where it is missing. unsqueeze returns
+    # a view, so the in-place state update below still reaches the
+    # caller's tensor.
+    has_heads = state.dim() > 3
+    if state.dim() == 3:
+        state = state.unsqueeze(1)
+    if x.dim() == 2:
+        x = x.unsqueeze(1)
+    if dt.dim() == 2:
+        dt = dt.unsqueeze(1)
+    if A.dim() == 2:
+        A = A.unsqueeze(0)
+    if B.dim() == 2:
+        B = B.unsqueeze(1)
+    if C.dim() == 2:
+        C = C.unsqueeze(1)
+    if D is not None and D.dim() == 1:
+        D = D.unsqueeze(0)
+    if z is not None and z.dim() == 2:
+        z = z.unsqueeze(1)
+    if dt_bias is not None and dt_bias.dim() == 1:
+        dt_bias = dt_bias.unsqueeze(0)
+    batch, nheads, dim, dstate = state.shape
+    assert x.shape == (batch, nheads, dim)
+    assert dt.shape == x.shape
+    assert A.shape == (nheads, dim, dstate)
+    ngroups = B.shape[1]
+    assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
+    assert B.shape == (batch, ngroups, dstate)
+    assert C.shape == B.shape
+    if D is not None:
+        assert D.shape == (nheads, dim)
+    if z is not None:
+        assert z.shape == x.shape
+    if dt_bias is not None:
+        assert dt_bias.shape == (nheads, dim)
+        dt = dt + dt_bias
+    dt = F.softplus(dt) if dt_softplus else dt
+    dA = torch.exp(rearrange(dt, "b h d -> b h d 1") *
+                   A)  # (batch, nheads, dim, dstate)
+    # Expand each group so every head has its own B / C row.
+    B = repeat(B, "b g n -> b (g h) n",
+               h=nheads // ngroups)  # (batch, nheads, dstate)
+    C = repeat(C, "b g n -> b (g h) n",
+               h=nheads // ngroups)  # (batch, nheads, dstate)
+    dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
+        B, "b h n -> b h 1 n")  # (batch, nheads, dim, dstate)
+    # In-place recurrence step: state <- state * dA + dB * x.
+    state.copy_(state * dA +
+                dB * rearrange(x, "b h d -> b h d 1"))  # (batch, nheads, dim, dstate)
+    # Contract the updated state with C over the dstate dimension.
+    out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
+    if D is not None:
+        out += (x * D).to(out.dtype)
+    # Optional gating by silu(z); the result is cast to x's dtype.
+    out = (out if z is None else out * F.silu(z)).to(x.dtype)
+    if not has_heads:
+        out = out.squeeze(1)
+    return out
+
+
+def selective_scan_ref(u,
+                       delta,
+                       A,
+                       B,
+                       C,
+                       D=None,
+                       z=None,
+                       delta_bias=None,
+                       delta_softplus=False,
+                       return_last_state=False,
+                       position_indices=None,
+                       prev_state=None):
+    """
+    Pure-PyTorch reference for the selective scan, evaluated one sequence
+    position at a time.
+
+    u: r(B D L)
+    delta: r(B D L)
+    A: c(D N) or r(D N)
+    B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    D: r(D)
+    z: r(B D L)
+    delta_bias: r(D), fp32
+    prev_state: r(B D N), fp32
+
+    out: r(B D L)
+    last_state (optional): r(B D dstate) or c(B D dstate)
+
+    Returns ``out`` alone, or ``(out, last_state)`` when
+    ``return_last_state`` is set.
+    """
+    # Compute in float32; the output is cast back to dtype_in at the end.
+    dtype_in = u.dtype
+    u = u.float()
+    delta = delta.float()
+    if delta_bias is not None:
+        delta = delta + delta_bias[..., None].float()
+    if delta_softplus:
+        delta = F.softplus(delta)
+    batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
+    # B / C with 3 or more dims vary along the sequence; 2-D ones are fixed.
+    is_variable_B = B.dim() >= 3
+    is_variable_C = C.dim() >= 3
+    B = B.float()
+    C = C.float()
+    # Start from zeros (same dtype/device as A) unless a state is supplied.
+    x = A.new_zeros((batch, dim, dstate)) if prev_state is None else prev_state
+    ys = []
+    # Precompute the per-position decay and input terms for all positions.
+    deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
+    if not is_variable_B:
+        deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
+    else:
+        if B.dim() == 3:
+            deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
+        else:
+            # Grouped B: expand each group across its share of channels.
+            B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
+            deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
+    if is_variable_C and C.dim() == 4:
+        C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
+    last_state = None
+    for i in range(u.shape[2]):
+        # A zero in position_indices[0, i] restarts the recurrence: the
+        # carried state is discarded at that position.
+        if position_indices is not None and position_indices[0, i] == 0:
+            x = deltaB_u[:, :, i]
+        else:
+            x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
+        if not is_variable_C:
+            y = torch.einsum('bdn,dn->bd', x, C)
+        else:
+            if C.dim() == 3:
+                y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
+            else:
+                y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
+        if i == u.shape[2] - 1:
+            last_state = x
+        ys.append(y)
+    y = torch.stack(ys, dim=2)  # (batch dim L)
+    # Optional skip connection through D, then optional silu(z) gating.
+    out = y if D is None else y + u * rearrange(D, "d -> d 1")
+    if z is not None:
+        out = out * F.silu(z)
+    out = out.to(dtype=dtype_in)
+    return out if not return_last_state else (out, last_state)
+
+
+@pytest.mark.parametrize('wtype', [torch.float32])
+@pytest.mark.parametrize('itype', [torch.float32])
+@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
+@pytest.mark.parametrize("return_last_state", [True])
+@pytest.mark.parametrize('has_delta_bias', [True])
+@pytest.mark.parametrize('delta_softplus', [True])
+@pytest.mark.parametrize('has_z', [True])
+@pytest.mark.parametrize('has_D', [True])
+@pytest.mark.parametrize("varBC_groups", [1, 2])
+@pytest.mark.parametrize("is_variable_C", [True])
+@pytest.mark.parametrize("is_variable_B", [True])
+@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
+def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
+                        has_z, has_delta_bias, delta_softplus,
+                        return_last_state, seqlen, itype, wtype, scan_chunks):
+    """Compare chunked selective_scan_fn against selective_scan_ref.
+
+    The sequence is split into ``scan_chunks`` pieces that are run through
+    selective_scan_fn one after another, feeding each chunk's last state
+    into the next. The concatenated output and the final state must match
+    a single reference pass over the whole sequence.
+    """
+    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
+        pytest.skip()  # This config is not applicable
+    device = 'cuda'
+    rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 3e-2, 5e-2
+    # set seed
+    torch.random.manual_seed(0)
+    batch_size = 2
+    dim = 4
+    dstate = 8
+    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
+    if not is_variable_B:
+        B_shape = [dim, dstate]
+    elif varBC_groups == 1:
+        B_shape = [batch_size, dstate, seqlen]
+    else:
+        B_shape = [batch_size, varBC_groups, dstate, seqlen]
+    B = torch.randn(B_shape,
+                    device=device,
+                    dtype=wtype if not is_variable_B else itype)
+    if not is_variable_C:
+        C_shape = [dim, dstate]
+    elif varBC_groups == 1:
+        C_shape = [batch_size, dstate, seqlen]
+    else:
+        C_shape = [batch_size, varBC_groups, dstate, seqlen]
+    C = torch.randn(C_shape,
+                    device=device,
+                    dtype=wtype if not is_variable_C else itype)
+    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
+    z = torch.randn(batch_size, dim, seqlen, device=device,
+                    dtype=itype) if has_z else None
+    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
+                  ) if has_delta_bias else None
+    u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
+    delta = (0.5 *
+             torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))
+    state = None
+    state_ref = None
+    out = None
+    out_ref = None
+    outs = []
+    # Nominal chunk length; the last chunk absorbs any remainder.
+    chunked_prompt_len = seqlen // scan_chunks
+    for c in range(scan_chunks):
+        chunk_start = chunked_prompt_len * c
+        chunk_end = chunked_prompt_len * (c + 1)
+        if c == scan_chunks - 1:
+            chunk_end = seqlen
+        # Only sequence-varying B / C carry a trailing sequence axis to
+        # slice; each is gated on its own flag.
+        _B = B
+        if is_variable_B:
+            _B = B[..., chunk_start:chunk_end]
+        _C = C
+        if is_variable_C:
+            _C = C[..., chunk_start:chunk_end]
+        _z = z
+        if has_z:
+            assert z is not None
+            _z = z[..., chunk_start:chunk_end]
+        # The first chunk starts without a previous state; later chunks
+        # continue from the state returned for the preceding chunk.
+        out, *rest = selective_scan_fn(u[..., chunk_start:chunk_end],
+                                       delta[..., chunk_start:chunk_end],
+                                       A,
+                                       _B,
+                                       _C,
+                                       D,
+                                       z=_z,
+                                       delta_bias=delta_bias,
+                                       delta_softplus=delta_softplus,
+                                       return_last_state=return_last_state,
+                                       prev_state=state if c > 0 else None)
+        outs.append(out)
+        if return_last_state:
+            state = rest[0]
+    if len(outs) > 1:
+        out = torch.cat(outs, dim=-1)
+    out_ref, *rest = selective_scan_ref(u,
+                                        delta,
+                                        A,
+                                        B,
+                                        C,
+                                        D,
+                                        z=z,
+                                        delta_bias=delta_bias,
+                                        delta_softplus=delta_softplus,
+                                        return_last_state=return_last_state)
+    if return_last_state:
+        state_ref = rest[0]
+
+    assert out is not None and out_ref is not None
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+    if return_last_state:
+        assert state is not None and state_ref is not None
+        assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("has_z", [False, True])
+@pytest.mark.parametrize("dstate", [16, 32, 64])
+@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
+def test_selective_state_update(dim, dstate, has_z, itype):
+    # Compares selective_state_update against selective_state_update_ref:
+    # both the in-place updated state and the returned output must match
+    # within dtype-dependent tolerances.
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+        # Absolute tolerance is doubled for bfloat16 on HIP builds.
+        if torch.version.hip:
+            atol *= 2
+    # set seed
+    torch.random.manual_seed(0)
+    batch_size = 1
+    state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
+    x = torch.randn(batch_size, dim, device=device, dtype=itype)
+    dt = torch.randn(batch_size, dim, device=device, dtype=itype)
+    # dt_bias, A, B, C and D are created without an explicit dtype, i.e. in
+    # torch's default dtype rather than itype.
+    dt_bias = torch.rand(dim, device=device) - 4.0
+    A = -torch.rand(dim, dstate, device=device) - 1.0
+    B = torch.randn(batch_size, dstate, device=device)
+    C = torch.randn(batch_size, dstate, device=device)
+    D = torch.randn(dim, device=device)
+    z = torch.randn_like(x) if has_z else None
+    # Separate copy for the reference: each call below is handed its own
+    # state tensor.
+    state_ref = state.detach().clone()
+    out = selective_state_update(state,
+                                 x,
+                                 dt,
+                                 A,
+                                 B,
+                                 C,
+                                 D=D,
+                                 z=z,
+                                 dt_bias=dt_bias,
+                                 dt_softplus=True)
+    out_ref = selective_state_update_ref(state_ref,
+                                         x,
+                                         dt,
+                                         A,
+                                         B,
+                                         C,
+                                         D=D,
+                                         z=z,
+                                         dt_bias=dt_bias,
+                                         dt_softplus=True)
+
+    assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
 from typing import List

+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip

 MODEL_PATH = "google/gemma-7b"

@@ -10,7 +13,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
-        "Quote: So many books,",
+        "Quote: Painting is poetry that is seen rather than felt,",
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
 def test_gemma_lora(gemma_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
@@ -37,7 +41,8 @@ def test_gemma_lora(gemma_lora_files):
    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
-        "so little time.\nAuthor: Frank Zappa\n",
+        "and poetry is painting that is felt rather than seen.\n"
+        "Author: Leonardo da Vinci\n",
    ]

    output1 = do_sample(llm, gemma_lora_files, lora_id=1)

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -7,6 +7,7 @@ import pytest

 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip

 from .conftest import cleanup

@@ -17,12 +18,23 @@ class ModelWithQuantization:
    quantization: str


-MODELS: List[ModelWithQuantization] = [
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-                          quantization="AWQ"),
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-                          quantization="GPTQ"),
-]
+MODELS: List[ModelWithQuantization]
+# AWQ quantization is currently not supported on ROCm; use GPTQ only there.
+if is_hip():
+    MODELS = [
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]
+else:
+    MODELS = [
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+            quantization="AWQ"),
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]


 def do_sample(llm: vllm.LLM,

--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -3,116 +3,97 @@
 Note: these tests will only pass on L4 GPU.
 """
 import os
-from typing import List
+from typing import Optional

 import pytest
-import torch
-from transformers import AutoTokenizer

+from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams

-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-MAX_MODEL_LEN = 1024
-
-MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "meta-llama/Meta-Llama-3-8B-Instruct",
-]
+from ..models.utils import check_logprobs_close

-EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
-        "auto": [
-            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
-            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
-        ],
-        "fp8": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system made up of several basic components that work together to enable it to',
-            'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
-        ]
-    },
-    "meta-llama/Meta-Llama-3-8B-Instruct": {
-        "auto": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-            'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
-        ],
-        "fp8": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-            'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
-        ]
-    },
-}
+os.environ["TOKENIZERS_PARALLELISM"] = "true"


-# This test compares against golden strings for exact match since
-# there is no baseline implementation to compare against
-# and is unstable w.r.t specifics of the fp8 implementation or
-# the hardware being run on.
-# Disabled to prevent it from breaking the build
-@pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build.")
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
-@pytest.mark.parametrize("model_name", MODELS)
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
-    model = LLM(model=model_name,
-                max_model_len=MAX_MODEL_LEN,
-                trust_remote_code=True,
-                enforce_eager=True,
-                quantization="fp8",
-                kv_cache_dtype=kv_cache_dtype)
+@pytest.mark.parametrize(
+    "kv_cache_dtype,base_model,test_model,scale_path",
+    [
+        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
+        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
+         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
+         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
+         "meta-llama/Llama-2-7b-chat-hf",
+         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+    ])
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+# Due to low-precision numerical divergence, this test is too sensitive for
+# the async postprocessor
+@pytest.mark.parametrize("disable_async_output_proc", [True])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    scale_path: Optional[str],
+    max_tokens: int,
+    enforce_eager: bool,
+    backend: str,
+    tensor_parallel_size: int,
+    disable_async_output_proc: bool,
+    monkeypatch,
+) -> None:
+    """
+    Only checks that the log probs match, to tolerate discrepancies in
+    numerically sensitive kernels.
+    """
+    override_backend_env_variable(monkeypatch, backend)
+
+    MAX_MODEL_LEN = 1024
+    NUM_LOG_PROBS = 8
+
+    with vllm_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        baseline_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)

-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
-        for prompt in example_prompts
-    ]
+    extra_kwargs = {}
+    if scale_path is not None:
+        extra_kwargs["quantization_param_path"] = scale_path

-    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
-    # Note: these need to be run 1 at a time due to numerical precision,
-    # since the expected strs were generated this way.
-    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
-        generations.append(outputs[0].outputs[0].text)
-    del model
+    with vllm_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
+            **extra_kwargs,
+    ) as vllm_model:
+        test_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)

-    print(model_name, kv_cache_dtype, generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
-    for i in range(len(example_prompts)):
-        generated_str = generations[i]
-        expected_str = expected_strs[i]
-        assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+    check_logprobs_close(
+        outputs_0_lst=baseline_outputs,
+        outputs_1_lst=test_outputs,
+        name_0="fp16_kv_cache",
+        name_1="fp8_kv_cache",
+    )
--- a/tests/models/test_granite.py
+++ b/tests/models/test_granite.py
+"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
+
+Run `pytest tests/models/test_granite.py`.
+"""
+import importlib.metadata
+
+import pytest
+
+from .utils import check_logprobs_close
+
+# Parse only (major, minor): later components may carry non-numeric
+# suffixes (e.g. "4.45.0.dev0" or "4.45.0rc1") that int() cannot parse,
+# and the skipif below compares against a (major, minor) tuple anyway.
+TRANSFORMERS_VERSION = tuple(
+    int(part)
+    for part in importlib.metadata.version("transformers").split(".")[:2])
+
+MODELS = [
+    "ibm/PowerLM-3b",
+]
+
+
+# GraniteForCausalLM will be in transformers >= 4.45
+@pytest.mark.skipif(TRANSFORMERS_VERSION < (4, 45),
+                    reason="granite model test requires transformers >= 4.45")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check that vLLM's greedy logprobs are close to HF's for each model.
+
+    The HF runner is run first and closed before the vLLM runner is
+    created; both generate greedily on `example_prompts` and their outputs
+    are compared with `check_logprobs_close`.
+    """
+    # TODO(sang): Sliding window should be tested separately.
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/test_intern_vit.py
+++ b/tests/models/test_intern_vit.py
@@ -6,8 +6,6 @@ import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor

-from vllm.model_executor.models.intern_vit import InternVisionModel
-
 from ..conftest import _ImageAssets, cleanup

 pytestmark = pytest.mark.vlm
@@ -49,6 +47,7 @@ def run_intern_vit_test(
        for pixel_value in pixel_values
    ]

+    from vllm.model_executor.models.intern_vit import InternVisionModel
    vllm_model = InternVisionModel(config)
    vllm_model.load_weights(hf_model.state_dict().items())


--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -3,13 +3,9 @@ from typing import List, Optional, Tuple, Type

 import pytest
 import torch
-from huggingface_hub import snapshot_download
 from PIL.Image import Image
 from transformers import AutoConfig

-from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
-                                                 IMG_START,
-                                                 image_to_pixel_values)
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu

@@ -25,49 +21,15 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
 })

-# we use snapshot_download to prevent conflicts between
-# dynamic_module and trust_remote_code for hf_runner
-DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
 models = [
-    snapshot_download("OpenGVLab/InternVL2-1B",
-                      allow_patterns=DOWNLOAD_PATTERN),
-    snapshot_download("OpenGVLab/InternVL2-2B",
-                      allow_patterns=DOWNLOAD_PATTERN),
+    "OpenGVLab/InternVL2-1B",
+    "OpenGVLab/InternVL2-2B",
    # Broken due to outdated implementation of Phi-3
    # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
-    # snapshot_download("OpenGVLab/InternVL2-4B"),
+    # "OpenGVLab/InternVL2-4B",
 ]


-class InternVLProcessor:
-    """A simple processor for InternVL2 HF model which misses a processor."""
-
-    def __init__(self, hf_runner: HfRunner):
-        self.num_image_token = hf_runner.model.num_image_token
-        self.tokenizer = hf_runner.tokenizer
-        self.dtype = hf_runner.model.dtype
-
-        self.config = AutoConfig.from_pretrained(hf_runner.model_name)
-        self.vision_config = self.config.vision_config
-        self.use_thumbnail = self.config.use_thumbnail
-        self.min_num = self.config.min_dynamic_patch
-        self.max_num = self.config.max_dynamic_patch
-        self.image_size = self.vision_config.image_size
-
-    def __call__(self, text: str, images: Image, **kwargs):
-        pixel_values = image_to_pixel_values(images, self.image_size,
-                                             self.min_num, self.max_num,
-                                             self.use_thumbnail).to(self.dtype)
-        num_patches_list = [pixel_values.shape[0]]
-        for num_patches in num_patches_list:
-            context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
-            image_tokens = IMG_START + context_tokens + IMG_END
-            text = text.replace('<image>', image_tokens, 1)
-        prompt = self.tokenizer(text, return_tensors="pt")
-        prompt.update({"pixel_values": pixel_values})
-        return prompt
-
-
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
 def generate(
    self,
@@ -133,6 +95,37 @@ def run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

+    class InternVLProcessor:
+        """A minimal stand-in processor for the InternVL2 HF model.
+
+        Turns a text prompt plus an image into tokenizer output with the
+        image's `pixel_values` attached.
+        """
+
+        def __init__(self, hf_runner: HfRunner):
+            # Values taken from the already-loaded HF model and tokenizer.
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+            self.dtype = hf_runner.model.dtype
+
+            # Image tiling parameters are read from the model's config.
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Image, **kwargs):
+            # Imported at call time rather than at module import time.
+            from vllm.model_executor.models.internvl import (
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            pixel_values = image_to_pixel_values(
+                images, self.image_size, self.min_num, self.max_num,
+                self.use_thumbnail).to(self.dtype)
+            # One entry only: the leading dimension of pixel_values.
+            num_patches_list = [pixel_values.shape[0]]
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                # Expand only the first remaining '<image>' placeholder.
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     max_model_len=4096,

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -179,3 +179,20 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
+
+
+@pytest.mark.parametrize("model", models)
+def test_context_length_too_short(vllm_runner, image_assets, model):
+    """Expect a ValueError when an image prompt cannot fit max_model_len."""
+    images = [asset.pil_image for asset in image_assets]
+
+    # Both runner construction and generation sit inside the raises block,
+    # so the error is accepted from either step.
+    with pytest.raises(ValueError, match="too long to fit into the model"):
+        vllm_model = vllm_runner(
+            model,
+            max_model_len=128,  # LLaVA has a feature size of 576
+            enforce_eager=True,
+        )
+
+        with vllm_model:
+            vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
+                                       max_tokens=1,
+                                       images=[images[0]])
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -6,24 +6,22 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                        _ImageAssets)
 from .utils import check_logprobs_close

 pytestmark = pytest.mark.vlm

-_PREFACE = (
-    "A chat between a curious human and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the human's "
-    "questions.")
+_LIMIT_IMAGE_PER_PROMPT = 4

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
-    f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
+    "[INST] <image>\nWhat's the content of the image? [/INST]",
    "cherry_blossom":
-    f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
+    "[INST] <image>\nWhat is the season? [/INST]",
 })

-models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
+models = ["llava-hf/llava-v1.6-mistral-7b-hf"]


 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -114,19 +112,43 @@ def run_test(
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

+    _run_test(hf_runner,
+              vllm_runner,
+              inputs_per_image,
+              model,
+              dtype=dtype,
+              max_tokens=max_tokens,
+              num_logprobs=num_logprobs,
+              tensor_parallel_size=tensor_parallel_size,
+              distributed_executor_backend=distributed_executor_backend)
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    inputs: List[Tuple[List[str], PromptImageInput]],
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
-                     max_model_len=4096,
+                     max_model_len=10240,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+                     enforce_eager=True,
+                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
+                                          }) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    with hf_runner(model, dtype=dtype,
@@ -136,7 +158,7 @@ def run_test(
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -177,7 +199,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
@@ -216,3 +238,48 @@ def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
+                                      model, dtype, max_tokens,
+                                      num_logprobs) -> None:
+    """Feed prompts carrying one, two and four images through `_run_test`."""
+    stop_sign = image_assets[0].pil_image
+    cherry_blossom = image_assets[1].pil_image
+
+    two_image_prompt = "[INST] <image><image>\nDescribe 2 images. [/INST]"
+    prompts = [
+        two_image_prompt,
+        two_image_prompt,
+        "[INST] <image><image><image><image>\nDescribe 4 images. [/INST]",
+        "[INST] <image>\nWhat is the season? [/INST]",
+    ]
+
+    # Images with different sizes and aspect-ratios
+    mixed_size_pair = [rescale_image_size(stop_sign, 0.1), stop_sign]
+    mixed_size_quad = [
+        stop_sign,
+        rescale_image_size(stop_sign, 0.25),
+        cherry_blossom.resize((183, 488)),
+        cherry_blossom.resize((488, 183)),
+    ]
+    # One entry per prompt; the last prompt gets a bare image, not a list.
+    images_per_prompt = [
+        [stop_sign, cherry_blossom],
+        mixed_size_pair,
+        mixed_size_quad,
+        cherry_blossom,
+    ]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        [(prompts, images_per_prompt)],
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/test_minicpmv.py
+++ b/tests/models/test_minicpmv.py
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union

 import pytest
 import torch
 import torch.types
+from PIL import Image
 from transformers import BatchEncoding

 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
 from .utils import check_logprobs_close

 pytestmark = pytest.mark.vlm
@@ -24,6 +25,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
        "(<image>./</image>)\nWhat is the season?<|eot_id|>" \
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = \
+    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
+    "(<image>./</image>)\n(<image>./</image>)\n" \
+    "Describe these images.<|eot_id|>" \
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"

 models = ["openbmb/MiniCPM-Llama3-V-2_5"]

@@ -46,13 +52,14 @@ target_dtype = "half"
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], Union[List[Image.Image],
+                                        List[List[Image.Image]]]]],
    model: str,
    *,
-    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
+    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
@@ -65,12 +72,6 @@ def run_test(
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
@@ -82,6 +83,7 @@ def run_test(
                     max_model_len=4096,
                     max_num_seqs=1,
                     dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
@@ -93,7 +95,7 @@ def run_test(
                                                num_logprobs=num_logprobs,
                                                images=images,
                                                stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
@@ -104,7 +106,7 @@ def run_test(
                                                    num_logprobs=num_logprobs,
                                                    images=images,
                                                    tokenizer=tokenizer)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -138,104 +140,26 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
    run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs_per_image,
        model,
-        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
+        mm_limit=1,
        tensor_parallel_size=1,
    )


-HF_MULTIIMAGE_IMAGE_PROMPT = \
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
-    "(<image>./</image>)\n(<image>./</image>)\n" \
-    "Describe these images.<|eot_id|>" \
-    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-
-def run_multi_image_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
-    ]
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     max_model_len=4096,
-                     max_num_seqs=1,
-                     limit_mm_per_prompt={"image": len(images)},
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
-        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
-        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                stop_token_ids=stop_token_ids)
-            for prompts, images in inputs_per_case
-        ]
-
-    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
-    with hf_model, torch.no_grad():
-        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    tokenizer=tokenizer)
-            for prompts, images in inputs_per_case
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
-        check_logprobs_close(
-            outputs_0_lst=[
-                trunc_hf_output(hf_output) for hf_output in hf_outputs
-            ],
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "size_factors",
@@ -256,14 +180,22 @@ def run_multi_image_test(
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
-    run_multi_image_test(
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs_per_case,
        model,
-        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
+        mm_limit=2,
        tensor_parallel_size=1,
    )
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -30,9 +30,11 @@ def test_models(
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype,
+                     tokenizer_mode="mistral") as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
+
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,

--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
 import os
 import re
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union

 import pytest
+from PIL import Image
 from transformers import AutoTokenizer

 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu, is_hip

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
 from .utils import check_logprobs_close

 pytestmark = pytest.mark.vlm
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "cherry_blossom":
    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501

 models = ["microsoft/Phi-3.5-vision-instruct"]

@@ -58,13 +60,14 @@ if is_hip():
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], Union[List[Image.Image],
+                                        List[List[Image.Image]]]]],
    model: str,
    *,
-    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
+    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
@@ -77,15 +80,6 @@ def run_test(
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [
-            rescale_image_size(image, factor, transpose=idx)
-            for idx, factor in enumerate(size_factors)
-        ],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
@@ -97,15 +91,16 @@ def run_test(
                     max_model_len=4096,
                     max_num_seqs=1,
                     dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_image = [
+        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
@@ -113,17 +108,17 @@ def run_test(
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
-        hf_outputs_per_image = [
+        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images,
                                                    eos_token_id=eos_token_id)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
-                                        vllm_outputs_per_image):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+                                        vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
@@ -156,14 +151,86 @@ def run_test(
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_image,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
+                         dtype) -> None:
+    """Regression test for #7840.
+
+    Every image asset is paired with its prompt as a one-prompt,
+    one-image case (no rescaling) and the cases are handed to `run_test`
+    with a limit of one image per prompt.
+    """
+    pil_images = [asset.pil_image for asset in image_assets]
+
+    single_image_cases = []
+    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
+        single_image_cases.append(([prompt], [pil_image]))
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        single_image_cases,
+        model,
+        dtype=dtype,
+        max_tokens=128,
+        num_logprobs=10,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Empty batch (no prompts, no images)
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+                             size_factors, dtype: str, max_tokens: int,
+                             num_logprobs: int) -> None:
+    """Run `run_test` on prompts that each reference several images.
+
+    A single test case is built: one copy of `HF_MULTIIMAGE_IMAGE_PROMPT`
+    per size factor, each paired with the full list of image assets
+    rescaled by that factor. `mm_limit=2` is passed to `run_test` as the
+    per-prompt image limit.
+    """
+    images = [asset.pil_image for asset in image_assets]
+
+    # One case; its i-th prompt gets all images rescaled by size_factors[i].
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
    run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs_per_case,
        model,
-        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
+        mm_limit=2,
        tensor_parallel_size=1,
    )
--- a/tests/models/test_phimoe.py
+++ b/tests/models/test_phimoe.py
+"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
+
+Run `pytest tests/models/test_phimoe.py`.
+"""
+import pytest
+import torch
+
+from vllm.utils import is_cpu
+
+from .utils import check_logprobs_close
+
+MODELS = [
+    "microsoft/Phi-3.5-MoE-instruct",
+]
+
+
+def test_phimoe_routing_function():
+    """Check `phimoe_routing_function` against fixed expected outputs.
+
+    Each case supplies the keyword arguments for the routing function
+    together with the expected `topk_weights` (compared with
+    `torch.allclose`) and `topk_ids` (compared with `torch.equal`).
+    """
+    from vllm.model_executor.models.phimoe import phimoe_routing_function
+
+    def _f32(values):
+        # Fresh float32 tensor per call so no case shares storage.
+        return torch.tensor(values, dtype=torch.float32, requires_grad=False)
+
+    def _ids(values):
+        return torch.tensor(values, dtype=torch.long, requires_grad=False)
+
+    # (routing kwargs, expected topk_weights, expected topk_ids)
+    cases = [
+        (
+            {
+                "hidden_states": _f32([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2),
+                "gating_output": _f32([0.1, 0.2, 0.3, 0.4]),
+                "topk": 2,
+                "renormalize": False,
+            },
+            _f32([1., 1.]),
+            _ids([3, 2]),
+        ),
+        (
+            {
+                "hidden_states": _f32([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2),
+                "gating_output": _f32([0.4, 0.2, 0.3, 0.4]),
+                "topk": 2,
+                "renormalize": False,
+            },
+            _f32([0.5, 1.]),
+            _ids([0, 3]),
+        ),
+    ]
+
+    for routing_kwargs, expected_weights, expected_ids in cases:
+        topk_weights, topk_ids = phimoe_routing_function(**routing_kwargs)
+        assert torch.allclose(topk_weights, expected_weights)
+        assert torch.equal(topk_ids, expected_ids)
+
+
+def get_gpu_memory():
+    """Return the current CUDA device's total memory divided by 1024**3.
+
+    Any exception raised while querying the device is swallowed and
+    reported as 0, so callers can use the result directly in a
+    `skipif` condition.
+    """
+    try:
+        device = torch.cuda.current_device()
+        total_bytes = torch.cuda.get_device_properties(device).total_memory
+        return total_bytes / (1024**3)
+    except Exception:
+        return 0
+
+
+@pytest.mark.skipif(condition=is_cpu(),
+                    reason="This test takes a lot time to run on CPU, "
+                    "and vllm CI's disk space is not enough for this model.")
+@pytest.mark.skipif(condition=get_gpu_memory() < 100,
+                    reason="Skip this test if GPU memory is insufficient.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Compare greedy HF and vLLM generations (with logprobs) for `model`.
+
+    The HF runner is used first and its context is exited before the
+    vLLM runner is created; both result lists are then passed to
+    `check_logprobs_close`.
+    """
+    generation_args = (example_prompts, max_tokens, num_logprobs)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(*generation_args)
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(*generation_args)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/test_ultravox.py
+++ b/tests/models/test_ultravox.py
 from typing import List, Optional, Tuple, Type

-import librosa
 import numpy as np
 import pytest
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

-from vllm.assets.audio import AudioAsset
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

@@ -18,36 +16,32 @@ MODEL_NAME = "fixie-ai/ultravox-v0_3"

 AudioTuple = Tuple[np.ndarray, int]

+VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
+HF_PLACEHOLDER = "<|audio|>"
+

 @pytest.fixture(scope="session")
-def audio_and_sample_rate():
-    return AudioAsset("mary_had_lamb").audio_and_sample_rate
+def audio_assets():
+    """Return the two audio assets ("mary_had_lamb" and "winning_call")."""
+    # AudioAsset is imported inside the fixture rather than at module level.
+    from vllm.assets.audio import AudioAsset
+    return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]


-@pytest.fixture
-def prompts_and_audios(audio_and_sample_rate):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
+def audio(request):
+    """Return the `AudioAsset` named by the current fixture parameter."""
+    from vllm.assets.audio import AudioAsset
+
+    asset_name = request.param
+    return AudioAsset(asset_name)

-    vllm_placeholder = "<|reserved_special_token_0|>"
-    hf_placeholder = "<|audio|>"

-    question = "What's in the audio?"
-    vllm_prompt = tokenizer.apply_chat_template(
-        [{
-            'role': 'user',
-            'content': f"{vllm_placeholder}\n{question}"
-        }],
-        tokenize=False,
-        add_generation_prompt=True)
-    hf_prompt = tokenizer.apply_chat_template(
-        [{
-            'role': 'user',
-            'content': f"{hf_placeholder}\n{question}"
-        }],
-        tokenize=False,
-        add_generation_prompt=True)
+def _get_prompt(audio_count, question, placeholder):
+    """Build a single-turn user chat prompt with audio placeholders.
+
+    The user message is `audio_count` copies of `placeholder`, each
+    followed by a newline, and then `question`. It is rendered with the
+    chat template of the `MODEL_NAME` tokenizer, untokenized and with the
+    generation prompt appended.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    # One "<placeholder>\n" per audio clip.
+    placeholder = f"{placeholder}\n" * audio_count

-    return [(vllm_prompt, hf_prompt, audio_and_sample_rate)]
+    return tokenizer.apply_chat_template([{
+        'role': 'user',
+        'content': f"{placeholder}{question}"
+    }],
+                                         tokenize=False,
+                                         add_generation_prompt=True)


 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -109,6 +103,7 @@ def run_test(
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
+        import librosa

        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
@@ -134,15 +129,71 @@ def run_test(
        )


+def run_multi_audio_test(
+    vllm_runner: Type[VllmRunner],
+    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Run vLLM greedy generation on prompts that carry several audios each.
+
+    The per-prompt audio limit given to the runner is the largest number
+    of audios attached to any prompt. Only vLLM is exercised; the check
+    is that every output has a non-empty token list.
+    """
+    prompts = [prompt for prompt, _ in prompts_and_audios]
+    audios_per_prompt = [audios for _, audios in prompts_and_audios]
+    audio_limit = max(len(audios) for audios in audios_per_prompt)
+
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True,
+                     limit_mm_per_prompt={"audio": audio_limit}) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            prompts,
+            max_tokens,
+            num_logprobs=num_logprobs,
+            audios=audios_per_prompt)
+
+    # The HuggingFace model doesn't support multiple audios yet, so
+    # just assert that some tokens were generated.
+    assert all(tokens for tokens, *_ in vllm_outputs)
+
+
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str,
-                max_tokens: int, num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
+    """Run `run_test` on one single-audio prompt for the `audio` fixture.
+
+    The same question is rendered twice, once with the vLLM placeholder
+    and once with the HF placeholder.
+    """
+
+    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
+    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
    run_test(
        hf_runner,
        vllm_runner,
-        prompts_and_audios,
+        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
+        MODEL_NAME,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
+                                     max_tokens: int,
+                                     num_logprobs: int) -> None:
+
+    vllm_prompt = _get_prompt(len(audio_assets),
+                              "Describe each of the audios above.",
+                              VLLM_PLACEHOLDER)
+    run_multi_audio_test(
+        vllm_runner,
+        [(vllm_prompt, [audio.audio_and_sample_rate
+                        for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
 import warnings
 from typing import Dict, List, Optional, Sequence, Tuple, Union

-from vllm.sequence import SampleLogprobs
+from vllm.sequence import Logprob, SampleLogprobs

 TokensText = Tuple[List[int], str]

@@ -38,34 +38,39 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
                                                                    float]],
                                                          SampleLogprobs]]]

+# Allow for tokens to be represented as str's rather than IDs
+TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
+                                                        List[Dict[str,
+                                                                  Logprob]]]]]
+

 def check_logprobs_close(
    *,
-    outputs_0_lst: Sequence[TokensTextLogprobs],
-    outputs_1_lst: Sequence[TokensTextLogprobs],
+    outputs_0_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]],
+    outputs_1_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]],
    name_0: str,
    name_1: str,
    num_outputs_0_skip_tokens: int = 0,
    warn_on_mismatch: bool = True,
-):
-    """
-    Compare the logprobs of two sequences generated by different models,
+    always_check_logprobs: bool = False,
+) -> None:
+    """Compare the logprobs of two sequences generated by different models,
    which should be similar but not necessarily equal.

-    Arguments:
-
-    * outputs_0_lst: First sequence to compare
-    * outputs_0_lst: Second sequence to compare
-    * name_0: sequence #0 name
-    * name_1: sequence #1 name
-    * num_outputs_0_skip_tokens: If > 0, specifies the number of initial
+    Args:
+      outputs_0_lst: First sequence to compare
+      outputs_1_lst: Second sequence to compare
+      name_0: sequence #0 name
+      name_1: sequence #1 name
+      num_outputs_0_skip_tokens: If > 0, specifies the number of initial
                                 sequence #0 tokens & logprobs to discard
                                 before comparison, i.e. all
                                 of sequence #1 will be compared to
                                 sequence #0 beginning at index
                                 num_outputs_0_skip_tokens
-    * warn_on_mismatch: Issue a warning if there is token-wise or text-wise
+      warn_on_mismatch: Issue a warning if there is token-wise or text-wise
                        mismatch between the two sequences
+      always_check_logprobs: If true, check logprobs even when tokens match
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)

@@ -94,8 +99,12 @@ def check_logprobs_close(
        for idx, (output_id_0,
                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):

-            # If generated tokens don't match, then
-            if output_id_0 != output_id_1:
+            is_tok_mismatch = output_id_0 != output_id_1
+
+            # If generated tokens don't match
+            # or it is desired to always check logprobs,
+            # then
+            if is_tok_mismatch or always_check_logprobs:
                logprobs_elem_0 = logprobs_0[idx]
                logprobs_elem_1 = logprobs_1[idx]

@@ -111,7 +120,7 @@ def check_logprobs_close(
                assert output_id_0 in logprobs_elem_1, fail_msg
                assert output_id_1 in logprobs_elem_0, fail_msg

-                if warn_on_mismatch:
+                if warn_on_mismatch and is_tok_mismatch:
                    with warnings.catch_warnings():
                        # This ensures that repeated warnings are shown
                        # in the output, not just the first occurrence

--- a/tests/multi_step/test_correctness.py
+++ b/tests/multi_step/test_correctness.py
 # Test the AsyncLLMEngine with multi-step-decoding

-from typing import List
+from typing import List, Optional

 import pytest

-from ..utils import RemoteOpenAIServer
+from ..models.utils import check_logprobs_close
+from ..utils import (completions_with_server_args, get_client_text_generations,
+                     get_client_text_logprob_generations)

 MODELS = [
    "JackFram/llama-160m",
@@ -23,22 +25,6 @@ DEFAULT_SERVER_ARGS: List[str] = [
 ]


-async def completions_with_server_args(prompts: List[str], model_name: str,
-                                       server_cli_args: List[str]):
-
-    outputs = None
-    with RemoteOpenAIServer(model_name, server_cli_args) as server:
-        client = server.get_async_client()
-        outputs = await client.completions.create(model=model_name,
-                                                  prompt=prompts,
-                                                  temperature=0,
-                                                  stream=False,
-                                                  max_tokens=5)
-    assert outputs is not None
-
-    return outputs
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize(("tp_size, pp_size"), [
    (1, 1),
@@ -47,10 +33,43 @@ async def completions_with_server_args(prompts: List[str], model_name: str,
 @pytest.mark.parametrize("eager_mode", [False, True])
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
+@pytest.mark.parametrize("num_logprobs", [None, 5])
+@pytest.mark.parametrize("is_async", [False, True])
 @pytest.mark.asyncio
-async def test_multi_step(example_prompts, model: str, tp_size: int,
-                          pp_size: int, eager_mode: int,
-                          num_scheduler_steps: int, num_prompts: int):
+async def test_multi_step(
+    example_prompts,
+    model: str,
+    tp_size: int,
+    pp_size: int,
+    eager_mode: int,
+    num_scheduler_steps: int,
+    num_prompts: int,
+    is_async: bool,
+    num_logprobs: Optional[int],
+) -> None:
+    """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
+    client/server environment.
+
+    Set up an engine with single-step scheduling as a ground-truth reference.
+
+    Send a completions API request to both engines with the same prompts.
+
+    Validate:
+    * Generated tokens match
+    * Generated logprobs are all very close
+
+    Args:
+      example_prompts: test fixture providing example prompts
+      model: model under test (same for single- and multi-step engines)
+      tp_size: degree of tensor-parallelism
+      pp_size: degree of pipeline-parallelism
+      eager_mode: if true, add `--enforce-eager` to the multi-step server args
+      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
+                           GPU -> CPU output transfer
+      num_prompts: number of example prompts under test
+      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
+                    completions endpoint; `None` -> no logprobs
+    """

    prompts = example_prompts
    if len(prompts) < num_prompts:
@@ -62,6 +81,9 @@ async def test_multi_step(example_prompts, model: str, tp_size: int,
    ms_server_args = DEFAULT_SERVER_ARGS + \
        ["--num-scheduler-steps", f"{num_scheduler_steps}"]

+    if not is_async:
+        ms_server_args += ["--disable-async-output-proc"]
+
    if eager_mode:
        ms_server_args.append("--enforce-eager")

@@ -72,14 +94,36 @@ async def test_multi_step(example_prompts, model: str, tp_size: int,
        str(pp_size),
    ]

+    # Spin up client/server & issue completion API requests.
+    # Default `max_wait_seconds` is 240 but was empirically
+    # raised 5x to 1200 *just for this test* due to
+    # observed timeouts in GHA CI
    ref_completions = await completions_with_server_args(
-        prompts, model, server_args + distributed_args)
+        prompts,
+        model,
+        server_args + distributed_args,
+        num_logprobs,
+        max_wait_seconds=5 * 240)
    test_completions = await completions_with_server_args(
-        prompts, model, ms_server_args + distributed_args)
-
-    def get_text_generations(completions):
-        return [x.text for x in completions.choices]
-
-    ref_generations = get_text_generations(ref_completions)
-    test_generations = get_text_generations(test_completions)
+        prompts,
+        model,
+        ms_server_args + distributed_args,
+        num_logprobs,
+        max_wait_seconds=5 * 240)
+
+    # Assert multi-step scheduling produces identical tokens
+    # to single-step scheduling.
+    ref_generations = get_client_text_generations(ref_completions)
+    test_generations = get_client_text_generations(test_completions)
    assert ref_generations == test_generations
+
+    # Assert multi-step scheduling produces nearly-identical logprobs
+    # to single-step scheduling.
+    ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
+    test_text_logprobs = get_client_text_logprob_generations(test_completions)
+    check_logprobs_close(
+        outputs_0_lst=ref_text_logprobs,
+        outputs_1_lst=test_text_logprobs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
+# Test the LLMEngine with multi-step-decoding
+
+from typing import Optional
+
+import pytest
+
+from ..models.utils import check_logprobs_close, check_outputs_equal
+
+MODELS = [
+    "JackFram/llama-160m",
+]
+NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
+NUM_PROMPTS = [10]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("tp_size", [1])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
+@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
+@pytest.mark.parametrize("num_logprobs", [None, 5])
+def test_multi_step_llm(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    tp_size: int,
+    max_tokens: int,
+    enforce_eager: bool,
+    num_scheduler_steps: int,
+    num_prompts: int,
+    num_logprobs: Optional[int],
+) -> None:
+    """Test vLLM engine with multi-step scheduling via sync LLM Engine.
+
+    Set up a HuggingFace (HF) transformers model as a ground-truth reference.
+
+    Prompt them with the same example prompts.
+
+    Validate:
+    * When `num_logprobs` is `None`: outputs are compared with
+      `check_outputs_equal`
+    * Otherwise: outputs are compared with `check_logprobs_close`
+
+    Args:
+      hf_runner: HF transformers model runner fixture
+      vllm_runner: vLLM model runner fixture
+      example_prompts: test fixture providing example prompts
+      model: model under test (same for the HF reference and the vLLM engine)
+      dtype: tensor datatype for engine to utilize
+      tp_size: degree of tensor-parallelism
+      max_tokens: the maximum number of tokens to generate
+      enforce_eager: forwarded to the vLLM runner's `enforce_eager` argument
+      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
+                           GPU -> CPU output transfer
+      num_prompts: number of example prompts under test
+      num_logprobs: number of logprobs requested from both runners;
+                    `None` -> plain greedy generation without logprobs
+    """
+
+    # Repeat the example prompts until there are at least `num_prompts`,
+    # then truncate to exactly `num_prompts`.
+    prompts = example_prompts
+    if len(prompts) < num_prompts:
+        prompts = prompts * ((num_prompts // len(prompts)) + 1)
+    prompts = prompts[:num_prompts]
+    assert len(prompts) == num_prompts
+
+    # vLLM runs first; its runner context is closed before HF is created.
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enforce_eager=enforce_eager,
+            gpu_memory_utilization=0.7,
+            tensor_parallel_size=tp_size,
+            use_v2_block_manager=True,
+            num_scheduler_steps=num_scheduler_steps,
+    ) as vllm_model:
+        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
+                        if num_logprobs is None else
+                        vllm_model.generate_greedy_logprobs(
+                            prompts, max_tokens, num_logprobs))
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
+                      if num_logprobs is None else
+                      hf_model.generate_greedy_logprobs_limit(
+                          prompts, max_tokens, num_logprobs))
+
+    # The comparison helper matches the generation mode used above.
+    if num_logprobs is None:
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+    else:
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
--- a/tests/multimodal/test_base.py
+++ b/tests/multimodal/test_base.py
+import torch
+
+from vllm.multimodal.base import MultiModalInputs, NestedTensors
+
+
+def assert_nested_tensors_equal(expected: NestedTensors,
+                                actual: NestedTensors):
+    """Assert that two nested tensor structures are identical.
+
+    Both arguments must have exactly the same type at every level.
+    Tensors are compared with `torch.equal`; any other value is treated
+    as a sequence whose items are compared pairwise, recursively.
+
+    Raises:
+      AssertionError: if the types, sequence lengths or tensor contents
+        differ anywhere in the structure.
+    """
+    assert type(expected) is type(actual)
+    if isinstance(expected, torch.Tensor):
+        assert torch.equal(expected, actual)
+    else:
+        # zip() stops at the shorter sequence, so compare lengths first;
+        # otherwise missing or extra trailing items would go unnoticed.
+        assert len(expected) == len(actual)
+        for expected_item, actual_item in zip(expected, actual):
+            assert_nested_tensors_equal(expected_item, actual_item)
+
+
+def assert_multimodal_inputs_equal(expected: MultiModalInputs,
+                                   actual: MultiModalInputs):
+    """Assert that two multimodal input mappings have equal keys and values.
+
+    The key sets must match, and the value under every key must satisfy
+    `assert_nested_tensors_equal`.
+    """
+    expected_keys = set(expected.keys())
+    actual_keys = set(actual.keys())
+    assert expected_keys == actual_keys
+    for key in expected:
+        expected_value, actual_value = expected[key], actual[key]
+        assert_nested_tensors_equal(expected_value, actual_value)
+
+
+def test_multimodal_input_batch_single_tensor():
+    """Batching one tensor is expected to add a leading dimension."""
+    tensor = torch.rand([1, 2])
+    batched = MultiModalInputs.batch([{"image": tensor}])
+    expected = {"image": tensor.unsqueeze(0)}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_multiple_tensors():
+    """Three same-shaped tensors are expected to be stacked into one."""
+    tensors = [torch.rand([1, 1, 2]) for _ in range(3)]
+    batched = MultiModalInputs.batch([{"image": t} for t in tensors])
+    expected = {"image": torch.stack(tensors)}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_multiple_heterogeneous_tensors():
+    """Tensors of different shapes are expected to stay a plain list."""
+    tensors = [torch.rand([1, rows, 2]) for rows in (2, 3, 4)]
+    batched = MultiModalInputs.batch([{"image": t} for t in tensors])
+    expected = {"image": tensors}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_nested_tensors():
+    """Single-item lists of same-shaped tensors are expected to stack fully."""
+    tensors = [torch.rand([2, 3]) for _ in range(3)]
+    batched = MultiModalInputs.batch([{"image": [t]} for t in tensors])
+    expected = {"image": torch.stack([t.unsqueeze(0) for t in tensors])}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_heterogeneous_lists():
+    """Lists of different lengths are expected to be stacked per item only."""
+    first, second, third = (torch.rand([1, 2, 3]) for _ in range(3))
+    batched = MultiModalInputs.batch([
+        {"image": [first, second]},
+        {"image": [third]},
+    ])
+    expected = {"image": [torch.stack([first, second]), third.unsqueeze(0)]}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_multiple_batchable_lists():
+    """Equal-length lists of same-shaped tensors are expected to stack fully."""
+    first, second, third, fourth = (torch.rand([1, 2, 3]) for _ in range(4))
+    batched = MultiModalInputs.batch([
+        {"image": [first, second]},
+        {"image": [third, fourth]},
+    ])
+    stacked_items = [torch.stack([first, second]), torch.stack([third, fourth])]
+    expected = {"image": torch.stack(stacked_items)}
+    assert_multimodal_inputs_equal(batched, expected)
+
+
+def test_multimodal_input_batch_mixed_stacking_depths():
+    """Check batching when only some items can be stacked.
+
+    In the expected results below, an item whose tensors differ in shape
+    stays a list, while a single-tensor item becomes a stacked tensor.
+    """
+    a = torch.rand([1, 2, 3])
+    b = torch.rand([1, 3, 3])
+    c = torch.rand([1, 4, 3])
+
+    # First item unstackable ([a, b]), second item stackable ([c]).
+    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
+    assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
+
+    # Mirror case: first item stackable ([a]), second unstackable ([b, c]).
+    result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}])
+    assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})