Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
 torchaudio
 torchvision

-vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl
+vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.4/vllm_xpu_kernels-0.1.4-cp38-abi3-manylinux_2_28_x86_64.whl
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,9 @@ elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is Non
    if torch.version.hip is not None:
        VLLM_TARGET_DEVICE = "rocm"
        logger.info("Auto-detected ROCm")
+    elif torch.version.xpu is not None:
+        VLLM_TARGET_DEVICE = "xpu"
+        logger.info("Auto-detected XPU")
    elif torch.version.cuda is not None:
        VLLM_TARGET_DEVICE = "cuda"
        logger.info("Auto-detected CUDA")
@@ -597,6 +600,7 @@ class precompiled_wheel_utils:
            with zipfile.ZipFile(wheel_path) as wheel:
                files_to_copy = [
                    "vllm/_C.abi3.so",
+                    "vllm/_C_stable_libtorch.abi3.so",
                    "vllm/_moe_C.abi3.so",
                    "vllm/_flashmla_C.abi3.so",
                    "vllm/_flashmla_extension_C.abi3.so",
@@ -932,6 +936,10 @@ if _is_cpu():

 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))
+    # TODO: also build this for _is_hip() once
+    # https://github.com/vllm-project/vllm/issues/35163 is fixed.
+    if _is_cuda():
+        ext_modules.append(CMakeExtension(name="vllm._C_stable_libtorch"))

 package_data = {
    "vllm": [
@@ -979,11 +987,11 @@ setup(
        "instanttensor": ["instanttensor >= 0.1.5"],
        "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
        "audio": [
-            "librosa",
+            "av",
+            "resampy",
            "scipy",
            "soundfile",
            "mistral_common[audio]",
-            "av",
        ],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
        "flashinfer": [],  # Kept for backwards compatibility

--- a/tests/benchmarks/test_random_multimodal_dataset_video.py
+++ b/tests/benchmarks/test_random_multimodal_dataset_video.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import base64
 import os
 from tempfile import NamedTemporaryFile
 from typing import Any, cast

 import cv2
+import pybase64 as base64
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase


--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -82,6 +82,13 @@ def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
                f"attention backend '{attn_backend.backend.name}'"
            )

+        # TODO: remove this after finishing migration from envs to model kwargs
+        if model_name == "openai/gpt-oss-20b":
+            from .common import is_blackwell
+
+            if is_blackwell():
+                monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
+
        # Disable, compile cache to make sure custom passes run.
        # Otherwise, we can't verify fusion happened through the logs.
        monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -162,3 +162,12 @@ deepseek_v3_fp8 = ModelFusionInfo(
        # async_tp=n_layers * 2,
    ),
 )
+
+gpt_oss_20b = ModelFusionInfo(
+    model_name="openai/gpt-oss-20b",
+    matches=lambda n_layers: Matches(
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,
+    ),
+)
--- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py
+++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py
@@ -20,6 +20,7 @@ from .models import (
    FLASHINFER_MLA_ATTN,
    TRITON_ATTN,
    deepseek_v3_fp8,
+    gpt_oss_20b,
    llama3_8b,
    llama3_8b_fp4,
    llama3_8b_fp8,
@@ -158,7 +159,7 @@ def test_tp2_ar_rms_fp4_fusions(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
-    [llama3_8b, qwen3_a3b],
+    [llama3_8b, qwen3_a3b, gpt_oss_20b],
 )
 @pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
 @pytest.mark.parametrize("n_layers", [4])

--- a/tests/compile/passes/test_rope_kvcache_fusion.py
+++ b/tests/compile/passes/test_rope_kvcache_fusion.py
@@ -295,7 +295,7 @@ def test_rope_kvcache_fusion(
            }
            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
-            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
+            kv_cache_unfused = attn_layer.kv_cache[0]
        del dummy

        torch._dynamo.mark_dynamic(qkv, 0)
@@ -309,7 +309,7 @@ def test_rope_kvcache_fusion(
            }
            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
-            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
+            kv_cache_fused = attn_layer.kv_cache[0]
        del dummy

        assert fusion_pass.matched_count == 1

--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -14,6 +14,7 @@ from unittest.mock import Mock, patch
 import pytest
 import torch

+import vllm.envs as envs
 import vllm.model_executor.layers.activation
 from vllm.compilation.backends import VllmBackend
 from vllm.compilation.caching import (
@@ -162,6 +163,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):

 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
 def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
+    from torch._subclasses import FakeTensorMode
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
    def foo(x: torch.Tensor):
        return x[slice(0, x.shape[0])]

@@ -172,12 +176,13 @@ def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
    gm = torch.fx.symbolic_trace(foo)
    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code
    with use_vllm_config(vllm_config):
-        payload = VllmSerializableFunction.serialize_compile_artifacts(
-            VllmSerializableFunction(gm, (example_input,), "", foo)
+        payload = VllmSerializableFunction.serialize_graph_module(gm)
+        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
+        loaded_gm = VllmSerializableFunction.deserialize_graph_module(
+            payload, fake_mode
        )
-        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)

-    assert gm.code == fn.graph_module.code
+    assert gm.code == loaded_gm.code


 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
@@ -725,6 +730,10 @@ class TestStandaloneCompiledArtifactsIntegration:
        ]:
            assert cache.get(submod, shape) == shared_data

+    @pytest.mark.skipif(
+        envs.VLLM_USE_MEGA_AOT_ARTIFACT,
+        reason="There's no AOT Autograd run with mega artifact",
+    )
    def test_functorch_config(self):
        vllm_config = make_vllm_config()
        example_inputs = (torch.randn(10, 10),)

--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -23,8 +23,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer

 def get_test_models():
    """Get list of models to test based on PyTorch version"""
-    # TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
-    return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
+    models = [
+        "gpt2",
+        "Qwen/Qwen2-7B-Instruct",
+        "meta-llama/Llama-3.1-8B",
+    ]
+    if is_torch_equal_or_newer("2.12.0"):
+        models.append("Qwen/Qwen3-4B-Instruct-2507")
+    return models


 @pytest.mark.parametrize("model_name", get_test_models())

--- a/tests/compile/test_graph_partition.py
+++ b/tests/compile/test_graph_partition.py
@@ -5,6 +5,8 @@ import operator

 import pytest
 import torch
+import torch._dynamo
+import torch.fx as fx
 from torch.fx.experimental.proxy_tensor import make_fx

 from vllm.compilation.backends import _is_empty_allocation_node, split_graph
@@ -327,3 +329,296 @@ def test_builtin_empty_only_partition_is_merged():
    output_original = gm(x)
    output_split = split_gm(x)
    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_sym_size_whole_shape_boundary():
+    """
+    Test that using x.size() (whole shape) across a split boundary can be
+    compiled by standalone_compile.
+
+    The dynamo graph looks like:
+        shape = x.size()
+        y = sigmoid(x)          # split point
+        z = y.clone().view(shape)
+
+    Which splits into:
+        subgraph0(x) -> shape          # returns torch.Size — problematic
+        subgraph1(x) -> y              # sigmoid
+        subgraph2(y, shape) -> z       # view
+
+    Two approaches to fix the torch.Size crossing:
+
+    Approach 1 — move sym_size to consumer (memory implication: x passed to
+    subgraph2 just for .size()):
+        subgraph0(x) ->                # empty
+        subgraph1(x) -> y
+        subgraph2(y, x) -> z           # computes shape locally from x
+
+    Approach 2 — decompose shape into individual int/SymInt values:
+        subgraph0(x) -> s0, val        # returns individual scalars, not Size
+        subgraph1(x) -> y
+        subgraph2(y, s0, val) -> z     # reconstructs view args from scalars
+    """
+    from torch._inductor import standalone_compile
+
+    captured_graph = None
+
+    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
+        nonlocal captured_graph
+        captured_graph = gm
+        return gm
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        shape = x.size()
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(shape)
+        return x
+
+    x = torch.randn(4, 8)
+    torch._dynamo.mark_dynamic(x, 0)
+    compiled_fn = torch.compile(model_fn, backend=capturing_backend)
+    compiled_fn(x)
+
+    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
+    assert len(split_items) == 3
+
+    submod_0 = split_gm.submod_0
+    example_input = torch.randn(4, 8)
+    compiled = standalone_compile(
+        submod_0, [example_input, 4], dynamic_shapes="from_example_inputs"
+    )
+    assert compiled is not None
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_symint_crosses_split_boundary():
+    """
+    Test that SymInt placeholders from torch.compile + mark_dynamic
+    cross split boundaries safely via split_module's natural threading.
+
+    SymInt values are threaded through subgraphs by split_module and
+    handled correctly by inductor — no special replacement is needed.
+    """
+    captured_graph = None
+
+    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
+        nonlocal captured_graph
+        captured_graph = gm
+        return gm
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        batch_size = x.shape[0]
+        hidden_size = x.shape[1]
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(batch_size, hidden_size)
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(batch_size, hidden_size)
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(batch_size, hidden_size)
+        return x
+
+    x = torch.randn(4, 8)
+    torch._dynamo.mark_dynamic(x, 0)
+
+    compiled_fn = torch.compile(model_fn, backend=capturing_backend)
+    compiled_fn(x)
+
+    assert captured_graph is not None, "Graph should be captured by backend"
+
+    # SymInt placeholders should exist in the captured graph
+    symint_placeholders = [
+        node
+        for node in captured_graph.graph.nodes
+        if node.op == "placeholder"
+        and isinstance(node.meta.get("example_value"), torch.SymInt)
+    ]
+    assert len(symint_placeholders) > 0, (
+        "Captured graph should have SymInt placeholders from mark_dynamic."
+    )
+
+    # split_graph should handle SymInt placeholders without error
+    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
+
+    # Should have 3 splitting subgraphs (3 sigmoids)
+    splitting_subgraphs = [item for item in split_items if item.is_splitting_graph]
+    assert len(splitting_subgraphs) == 3, (
+        f"Expected 3 splitting subgraphs (3 sigmoids), got {len(splitting_subgraphs)}"
+    )
+    assert len(split_items) >= 6, (
+        f"Expected at least 6 total subgraphs, got {len(split_items)}"
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_shape_boundary_standalone_compile():
+    """
+    Repro for the original production bug:
+
+        AssertionError: out_spec mismatch
+        TreeSpec(tuple, None, [*, *, TreeSpec(Size, None, [*, *]), *])
+        vs
+        TreeSpec(tuple, None, [*, *, *, *])
+
+    A subgraph outputs torch.Size (e.g. torch.Size([s72, 2048])) as one of
+    its values when shape info crosses a split boundary. aot_autograd / inductor
+    expect all submodule outputs to be flat tensors or scalars, not torch.Size.
+
+    With the fix, x.size() is decomposed into individual sym_size.int calls
+    so only scalar SymInts cross the boundary — not the torch.Size.
+    """
+    from torch._inductor import standalone_compile
+
+    captured_graph = None
+
+    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
+        nonlocal captured_graph
+        captured_graph = gm
+        return gm
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        shape = x.size()
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(shape)
+        return x
+
+    x = torch.randn(4, 8)
+    torch._dynamo.mark_dynamic(x, 0)
+    torch.compile(model_fn, backend=capturing_backend)(x)
+
+    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
+    assert len(split_items) == 3
+
+    # Verify that the consumer subgraph only has a placeholder for the dynamic
+    # dim (SymInt) — the static dim (8) should be inlined as a literal, not
+    # threaded as a placeholder.
+    consumer = split_items[-1]  # valid since len == 3: [producer, sigmoid, consumer]
+    symint_placeholders = [
+        n
+        for n in consumer.graph.graph.nodes
+        if n.op == "placeholder"
+        and isinstance(n.meta.get("example_value"), torch.SymInt)
+    ]
+    static_int_placeholders = [
+        n
+        for n in consumer.graph.graph.nodes
+        if n.op == "placeholder"
+        and isinstance(n.meta.get("example_value"), int)
+        and not isinstance(n.meta.get("example_value"), torch.SymInt)
+    ]
+    assert len(symint_placeholders) >= 1, (
+        "Consumer should have a SymInt placeholder for the dynamic dim."
+    )
+    assert len(static_int_placeholders) == 0, (
+        "Static dims should be inlined as literals, not threaded as placeholders."
+    )
+
+    submod_0 = split_gm.submod_0
+
+    standalone_compile(
+        submod_0, [torch.randn(4, 8), 4], dynamic_shapes="from_example_inputs"
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_size_used_in_multiple_consumer_subgraphs():
+    """
+    Validates that x.size() (whole shape) used by multiple downstream subgraphs
+    does not cause torch.Size to cross split boundaries.
+
+    Model:
+        shape = x.size()          # whole shape — must not cross as torch.Size
+        z1 = sigmoid(x)           # split point 1
+        y1 = y.view(shape)        # consumer 1 uses shape
+        z2 = sigmoid(z1)          # split point 2
+        y2 = y.view(shape)        # consumer 2 uses shape again
+
+    Without the fix, torch.Size crosses the boundary as a submodule output,
+    which aot_autograd / standalone_compile rejects.
+    """
+    captured_graph = None
+    captured_inputs = None
+
+    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
+        nonlocal captured_graph, captured_inputs
+        captured_graph = gm
+        captured_inputs = example_inputs
+        return gm
+
+    def model_fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        shape = x.size()
+        z1 = torch.ops.aten.sigmoid.default(x)
+        y1 = y.view(shape)
+        z2 = torch.ops.aten.sigmoid.default(z1)
+        y2 = y.view(shape)
+        return z2 + y1 + y2
+
+    x = torch.randn(4, 8)
+    y = torch.randn(4, 8)  # same shape as x so view(shape) doesn't specialize dim 0
+    torch._dynamo.mark_dynamic(x, 0)
+    torch._dynamo.mark_dynamic(y, 0)
+    torch.compile(model_fn, backend=capturing_backend)(x, y)
+
+    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
+
+    splitting_items = [item for item in split_items if item.is_splitting_graph]
+    assert len(splitting_items) == 2
+
+    # Verify functional correctness — fails without the fix because torch.Size
+    # would cross a split boundary as a submodule output
+    output_original = model_fn(x, y)
+    output_split = split_gm(*captured_inputs)
+    if isinstance(output_split, tuple):
+        output_split = next(o for o in output_split if isinstance(o, torch.Tensor))
+    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
+def test_sym_size_metadata_propagated():
+    """
+    Validates that new sym_size.int nodes created by the pre-pass have
+    example_value metadata set. Without it, placeholder metadata in consumer
+    subgraphs would be None, breaking any code that dynamically builds
+    example inputs from metadata (e.g. standalone_compile per-submodule).
+    """
+    from torch._inductor import standalone_compile
+
+    captured_graph = None
+
+    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
+        nonlocal captured_graph
+        captured_graph = gm
+        return gm
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        shape = x.size()
+        x = torch.ops.aten.sigmoid.default(x)
+        x = x.clone().view(shape)
+        return x
+
+    x = torch.randn(4, 8)
+    torch._dynamo.mark_dynamic(x, 0)
+    torch.compile(model_fn, backend=capturing_backend)(x)
+
+    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
+
+    # For each submodule, build example inputs purely from placeholder metadata.
+    # This fails if example_value is None on any placeholder (i.e. metadata
+    # was not propagated to the sym_size.int nodes we created).
+    for item in split_items:
+        submod = item.graph
+        example_inputs = []
+        for n in submod.graph.nodes:
+            if n.op != "placeholder":
+                continue
+            ev = n.meta.get("example_value")
+            assert ev is not None, (
+                f"Placeholder '{n.name}' in {item.submod_name} has no "
+                "example_value metadata. sym_size.int nodes must propagate "
+                "metadata so consumer subgraphs can be introspected."
+            )
+            if isinstance(ev, torch.Tensor):
+                example_inputs.append(torch.randn(*(int(d) for d in ev.shape)))
+            else:
+                example_inputs.append(int(ev))
+        standalone_compile(submod, example_inputs, dynamic_shapes="from_example_inputs")
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.

 import multiprocessing as mp

+import pytest
 from torch._dynamo.utils import counters

+import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

+from ..utils import fork_new_process_for_each_test
+
 MODEL = "microsoft/Phi-tiny-MoE-instruct"


@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
    assert counters["aot_autograd"]["autograd_cache_hit"] == 0


-def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
        num_compiled_artifacts_saved=0,
    ):
        _run_vllm(vllm_runner)
-    assert counters["aot_autograd"]["total"] == 30
+    if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
+        # VLLM_USE_MEGA_AOT_ARTIFACT is enabled, so aot_autograd is not
+        # expected to run on any subgraph.
+        assert counters["aot_autograd"]["total"] == 0
+    else:
+        assert counters["aot_autograd"]["total"] == 30
    assert counters["aot_autograd"]["autograd_cache_miss"] == 0
    assert (
        counters["aot_autograd"]["autograd_cache_hit"] == 0

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,6 @@ from copy import deepcopy

 from tblib import pickling_support

-# Import fixture
-from tests.v1.entrypoints.conftest import sample_json_schema  # noqa
-
 # ruff: noqa

 # Install support for pickling exceptions so that we can nicely propagate
@@ -81,6 +78,55 @@ if TYPE_CHECKING:

 logger = init_logger(__name__)

+
+@pytest.fixture
+def sample_json_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+            "skills": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "grade": {
+                "type": "string",
+                "pattern": "^[A-D]$",
+            },
+            "email": {
+                "type": "string",
+                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
+            },
+            "work_history": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "company": {"type": "string"},
+                        "duration": {
+                            "type": "number",
+                            "minimum": 0.0,
+                            "maximum": 100.0,
+                        },
+                        "position": {"type": "string"},
+                    },
+                    "required": ["company", "duration", "position"],
+                    "additionalProperties": False,
+                },
+                "minItems": 0,
+                "maxItems": 3,
+            },
+        },
+        "required": ["name", "age", "skills", "grade", "email", "work_history"],
+        "additionalProperties": False,
+        "minProperties": 1,
+        "maxProperties": 10,
+    }
+
+
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
+from tests.entrypoints.openai.chat_completion.test_oot_registration import (
+    run_and_test_dummy_opt_api_server,
+)


 def test_distributed_oot(dummy_opt_path: str):

--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@@ -5,6 +5,7 @@ import numpy as np
 import pytest
 import torch

+from vllm.distributed.eplb.eplb_state import compute_logical_maps
 from vllm.distributed.eplb.policy.default import DefaultEplbPolicy


@@ -24,9 +25,10 @@ def test_basic_rebalance():
    num_nodes = 2
    num_gpus = 8

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    log2phy, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify output shapes
    assert phy2log.shape == (
@@ -78,9 +80,10 @@ def test_single_gpu_case():
    num_nodes = 1
    num_gpus = 1

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    log2phy, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify shapes
    assert phy2log.shape == (1, 4)
@@ -100,9 +103,10 @@ def test_equal_weights():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify shapes
    assert phy2log.shape == (1, 8)
@@ -123,9 +127,10 @@ def test_extreme_weight_imbalance():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify shapes
    assert phy2log.shape == (1, 12)
@@ -151,9 +156,10 @@ def test_multiple_layers():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify shapes
    assert phy2log.shape == (3, 8)
@@ -176,7 +182,8 @@ def test_parameter_validation():
    # Test non-divisible case - this should handle normally without throwing
    # errors because the function will fall back to global load balancing
    # strategy
-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
+    phy2log = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
    assert phy2log.shape == (1, 8)
    assert logcnt.shape == (1, 4)

@@ -198,9 +205,10 @@ def test_small_scale_hierarchical():
    num_nodes = 2  # 2 nodes
    num_gpus = 4  # 4 GPUs

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Verify basic constraints
    assert phy2log.shape == (1, 12)
@@ -225,9 +233,10 @@ def test_global_load_balance_fallback():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Should work normally, just using global load balancing strategy
    assert phy2log.shape == (1, 8)
@@ -247,9 +256,10 @@ def test_device_compatibility(device):
    num_nodes = 1
    num_gpus = 2

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
+    _, logcnt = compute_logical_maps(phy2log, weight.shape[-1])

    # Function will convert to CPU internally, but should handle different
    # device inputs normally
@@ -264,9 +274,8 @@ def test_additional_cases():
    weight1 = torch.tensor(
        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
    )
-    phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
-        weight1, 24, 8, 4, 8
-    )
+    phy2log1 = DefaultEplbPolicy.rebalance_experts(weight1, 24, 8, 4, 8)
+    _, logcnt1 = compute_logical_maps(phy2log1, weight1.shape[-1])

    assert phy2log1.shape == (1, 24)
    assert logcnt1.shape == (1, 16)
@@ -279,9 +288,8 @@ def test_additional_cases():
            [12, 25, 50, 100, 150, 200],  # Increasing weights
        ]
    )
-    phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
-        weight2, 10, 3, 1, 2
-    )
+    phy2log2 = DefaultEplbPolicy.rebalance_experts(weight2, 10, 3, 1, 2)
+    _, logcnt2 = compute_logical_maps(phy2log2, weight2.shape[-1])

    assert phy2log2.shape == (2, 10)
    assert logcnt2.shape == (2, 6)
@@ -292,6 +300,42 @@ def test_additional_cases():
        assert logcnt2[layer, max_weight_idx] >= 2


+def test_compute_logical_maps_with_negative_indices():
+    """
+    Test that compute_logical_maps correctly handles physical slots containing
+    -1 (unused slots).
+    """
+    # 2 layers, 6 physical slots, 4 logical experts.
+    # Slots 2 and 5 are unused (-1).
+    phy2log = torch.tensor(
+        [
+            [0, 1, -1, 2, 3, -1],
+            [3, -1, 2, 1, 0, -1],
+        ]
+    )
+    num_layers = 2
+    num_logical_experts = 4
+
+    log2phy, logcnt = compute_logical_maps(phy2log, num_logical_experts)
+
+    assert logcnt.shape == (num_layers, num_logical_experts)
+    assert log2phy.shape == (num_layers, num_logical_experts, 1)
+
+    expected_logcnt = torch.ones(num_layers, num_logical_experts, dtype=phy2log.dtype)
+    assert torch.all(logcnt == expected_logcnt), (
+        f"Expected that all replica counts == 1, got {logcnt}"
+    )
+
+    assert torch.all(log2phy >= 0), (
+        "log2phy should only contain valid physical indices, not -1"
+    )
+
+    assert log2phy[0, 0, 0] == 0
+    assert log2phy[0, 1, 0] == 1
+    assert log2phy[0, 2, 0] == 3
+    assert log2phy[0, 3, 0] == 4
+
+
 if __name__ == "__main__":
    weight = torch.tensor(
        [
@@ -305,7 +349,7 @@ if __name__ == "__main__":
    num_nodes = 2
    num_gpus = 8

-    phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
+    phy2log = DefaultEplbPolicy.rebalance_experts(
        weight, num_replicas, num_groups, num_nodes, num_gpus
    )
    print(phy2log)
@@ -434,9 +478,10 @@ def test_preserve_intragpu_slots(
    """Experts that stay on a GPU keep their old slots; incoming not lost."""
    phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(new_phy2log)

-    post_phy2log, post_phy_replicas_idx = DefaultEplbPolicy.preserve_intragpu_slots(
-        new_phy2log, phy_replicas_idx, num_ranks, old_phy2log
+    post_phy2log = DefaultEplbPolicy.preserve_intragpu_slots(
+        new_phy2log, num_ranks, old_phy2log
    )
+    post_phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(post_phy2log)

    # Shapes preserved
    assert post_phy2log.shape == new_phy2log.shape

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -319,9 +319,6 @@ def _compare_tp(
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
-        # Temporary. Currently when zeromq + SPMD is used, it does not properly
-        # terminate because of a Ray Compiled Graph issue.
-        common_args.append("--disable-frontend-multiprocessing")
    elif distributed_backend == "mp":
        pp_env = None
    else:

--- a/tests/distributed/test_weight_transfer.py
+++ b/tests/distributed/test_weight_transfer.py
@@ -6,10 +6,10 @@ Unit tests for engine classes (parsing, validation, registry).
 Integration tests for NCCL and IPC weight transfer between processes using Ray.
 """

-import base64
 import pickle
 from unittest.mock import MagicMock

+import pybase64 as base64
 import pytest
 import ray
 import torch

--- a/tests/entrypoints/openai/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@@ -5,7 +5,7 @@ import anthropic
 import pytest
 import pytest_asyncio

-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer

 MODEL_NAME = "Qwen/Qwen3-0.6B"


--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -4,12 +4,11 @@ import weakref

 import pytest

+from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.sampling_params import SamplingParams

-from ..openai.test_vision import TEST_IMAGE_ASSETS
-

 @pytest.fixture(scope="function")
 def text_llm():

--- a/tests/entrypoints/llm/test_mm_cache_stats.py
+++ b/tests/entrypoints/llm/test_mm_cache_stats.py
@@ -6,13 +6,12 @@ import logging
 import pytest
 import regex as re

+from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS
 from vllm import LLM
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.v1.metrics import loggers as stat_loggers
 from vllm.v1.metrics.reader import Counter, Metric

-from ..openai.test_vision import TEST_IMAGE_ASSETS
-

 def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
    return [

--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -24,6 +24,108 @@ from vllm.sampling_params import (
    StructuredOutputsParams,
 )

+SAMPLE_REGEX = (
+    r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+    r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+)
+
+# Note: Ensure this only uses JSON Schema keywords compatible with xgrammar
+SAMPLE_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer"},
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+            },
+        },
+        "grade": {
+            "type": "string",
+            "pattern": "^[A-D]$",  # Regex pattern
+        },
+        "email": {
+            "type": "string",
+            "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
+        },
+        "work_history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {"type": "string"},
+                    "duration": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 100.0,  # Numeric range
+                    },
+                    "position": {"type": "string"},
+                },
+                "required": ["company", "duration", "position"],
+                "additionalProperties": False,
+            },
+            "minItems": 0,
+            "maxItems": 3,
+        },
+    },
+    "required": ["name", "age", "skills", "grade", "email", "work_history"],
+    "additionalProperties": False,
+    "minProperties": 1,
+    "maxProperties": 10,
+}
+
+# A schema unsupported by xgrammar
+UNSUPPORTED_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "integer",
+            "multipleOf": 5,  # Numeric multiple
+        },
+        "tags": {
+            "type": "array",
+            "items": {"type": "string", "minLength": 10, "maxLength": 20},
+        },
+    },
+    "required": ["score", "tags"],
+    "additionalProperties": False,
+    "patternProperties": {
+        "^score$": {"type": "integer"},
+    },
+}
+
+SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [
+    "Python",
+    "Java",
+    "JavaScript",
+    "C++",
+    "C#",
+    "PHP",
+    "TypeScript",
+    "Ruby",
+    "Swift",
+    "Kotlin",
+]
+
+SAMPLE_SQL_EBNF = """
+root ::= select_statement
+select_statement ::= "SELECT" column "from" table "where" condition
+column ::= "col_1" | "col_2"
+table ::= "table_1" | "table_2"
+condition ::= column "=" number
+number ::= "1" | "2"
+"""
+
+SAMPLE_SQL_LARK = """
+start: select_statement
+select_statement: "SELECT" column "from" table "where" condition
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+number: "1" | "2"
+"""
+
 NGRAM_SPEC_CONFIG = {
    "model": "[ngram]",
    "num_speculative_tokens": 5,
@@ -110,17 +212,17 @@ class CarDescription(BaseModel):
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
 )
 def test_structured_output(
-    sample_json_schema: dict[str, Any],
-    unsupported_json_schema: dict[str, Any],
-    sample_sql_ebnf: str,
-    sample_sql_lark: str,
-    sample_regex: str,
-    sample_structured_outputs_choices: str,
    backend: str,
    tokenizer_mode: str,
    model_name: str,
    speculative_config: dict[str, Any],
 ):
+    sample_json_schema = SAMPLE_JSON_SCHEMA
+    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
+    sample_sql_ebnf = SAMPLE_SQL_EBNF
+    sample_sql_lark = SAMPLE_SQL_LARK
+    sample_regex = SAMPLE_REGEX
+    sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

@@ -702,10 +804,10 @@ def test_structured_output_with_reasoning_matrices(

 @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
 ):
+    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
    llm = LLM(
        model=model_name,
        max_model_len=1024,
@@ -808,9 +910,9 @@ def test_guidance_no_additional_properties():

 @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    sample_json_schema: dict[str, Any],
    backend: str,
 ):
+    sample_json_schema = SAMPLE_JSON_SCHEMA
    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = bool(not current_platform.is_tpu())