Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -15,4 +15,4 @@ torch==2.10.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.4/vllm_xpu_kernels-0.1.4-cp38-abi3-manylinux_2_28_x86_64.whl
......@@ -54,6 +54,9 @@ elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is Non
if torch.version.hip is not None:
VLLM_TARGET_DEVICE = "rocm"
logger.info("Auto-detected ROCm")
elif torch.version.xpu is not None:
VLLM_TARGET_DEVICE = "xpu"
logger.info("Auto-detected XPU")
elif torch.version.cuda is not None:
VLLM_TARGET_DEVICE = "cuda"
logger.info("Auto-detected CUDA")
......@@ -597,6 +600,7 @@ class precompiled_wheel_utils:
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_C_stable_libtorch.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/_flashmla_extension_C.abi3.so",
......@@ -932,6 +936,10 @@ if _is_cpu():
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
# also _is_hip() once https://github.com/vllm-project/vllm/issues/35163 is
# fixed
if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm._C_stable_libtorch"))
package_data = {
"vllm": [
......@@ -979,11 +987,11 @@ setup(
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [
"librosa",
"av",
"resampy",
"scipy",
"soundfile",
"mistral_common[audio]",
"av",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
from tempfile import NamedTemporaryFile
from typing import Any, cast
import cv2
import pybase64 as base64
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
......
......@@ -82,6 +82,13 @@ def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
f"attention backend '{attn_backend.backend.name}'"
)
# TODO: remove this after finishing migration from envs to model kwargs
if model_name == "openai/gpt-oss-20b":
from .common import is_blackwell
if is_blackwell():
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
# Disable the compile cache to make sure custom passes run.
# Otherwise, we can't verify fusion happened through the logs.
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
......
......@@ -162,3 +162,12 @@ deepseek_v3_fp8 = ModelFusionInfo(
# async_tp=n_layers * 2,
),
)
# Fusion expectations for the "openai/gpt-oss-20b" model: each expected match
# count is expressed as a function of the number of layers under test.
gpt_oss_20b = ModelFusionInfo(
model_name="openai/gpt-oss-20b",
matches=lambda n_layers: Matches(
ar_rms_fusion=n_layers * 2 + 1,
sequence_parallel=n_layers * 2 + 1,
async_tp=n_layers * 2,
),
)
......@@ -20,6 +20,7 @@ from .models import (
FLASHINFER_MLA_ATTN,
TRITON_ATTN,
deepseek_v3_fp8,
gpt_oss_20b,
llama3_8b,
llama3_8b_fp4,
llama3_8b_fp8,
......@@ -158,7 +159,7 @@ def test_tp2_ar_rms_fp4_fusions(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model_name, matches_fn, model_kwargs, hf_overrides",
[llama3_8b, qwen3_a3b],
[llama3_8b, qwen3_a3b, gpt_oss_20b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
......
......@@ -295,7 +295,7 @@ def test_rope_kvcache_fusion(
}
q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
attn_layer = forward_context.no_compile_layers[model.layer_name]
kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
kv_cache_unfused = attn_layer.kv_cache[0]
del dummy
torch._dynamo.mark_dynamic(qkv, 0)
......@@ -309,7 +309,7 @@ def test_rope_kvcache_fusion(
}
q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
attn_layer = forward_context.no_compile_layers[model.layer_name]
kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
kv_cache_fused = attn_layer.kv_cache[0]
del dummy
assert fusion_pass.matched_count == 1
......
......@@ -14,6 +14,7 @@ from unittest.mock import Mock, patch
import pytest
import torch
import vllm.envs as envs
import vllm.model_executor.layers.activation
from vllm.compilation.backends import VllmBackend
from vllm.compilation.caching import (
......@@ -162,6 +163,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
from torch._subclasses import FakeTensorMode
from torch.fx.experimental.symbolic_shapes import ShapeEnv
def foo(x: torch.Tensor):
return x[slice(0, x.shape[0])]
......@@ -172,12 +176,13 @@ def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
gm = torch.fx.symbolic_trace(foo)
assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code
with use_vllm_config(vllm_config):
payload = VllmSerializableFunction.serialize_compile_artifacts(
VllmSerializableFunction(gm, (example_input,), "", foo)
payload = VllmSerializableFunction.serialize_graph_module(gm)
fake_mode = FakeTensorMode(shape_env=ShapeEnv())
loaded_gm = VllmSerializableFunction.deserialize_graph_module(
payload, fake_mode
)
fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
assert gm.code == fn.graph_module.code
assert gm.code == loaded_gm.code
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
......@@ -725,6 +730,10 @@ class TestStandaloneCompiledArtifactsIntegration:
]:
assert cache.get(submod, shape) == shared_data
@pytest.mark.skipif(
envs.VLLM_USE_MEGA_AOT_ARTIFACT,
reason="There's no AOT Autograd run with mega artifact",
)
def test_functorch_config(self):
vllm_config = make_vllm_config()
example_inputs = (torch.randn(10, 10),)
......
......@@ -23,8 +23,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
def get_test_models():
"""Get list of models to test based on PyTorch version"""
# TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
models = [
"gpt2",
"Qwen/Qwen2-7B-Instruct",
"meta-llama/Llama-3.1-8B",
]
if is_torch_equal_or_newer("2.12.0"):
models.append("Qwen/Qwen3-4B-Instruct-2507")
return models
@pytest.mark.parametrize("model_name", get_test_models())
......
......@@ -5,6 +5,8 @@ import operator
import pytest
import torch
import torch._dynamo
import torch.fx as fx
from torch.fx.experimental.proxy_tensor import make_fx
from vllm.compilation.backends import _is_empty_allocation_node, split_graph
......@@ -327,3 +329,296 @@ def test_builtin_empty_only_partition_is_merged():
output_original = gm(x)
output_split = split_gm(x)
assert torch.allclose(output_original, output_split), "Output mismatch after split"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_sym_size_whole_shape_boundary():
    """
    Test that using x.size() (whole shape) across a split boundary can be
    compiled by standalone_compile.

    The dynamo graph looks like:
        shape = x.size()
        y = sigmoid(x)              # split point
        z = y.clone().view(shape)

    Which splits into:
        subgraph0(x) -> shape       # returns torch.Size — problematic
        subgraph1(x) -> y           # sigmoid
        subgraph2(y, shape) -> z    # view

    Two approaches to fix the torch.Size crossing:

    Approach 1 — move sym_size to consumer (memory implication: x passed to
    subgraph2 just for .size()):
        subgraph0(x) ->             # empty
        subgraph1(x) -> y
        subgraph2(y, x) -> z        # computes shape locally from x

    Approach 2 — decompose shape into individual int/SymInt values:
        subgraph0(x) -> s0, val     # returns individual scalars, not Size
        subgraph1(x) -> y
        subgraph2(y, s0, val) -> z  # reconstructs view args from scalars
    """
    from torch._inductor import standalone_compile

    captured_graph = None

    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
        # Record the dynamo-traced graph and run it unmodified.
        nonlocal captured_graph
        captured_graph = gm
        return gm

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        shape = x.size()
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(shape)
        return x

    x = torch.randn(4, 8)
    torch._dynamo.mark_dynamic(x, 0)
    compiled_fn = torch.compile(model_fn, backend=capturing_backend)
    compiled_fn(x)

    # Fail with a clear message if the backend was never invoked, instead of
    # passing None into split_graph.
    assert captured_graph is not None, "Graph should be captured by backend"

    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
    assert len(split_items) == 3

    submod_0 = split_gm.submod_0
    example_input = torch.randn(4, 8)
    # NOTE(review): the second example input (4) presumably feeds the SymInt
    # placeholder for the dynamic dim 0 of the example tensor — confirm
    # against submod_0's placeholder order.
    compiled = standalone_compile(
        submod_0, [example_input, 4], dynamic_shapes="from_example_inputs"
    )
    assert compiled is not None
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_symint_crosses_split_boundary():
    """
    Check that SymInt placeholders produced by torch.compile + mark_dynamic
    survive graph splitting.

    split_module threads SymInt values through the subgraphs on its own, and
    inductor handles them correctly, so no special replacement is required.
    """
    captured_graph = None

    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
        # Keep a handle on the traced graph; execution is left untouched.
        nonlocal captured_graph
        captured_graph = gm
        return gm

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        batch_size = x.shape[0]
        hidden_size = x.shape[1]
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(batch_size, hidden_size)
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(batch_size, hidden_size)
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(batch_size, hidden_size)
        return x

    def is_symint_placeholder(node: fx.Node) -> bool:
        # A placeholder whose recorded example value is a symbolic integer.
        if node.op != "placeholder":
            return False
        return isinstance(node.meta.get("example_value"), torch.SymInt)

    sample = torch.randn(4, 8)
    torch._dynamo.mark_dynamic(sample, 0)
    torch.compile(model_fn, backend=capturing_backend)(sample)

    assert captured_graph is not None, "Graph should be captured by backend"

    # The captured graph must carry at least one SymInt placeholder.
    symint_placeholders = list(
        filter(is_symint_placeholder, captured_graph.graph.nodes)
    )
    assert len(symint_placeholders) > 0, (
        "Captured graph should have SymInt placeholders from mark_dynamic."
    )

    # Splitting must cope with those SymInt placeholders without raising.
    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])

    # One splitting subgraph per sigmoid call.
    splitting_subgraphs = []
    for item in split_items:
        if item.is_splitting_graph:
            splitting_subgraphs.append(item)
    assert len(splitting_subgraphs) == 3, (
        f"Expected 3 splitting subgraphs (3 sigmoids), got {len(splitting_subgraphs)}"
    )
    assert len(split_items) >= 6, (
        f"Expected at least 6 total subgraphs, got {len(split_items)}"
    )
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_shape_boundary_standalone_compile():
    """
    Repro for the original production bug:

        AssertionError: out_spec mismatch
        TreeSpec(tuple, None, [*, *, TreeSpec(Size, None, [*, *]), *])
        vs
        TreeSpec(tuple, None, [*, *, *, *])

    A subgraph outputs torch.Size (e.g. torch.Size([s72, 2048])) as one of
    its values when shape info crosses a split boundary. aot_autograd / inductor
    expect all submodule outputs to be flat tensors or scalars, not torch.Size.

    With the fix, x.size() is decomposed into individual sym_size.int calls
    so only scalar SymInts cross the boundary — not the torch.Size.
    """
    from torch._inductor import standalone_compile

    captured_graph = None

    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
        # Record the dynamo-traced graph and run it unmodified.
        nonlocal captured_graph
        captured_graph = gm
        return gm

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        shape = x.size()
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(shape)
        return x

    x = torch.randn(4, 8)
    torch._dynamo.mark_dynamic(x, 0)
    torch.compile(model_fn, backend=capturing_backend)(x)

    # Fail with a clear message if the backend was never invoked, instead of
    # passing None into split_graph.
    assert captured_graph is not None, "Graph should be captured by backend"

    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])
    assert len(split_items) == 3

    # Verify that the consumer subgraph only has a placeholder for the dynamic
    # dim (SymInt) — the static dim (8) should be inlined as a literal, not
    # threaded as a placeholder.
    consumer = split_items[-1]  # valid since len == 3: [producer, sigmoid, consumer]
    symint_placeholders = [
        n
        for n in consumer.graph.graph.nodes
        if n.op == "placeholder"
        and isinstance(n.meta.get("example_value"), torch.SymInt)
    ]
    static_int_placeholders = [
        n
        for n in consumer.graph.graph.nodes
        if n.op == "placeholder"
        and isinstance(n.meta.get("example_value"), int)
        and not isinstance(n.meta.get("example_value"), torch.SymInt)
    ]
    assert len(symint_placeholders) >= 1, (
        "Consumer should have a SymInt placeholder for the dynamic dim."
    )
    assert len(static_int_placeholders) == 0, (
        "Static dims should be inlined as literals, not threaded as placeholders."
    )

    # The producer subgraph must compile on its own; this call is expected to
    # raise if a torch.Size still leaks out of it.
    submod_0 = split_gm.submod_0
    standalone_compile(
        submod_0, [torch.randn(4, 8), 4], dynamic_shapes="from_example_inputs"
    )
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_size_used_in_multiple_consumer_subgraphs():
    """
    Validates that x.size() (whole shape) used by multiple downstream subgraphs
    does not cause torch.Size to cross split boundaries.

    Model:
        shape = x.size()        # whole shape — must not cross as torch.Size
        z1 = sigmoid(x)         # split point 1
        y1 = y.view(shape)      # consumer 1 uses shape
        z2 = sigmoid(z1)        # split point 2
        y2 = y.view(shape)      # consumer 2 uses shape again

    Without the fix, torch.Size crosses the boundary as a submodule output,
    which aot_autograd / standalone_compile rejects.
    """
    captured_graph = None
    captured_inputs = None

    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
        # Record both the traced graph and the inputs dynamo passes to it, so
        # the split module can later be called with the same arguments.
        nonlocal captured_graph, captured_inputs
        captured_graph = gm
        captured_inputs = example_inputs
        return gm

    def model_fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        shape = x.size()
        z1 = torch.ops.aten.sigmoid.default(x)
        y1 = y.view(shape)
        z2 = torch.ops.aten.sigmoid.default(z1)
        y2 = y.view(shape)
        return z2 + y1 + y2

    x = torch.randn(4, 8)
    y = torch.randn(4, 8)  # same shape as x so view(shape) doesn't specialize dim 0
    torch._dynamo.mark_dynamic(x, 0)
    torch._dynamo.mark_dynamic(y, 0)
    torch.compile(model_fn, backend=capturing_backend)(x, y)

    # Fail with a clear message if the backend was never invoked, instead of
    # passing None into split_graph or unpacking None below.
    assert captured_graph is not None, "Graph should be captured by backend"
    assert captured_inputs is not None, "Inputs should be captured by backend"

    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])

    splitting_items = [item for item in split_items if item.is_splitting_graph]
    assert len(splitting_items) == 2

    # Verify functional correctness — fails without the fix because torch.Size
    # would cross a split boundary as a submodule output
    output_original = model_fn(x, y)
    output_split = split_gm(*captured_inputs)
    if isinstance(output_split, tuple):
        # The traced module may return a tuple; compare its first tensor.
        output_split = next(o for o in output_split if isinstance(o, torch.Tensor))
    assert torch.allclose(output_original, output_split), "Output mismatch after split"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_sym_size_metadata_propagated():
    """
    Validates that new sym_size.int nodes created by the pre-pass have
    example_value metadata set. Without it, placeholder metadata in consumer
    subgraphs would be None, breaking any code that dynamically builds
    example inputs from metadata (e.g. standalone_compile per-submodule).
    """
    from torch._inductor import standalone_compile

    captured_graph = None

    def capturing_backend(gm: fx.GraphModule, example_inputs: list) -> fx.GraphModule:
        # Record the dynamo-traced graph and run it unmodified.
        nonlocal captured_graph
        captured_graph = gm
        return gm

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        shape = x.size()
        x = torch.ops.aten.sigmoid.default(x)
        x = x.clone().view(shape)
        return x

    x = torch.randn(4, 8)
    torch._dynamo.mark_dynamic(x, 0)
    torch.compile(model_fn, backend=capturing_backend)(x)

    # Fail with a clear message if the backend was never invoked, instead of
    # passing None into split_graph.
    assert captured_graph is not None, "Graph should be captured by backend"

    split_gm, split_items = split_graph(captured_graph, ["aten::sigmoid"])

    # For each submodule, build example inputs purely from placeholder metadata.
    # This fails if example_value is None on any placeholder (i.e. metadata
    # was not propagated to the sym_size.int nodes we created).
    for item in split_items:
        submod = item.graph
        example_inputs = []
        for n in submod.graph.nodes:
            if n.op != "placeholder":
                continue
            ev = n.meta.get("example_value")
            assert ev is not None, (
                f"Placeholder '{n.name}' in {item.submod_name} has no "
                "example_value metadata. sym_size.int nodes must propagate "
                "metadata so consumer subgraphs can be introspected."
            )
            if isinstance(ev, torch.Tensor):
                # Materialize a concrete tensor with the recorded shape.
                example_inputs.append(torch.randn(*(int(d) for d in ev.shape)))
            else:
                # Non-tensor example values are concretized to plain ints.
                example_inputs.append(int(ev))
        standalone_compile(submod, example_inputs, dynamic_shapes="from_example_inputs")
......@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.
import multiprocessing as mp
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from ..utils import fork_new_process_for_each_test
MODEL = "microsoft/Phi-tiny-MoE-instruct"
......@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
assert counters["aot_autograd"]["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
@fork_new_process_for_each_test
@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
# Cold start in a forked child (must fork before CUDA init).
# This model has 32 identical transformer layers which produce
......@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
num_compiled_artifacts_saved=0,
):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
# MEGA_AOT_ARTIFACT is enabled, so we expect no aot_autograd running on
# subgraphs.
assert counters["aot_autograd"]["total"] == 0
else:
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0
......
......@@ -6,9 +6,6 @@ from copy import deepcopy
from tblib import pickling_support
# Import fixture
from tests.v1.entrypoints.conftest import sample_json_schema # noqa
# ruff: noqa
# Install support for pickling exceptions so that we can nicely propagate
......@@ -81,6 +78,55 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
@pytest.fixture
def sample_json_schema():
    """Return a JSON schema for a person record with nested work history.

    The schema is assembled from named sub-schemas; every call builds fresh
    dictionaries, so callers may mutate the result freely.
    """
    # One entry of the "work_history" array.
    work_history_entry = {
        "type": "object",
        "properties": {
            "company": {"type": "string"},
            "duration": {"type": "number", "minimum": 0.0, "maximum": 100.0},
            "position": {"type": "string"},
        },
        "required": ["company", "duration", "position"],
        "additionalProperties": False,
    }

    # Top-level fields of the record, in declaration order.
    record_properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {"type": "array", "items": {"type": "string"}},
        "grade": {"type": "string", "pattern": "^[A-D]$"},
        "email": {
            "type": "string",
            "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
        },
        "work_history": {
            "type": "array",
            "items": work_history_entry,
            "minItems": 0,
            "maxItems": 3,
        },
    }

    schema = {
        "type": "object",
        "properties": record_properties,
        "required": ["name", "age", "skills", "grade", "email", "work_history"],
        "additionalProperties": False,
        "minProperties": 1,
        "maxProperties": 10,
    }
    return schema
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
from tests.entrypoints.openai.chat_completion.test_oot_registration import (
run_and_test_dummy_opt_api_server,
)
def test_distributed_oot(dummy_opt_path: str):
......
......@@ -5,6 +5,7 @@ import numpy as np
import pytest
import torch
from vllm.distributed.eplb.eplb_state import compute_logical_maps
from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
......@@ -24,9 +25,10 @@ def test_basic_rebalance():
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
log2phy, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify output shapes
assert phy2log.shape == (
......@@ -78,9 +80,10 @@ def test_single_gpu_case():
num_nodes = 1
num_gpus = 1
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
log2phy, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify shapes
assert phy2log.shape == (1, 4)
......@@ -100,9 +103,10 @@ def test_equal_weights():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify shapes
assert phy2log.shape == (1, 8)
......@@ -123,9 +127,10 @@ def test_extreme_weight_imbalance():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify shapes
assert phy2log.shape == (1, 12)
......@@ -151,9 +156,10 @@ def test_multiple_layers():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify shapes
assert phy2log.shape == (3, 8)
......@@ -176,7 +182,8 @@ def test_parameter_validation():
# Test non-divisible case - this should handle normally without throwing
# errors because the function will fall back to global load balancing
# strategy
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
phy2log = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
assert phy2log.shape == (1, 8)
assert logcnt.shape == (1, 4)
......@@ -198,9 +205,10 @@ def test_small_scale_hierarchical():
num_nodes = 2 # 2 nodes
num_gpus = 4 # 4 GPUs
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Verify basic constraints
assert phy2log.shape == (1, 12)
......@@ -225,9 +233,10 @@ def test_global_load_balance_fallback():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Should work normally, just using global load balancing strategy
assert phy2log.shape == (1, 8)
......@@ -247,9 +256,10 @@ def test_device_compatibility(device):
num_nodes = 1
num_gpus = 2
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
_, logcnt = compute_logical_maps(phy2log, weight.shape[-1])
# Function will convert to CPU internally, but should handle different
# device inputs normally
......@@ -264,9 +274,8 @@ def test_additional_cases():
weight1 = torch.tensor(
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
)
phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
weight1, 24, 8, 4, 8
)
phy2log1 = DefaultEplbPolicy.rebalance_experts(weight1, 24, 8, 4, 8)
_, logcnt1 = compute_logical_maps(phy2log1, weight1.shape[-1])
assert phy2log1.shape == (1, 24)
assert logcnt1.shape == (1, 16)
......@@ -279,9 +288,8 @@ def test_additional_cases():
[12, 25, 50, 100, 150, 200], # Increasing weights
]
)
phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
weight2, 10, 3, 1, 2
)
phy2log2 = DefaultEplbPolicy.rebalance_experts(weight2, 10, 3, 1, 2)
_, logcnt2 = compute_logical_maps(phy2log2, weight2.shape[-1])
assert phy2log2.shape == (2, 10)
assert logcnt2.shape == (2, 6)
......@@ -292,6 +300,42 @@ def test_additional_cases():
assert logcnt2[layer, max_weight_idx] >= 2
def test_compute_logical_maps_with_negative_indices():
    """
    Check that compute_logical_maps copes with physical slots marked -1
    (unused slots).
    """
    num_layers = 2
    num_logical_experts = 4

    # Each layer has 6 physical slots hosting the 4 logical experts once each.
    # Unused slots (-1): slots 2 and 5 in layer 0, slots 1 and 5 in layer 1.
    phy2log = torch.tensor(
        [
            [0, 1, -1, 2, 3, -1],
            [3, -1, 2, 1, 0, -1],
        ]
    )

    log2phy, logcnt = compute_logical_maps(phy2log, num_logical_experts)

    # Shapes: one replica count per (layer, expert); a single replica column.
    assert logcnt.shape == (num_layers, num_logical_experts)
    assert log2phy.shape == (num_layers, num_logical_experts, 1)

    # Every logical expert appears exactly once per layer.
    expected_logcnt = torch.ones(num_layers, num_logical_experts, dtype=phy2log.dtype)
    assert torch.all(logcnt == expected_logcnt), (
        f"Expected that all replica counts == 1, got {logcnt}"
    )

    assert torch.all(log2phy >= 0), (
        "log2phy should only contain valid physical indices, not -1"
    )

    # Layer 0: logical experts 0..3 sit in physical slots 0, 1, 3 and 4.
    for logical_idx, physical_slot in enumerate((0, 1, 3, 4)):
        assert log2phy[0, logical_idx, 0] == physical_slot
if __name__ == "__main__":
weight = torch.tensor(
[
......@@ -305,7 +349,7 @@ if __name__ == "__main__":
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
phy2log = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
print(phy2log)
......@@ -434,9 +478,10 @@ def test_preserve_intragpu_slots(
"""Experts that stay on a GPU keep their old slots; incoming not lost."""
phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(new_phy2log)
post_phy2log, post_phy_replicas_idx = DefaultEplbPolicy.preserve_intragpu_slots(
new_phy2log, phy_replicas_idx, num_ranks, old_phy2log
post_phy2log = DefaultEplbPolicy.preserve_intragpu_slots(
new_phy2log, num_ranks, old_phy2log
)
post_phy_replicas_idx = _make_phy_replicas_idx_from_phy2log(post_phy2log)
# Shapes preserved
assert post_phy2log.shape == new_phy2log.shape
......
......@@ -319,9 +319,6 @@ def _compare_tp(
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = None
else:
......
......@@ -6,10 +6,10 @@ Unit tests for engine classes (parsing, validation, registry).
Integration tests for NCCL and IPC weight transfer between processes using Ray.
"""
import base64
import pickle
from unittest.mock import MagicMock
import pybase64 as base64
import pytest
import ray
import torch
......
......@@ -5,7 +5,7 @@ import anthropic
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
......
......@@ -4,12 +4,11 @@ import weakref
import pytest
from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.sampling_params import SamplingParams
from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function")
def text_llm():
......
......@@ -6,13 +6,12 @@ import logging
import pytest
import regex as re
from tests.entrypoints.openai.chat_completion.test_vision import TEST_IMAGE_ASSETS
from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics import loggers as stat_loggers
from vllm.v1.metrics.reader import Counter, Metric
from ..openai.test_vision import TEST_IMAGE_ASSETS
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
return [
......
......@@ -24,6 +24,108 @@ from vllm.sampling_params import (
StructuredOutputsParams,
)
SAMPLE_REGEX = (
r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
# Note: Ensure this only uses attributes compatible with xgrammar
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {
"type": "string",
},
},
"grade": {
"type": "string",
"pattern": "^[A-D]$", # Regex pattern
},
"email": {
"type": "string",
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {"type": "string"},
"duration": {
"type": "number",
"minimum": 0.0,
"maximum": 100.0, # Numeric range
},
"position": {"type": "string"},
},
"required": ["company", "duration", "position"],
"additionalProperties": False,
},
"minItems": 0,
"maxItems": 3,
},
},
"required": ["name", "age", "skills", "grade", "email", "work_history"],
"additionalProperties": False,
"minProperties": 1,
"maxProperties": 10,
}
# A schema unsupported by xgrammar
UNSUPPORTED_JSON_SCHEMA = {
"type": "object",
"properties": {
"score": {
"type": "integer",
"multipleOf": 5, # Numeric multiple
},
"tags": {
"type": "array",
"items": {"type": "string", "minLength": 10, "maxLength": 20},
},
},
"required": ["score", "tags"],
"additionalProperties": False,
"patternProperties": {
"^score$": {"type": "integer"},
},
}
SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [
"Python",
"Java",
"JavaScript",
"C++",
"C#",
"PHP",
"TypeScript",
"Ruby",
"Swift",
"Kotlin",
]
SAMPLE_SQL_EBNF = """
root ::= select_statement
select_statement ::= "SELECT" column "from" table "where" condition
column ::= "col_1" | "col_2"
table ::= "table_1" | "table_2"
condition ::= column "=" number
number ::= "1" | "2"
"""
SAMPLE_SQL_LARK = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
NGRAM_SPEC_CONFIG = {
"model": "[ngram]",
"num_speculative_tokens": 5,
......@@ -110,17 +212,17 @@ class CarDescription(BaseModel):
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
)
def test_structured_output(
sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str,
sample_sql_lark: str,
sample_regex: str,
sample_structured_outputs_choices: str,
backend: str,
tokenizer_mode: str,
model_name: str,
speculative_config: dict[str, Any],
):
sample_json_schema = SAMPLE_JSON_SCHEMA
unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
sample_sql_ebnf = SAMPLE_SQL_EBNF
sample_sql_lark = SAMPLE_SQL_LARK
sample_regex = SAMPLE_REGEX
sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
......@@ -702,10 +804,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
unsupported_json_schema: dict[str, Any],
model_name: str,
tokenizer_mode: str,
):
unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
llm = LLM(
model=model_name,
max_model_len=1024,
......@@ -808,9 +910,9 @@ def test_guidance_no_additional_properties():
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests(
sample_json_schema: dict[str, Any],
backend: str,
):
sample_json_schema = SAMPLE_JSON_SCHEMA
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment