Merge tag 'v0.8.5' into v0.8.5-dev

dcb5624a · zhuwenwen · 55880ca2 · ba41cc90 · dcb5624a · dcb5624a
Commit dcb5624a authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+import pytest
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+@pytest.mark.benchmark
+def test_bench_latency():
+    """Run `vllm bench latency` as a subprocess and require exit code 0."""
+    cli_args = ["vllm", "bench", "latency", "--model", MODEL_NAME]
+    cli_args += ["--input-len", "32", "--output-len", "1"]
+    cli_args += ["--enforce-eager", "--load-format", "dummy"]
+    proc = subprocess.run(cli_args, capture_output=True, text=True)
+    # Echo both captured streams so they show up in the pytest report.
+    for stream in (proc.stdout, proc.stderr):
+        print(stream)
+    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+import pytest
+from ..utils import RemoteOpenAIServer
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+@pytest.fixture(scope="module")
+def server():
+    """Yield a RemoteOpenAIServer for MODEL_NAME, shared across the module.
+
+    The server's context manager is exited when the fixture is torn down.
+    """
+    server_args = ["--max-model-len", "1024"]
+    server_args += ["--enforce-eager", "--load-format", "dummy"]
+    with RemoteOpenAIServer(MODEL_NAME, server_args) as running_server:
+        yield running_server
+@pytest.mark.benchmark
+def test_bench_serve(server):
+    """Run `vllm bench serve` against the `server` fixture; require exit 0."""
+    # Point the benchmark client at the host/port of the running fixture.
+    endpoint = ["--host", server.host, "--port", str(server.port)]
+    workload = [
+        "--random-input-len", "32", "--random-output-len", "4",
+        "--num-prompts", "5"
+    ]
+    cli_args = ["vllm", "bench", "serve", "--model", MODEL_NAME]
+    cli_args += endpoint + workload
+    proc = subprocess.run(cli_args, capture_output=True, text=True)
+    # Echo both captured streams so they show up in the pytest report.
+    for stream in (proc.stdout, proc.stderr):
+        print(stream)
+    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"
--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+import pytest
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+@pytest.mark.benchmark
+def test_bench_throughput():
+    """Run `vllm bench throughput` as a subprocess and require exit code 0."""
+    cli_args = ["vllm", "bench", "throughput", "--model", MODEL_NAME]
+    cli_args += ["--input-len", "32", "--output-len", "1"]
+    cli_args += ["--enforce-eager", "--load-format", "dummy"]
+    proc = subprocess.run(cli_args, capture_output=True, text=True)
+    # Echo both captured streams so they show up in the pytest report.
+    for stream in (proc.stdout, proc.stderr):
+        print(stream)
+    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
        ("facebook/opt-125m", {}),
        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
            "dtype": torch.float16,
-            "quantization": "compressed-tensors"
        }),
        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
            "dtype": torch.float16,
-            "quantization": "compressed-tensors"
-        }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -11,7 +11,7 @@ from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
                                     kFp8DynamicTokenSym, kFp8StaticTensorSym)
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import CompilationConfig
+from vllm.config import CompilationConfig, VllmConfig
 from .backend import TestBackend
 from ..utils import models_path_prefix
@@ -51,13 +51,15 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
                               do_fusion: bool):
    torch.set_default_device("cuda")
-    config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
+    vllm_config = VllmConfig()
-                                          enable_noop=True)
+    vllm_config.compilation_config = CompilationConfig(pass_config= \
-    noop_pass = NoOpEliminationPass(config)
+        CompilationConfig.PassConfig(enable_fusion=do_fusion,
-    fusion_pass = FusionPass.instance(config)
+                                          enable_noop=True))
+    noop_pass = NoOpEliminationPass(vllm_config)
+    fusion_pass = FusionPass.instance(vllm_config)
    passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
-    func_pass = FixFunctionalizationPass(config)
+    func_pass = FixFunctionalizationPass(vllm_config)
    backend_func = TestBackend(*passes, func_pass)
    backend_no_func = TestBackend(*passes)

--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
+    vllm_config.compilation_config.pass_config = \
+            CompilationConfig.PassConfig(enable_fusion=True,
+                                              enable_noop=True)
    with vllm.config.set_current_vllm_config(vllm_config):
        # Reshape pass is needed for the fusion pass to work
-        config = CompilationConfig.PassConfig(enable_fusion=True,
+        noop_pass = NoOpEliminationPass(vllm_config)
-                                              enable_noop=True)
+        fusion_pass = FusionPass.instance(vllm_config)
-        noop_pass = NoOpEliminationPass(config)
-        fusion_pass = FusionPass.instance(config)
        backend = TestBackend(noop_pass, fusion_pass)
        model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)

--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
@@ -6,7 +6,7 @@ import torch
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
-from vllm.config import CompilationConfig
+from vllm.config import VllmConfig
 # dummy custom pass that doesn't inherit
@@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
 # Should fail to add directly to the pass manager
 def test_bad_callable():
-    config = CompilationConfig().pass_config
+    config = VllmConfig()
    pass_manager = PostGradPassManager()
    pass_manager.configure(config)
@@ -43,7 +43,7 @@ class ProperPass(InductorPass):
    ],
 )
 def test_pass_manager_uuid(callable):
-    config = CompilationConfig().pass_config
+    config = VllmConfig()
    pass_manager = PostGradPassManager()
    pass_manager.configure(config)
@@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
    # UUID should be different due to config change
    config2 = copy.deepcopy(config)
-    config2.enable_fusion = not config2.enable_fusion
+    config2.compilation_config.pass_config.enable_fusion = not \
+        config2.compilation_config.pass_config.enable_fusion
    pass_manager3 = PostGradPassManager()
    pass_manager3.configure(config2)
    pass_manager3.add(callable)

--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import vllm.envs as envs
+from vllm.compilation.fix_functionalization import FixFunctionalizationPass
+from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
+                                       find_specified_fn,
+                                       find_specified_fn_maybe, is_func)
+from vllm.compilation.sequence_parallelism import SequenceParallelismPass
+from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
+                         VllmConfig)
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.platforms import current_platform
+from vllm.utils import update_environment_variables
+from ..utils import multi_gpu_test
+from .backend import TestBackend
+OPS_IN_MODEL_BEFORE = [
+    torch.ops.vllm.all_reduce.default,
+]
+OPS_IN_MODEL_AFTER = [
+    torch.ops.vllm.reduce_scatter.default,
+    torch.ops.vllm.all_gather.default,
+]
+OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+class TestModel(torch.nn.Module):
+    """Small module: projection matmul, TP all-reduce, then RMSNorm."""
+    def __init__(self, hidden_size=16, intermediate_size=32):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Weight is stored as (intermediate_size, hidden_size).
+        weight = torch.empty((intermediate_size, hidden_size))
+        self.gate_proj = torch.nn.Parameter(weight)
+        self.norm = RMSNorm(hidden_size, 1e-05)
+        # Fill the projection weight with small normal noise.
+        torch.nn.init.normal_(self.gate_proj, std=0.02)
+    def forward(self, hidden_states, residual):
+        """
+        Project, all-reduce across tensor-parallel ranks, then normalize.
+        Args:
+            hidden_states: Input tensor, flattened to (-1, hidden_size)
+            residual: Residual tensor passed to the norm layer
+        Returns:
+            The two values produced by the norm layer, as a tuple
+        """
+        flat = hidden_states.reshape(-1, self.hidden_size)
+        # NOTE(review): the product has intermediate_size columns while the
+        # norm was built with hidden_size - confirm this is intended.
+        projected = torch.mm(flat, self.gate_proj.permute(1, 0))
+        reduced = tensor_model_parallel_all_reduce(projected)
+        normed, new_residual = self.norm(reduced, residual)
+        return normed, new_residual
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [16])
+@pytest.mark.parametrize("hidden_size", [16])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
+                    reason="Only test on CUDA")
+def test_sequence_parallelism_pass(batch_size: int, seq_len: int,
+                                   hidden_size: int, dtype: torch.dtype):
+    """Spawn two worker processes that each run the per-rank pass check."""
+    world_size = 2
+    worker_args = (world_size, batch_size, seq_len, hidden_size, dtype)
+    # torch.multiprocessing.spawn is used on purpose: other launchers run
+    # into problems with torch.distributed and cuda.
+    torch.multiprocessing.spawn(sequence_parallelism_pass_on_test_model,
+                                args=worker_args,
+                                nprocs=world_size)
+def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
+                                            batch_size: int, seq_len: int,
+                                            hidden_size: int,
+                                            dtype: torch.dtype):
+    """Per-rank worker: compile TestModel and check the rewritten graph.
+
+    Sets up the distributed environment for this rank, compiles TestModel
+    once with SequenceParallelismPass alone and once followed by
+    FixFunctionalizationPass, then asserts on the ops present in the
+    graphs recorded by the test backends.
+
+    Args:
+        local_rank: Index of this process; also selects the CUDA device.
+        world_size: Number of processes; used as the tensor-parallel size.
+        batch_size: Multiplied by seq_len to get the number of input rows.
+        seq_len: See batch_size.
+        hidden_size: Width of the random input and residual tensors.
+        dtype: Default dtype set for this process and used for the inputs.
+    """
+    current_platform.seed_everything(0)
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '12345',
+    })
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    # configure vllm config for SequenceParallelismPass
+    vllm_config = VllmConfig()
+    vllm_config.compilation_config = CompilationConfig(
+        pass_config=CompilationConfig.PassConfig(
+            enable_sequence_parallelism=True, ), )
+    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+    # this is a fake model name to construct the model config
+    # in the vllm_config, it's not really used.
+    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
+    vllm_config.model_config = ModelConfig(model=model_name,
+                                           task="auto",
+                                           tokenizer=model_name,
+                                           tokenizer_mode="auto",
+                                           trust_remote_code=True,
+                                           dtype=dtype,
+                                           seed=42)
+    sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
+    backend_no_func = TestBackend(sequence_parallelism_pass)
+    func_pass = FixFunctionalizationPass(vllm_config)
+    backend_func = TestBackend(sequence_parallelism_pass, func_pass)
+    model = TestModel(hidden_size, hidden_size * 2)
+    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
+                                dtype=dtype)
+    residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
+    compiled_model_no_func = torch.compile(model, backend=backend_no_func)
+    compiled_model_no_func(hidden_states, residual)
+    compiled_model_func = torch.compile(model, backend=backend_func)
+    compiled_model_func(hidden_states, residual)
+    # Check substitution worked
+    pre_nodes = backend_no_func.graph_pre_pass.nodes
+    post_nodes = backend_no_func.graph_post_pass.nodes
+    # In pre-nodes, all reduce should be there,
+    # reduce scatter and all gather should not
+    for op in OPS_IN_MODEL_BEFORE:
+        find_specified_fn(pre_nodes, op)
+    for op in OPS_IN_MODEL_AFTER:
+        assert find_specified_fn_maybe(pre_nodes, op) is None
+    # In post-nodes, reduce scatter and all gather should be there,
+    # all reduce should not
+    for op in OPS_IN_MODEL_AFTER:
+        find_specified_fn(post_nodes, op)
+    for op in OPS_IN_MODEL_BEFORE:
+        assert find_specified_fn_maybe(post_nodes, op) is None
+    # check if the functionalization pass is applied
+    for op in OPS_IN_MODEL:
+        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
+        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
+                                  op) is None  # noqa: E501
+    # make sure the ops were all de-functionalized; a missing op must fail
+    # the assertion with a readable message rather than raise KeyError
+    func_post_nodes = list(backend_func.graph_post_pass.nodes)
+    for op in OPS_IN_MODEL:
+        assert any(is_func(node, op) for node in func_post_nodes), (
+            f"{op} was not found in the de-functionalized graph")
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,23 +24,24 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
 from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
+from vllm.config import TaskOption, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         TokensPrompt, to_enc_dec_tuple_list,
+                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
-                         zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import cuda_device_count_stateless, is_list_of
+from vllm.utils import cuda_device_count_stateless
 from .utils import models_path_prefix
 logger = init_logger(__name__)
 _TEST_DIR = os.path.dirname(__file__)
@@ -109,10 +110,25 @@ class _VideoAssets(_VideoAssetsBase):
        return [prompts["sample_demo_1"]]
+class _AudioAssetsBase(UserList[AudioAsset]):
+    """UserList specialised to AudioAsset items; adds no behaviour."""
+class _AudioAssets(_AudioAssetsBase):
+    """List of the two audio assets built in ``__init__``."""
+    def __init__(self) -> None:
+        asset_names = ("mary_had_lamb", "winning_call")
+        super().__init__([AudioAsset(name) for name in asset_names])
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
 VIDEO_ASSETS = _VideoAssets()
 """Singleton instance of :class:`_VideoAssets`."""
+AUDIO_ASSETS = _AudioAssets()
+"""Singleton instance of :class:`_AudioAssets`."""
 @pytest.fixture(scope="function", autouse=True)
@@ -269,6 +285,11 @@ def video_assets() -> _VideoAssets:
    return VIDEO_ASSETS
+@pytest.fixture(scope="session")
+def audio_assets() -> _AudioAssets:
+    """Session-scoped fixture returning the module-level AUDIO_ASSETS."""
+    return AUDIO_ASSETS
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 _R = TypeVar("_R")
@@ -396,10 +417,15 @@ class HfRunner:
                processor_kwargs["images"] = image
            if videos is not None and (video := videos[i]) is not None:
                processor_kwargs["videos"] = video
-            if audios is not None and (audio_tuple := audios[i]) is not None:
+            if audios is not None and (audio_inputs := audios[i]) is not None:
-                audio, sr = audio_tuple
+                # HACK - not all processors take sampling_rate; we should
-                processor_kwargs["audio"] = audio
+                # clean this up in the future.
-                processor_kwargs["sampling_rate"] = sr
+                if len(audio_inputs) == 2:
+                    audio, sr = audio_inputs
+                    processor_kwargs["audio"] = audio
+                    processor_kwargs["sampling_rate"] = sr
+                else:
+                    processor_kwargs["audio"] = audio_inputs
            inputs = self.processor(**processor_kwargs)
            if isinstance(inputs, BatchFeature):
@@ -474,12 +500,19 @@ class HfRunner:
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
-                                num_return_sequences=beam_width)
+                                num_return_sequences=beam_width,
+                                images=images,
+                                videos=videos,
+                                audios=audios)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            for j in range(len(output_ids)):
@@ -530,7 +563,10 @@ class HfRunner:
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
-                last_hidden_states.to(output_embeddings.weight.device),
+                last_hidden_states.to(
+                    device=output_embeddings.weight.device,
+                    dtype=output_embeddings.weight.dtype,
+                ),
                output_embeddings.weight.t(),
            )
            if getattr(output_embeddings, "bias", None) is not None:
@@ -924,6 +960,7 @@ class VllmRunner:
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
+        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
@@ -931,6 +968,7 @@ class VllmRunner:
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
+            skip_special_tokens=skip_special_tokens,
        )
        '''
        Greedy logprobs generation for vLLM encoder/decoder models
@@ -941,18 +979,20 @@ class VllmRunner:
    def generate_beam_search(
        self,
-        prompts: Union[list[str], list[list[int]]],
+        prompts: list[str],
        beam_width: int,
        max_tokens: int,
+        images: Optional[PromptImageInput] = None,
+        videos: Optional[PromptVideoInput] = None,
+        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
-        if is_list_of(prompts, str, check="all"):
+        inputs = self.get_inputs(prompts,
-            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
+                                 images=images,
-        else:
+                                 videos=videos,
-            prompts = [
+                                 audios=audios)
-                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
-            ]
        outputs = self.model.beam_search(
-            prompts,
+            inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
@@ -1005,20 +1045,6 @@ def vllm_runner():
    return VllmRunner
-def get_tokenizer_pool_config(tokenizer_group_type):
-    if tokenizer_group_type is None:
-        return None
-    if tokenizer_group_type == "ray":
-        return TokenizerPoolConfig(pool_size=1,
-                                   pool_type="ray",
-                                   extra_config={})
-    if isinstance(tokenizer_group_type, type):
-        return TokenizerPoolConfig(pool_size=1,
-                                   pool_type=tokenizer_group_type,
-                                   extra_config={})
-    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
 @pytest.fixture()
 def temporary_enable_log_propagate():
    import logging

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -197,15 +197,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs",
                         [{
-                             "block_size": 8,
+                             "block_size": 16,
                             "max_num_batched_tokens": 2,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 8,
+                             "block_size": 16,
                             "max_num_batched_tokens": 3,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 8,
+                             "block_size": 16,
                             "max_num_batched_tokens": 256,
                             "max_num_seqs": 10,
                         }])

--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -14,7 +14,8 @@ import torch
 from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
+                              tensor_model_parallel_all_reduce,
+                              tensor_model_parallel_reduce_scatter)
 from ..utils import init_test_distributed_environment, multi_process_parallel
@@ -47,6 +48,34 @@ def all_reduce_test_worker(
    torch.testing.assert_close(t, expected)
+@ray.remote(num_gpus=1, max_calls=1)
+def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
+                               pp_size: int, rank: int,
+                               distributed_init_port: str):
+    """Check this rank's reduce-scatter output against a local reference.
+
+    Rank r contributes ``arange(8) * (r + 1)``; the expected result is this
+    rank's slice of the element-wise sum over all ``tp_size`` contributions.
+    """
+    # Drop CUDA_VISIBLE_DEVICES so every worker can see all GPUs and then
+    # select its own device by rank.
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+    num_elements = 8
+    per_rank_inputs = []
+    for r in range(tp_size):
+        base = torch.arange(num_elements, dtype=torch.float32, device="cuda")
+        per_rank_inputs.append(base * (r + 1))
+    index = rank % tp_size
+    partition_size = num_elements // tp_size
+    # Reference: sum all contributions, then keep only this rank's slice.
+    summed = torch.stack(per_rank_inputs, dim=0).sum(dim=0)
+    start = index * partition_size
+    expected = summed[start:start + partition_size]
+    scattered = tensor_model_parallel_reduce_scatter(per_rank_inputs[index], 0)
+    torch.testing.assert_close(scattered, expected)
 @ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(
    monkeypatch: pytest.MonkeyPatch,

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -161,12 +161,12 @@ TEXT_GENERATION_MODELS = {
    os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V2-Lite-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "tiiuae/falcon-7b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "google/gemma-2b"): PPTestSettings.fast(),
+    os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "google/gemma-2-9b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "gpt2"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "bigcode/starcoder"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "EleutherAI/pythia-12b"): PPTestSettings.fast(),
+    os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "ibm/PowerLM-3b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "ibm/PowerMoE-3b"): PPTestSettings.fast(),
    # Uses Llama
@@ -195,7 +195,7 @@ TEXT_GENERATION_MODELS = {
    os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"): PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
    os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"): PPTestSettings.fast(),
+    os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): PPTestSettings.fast(),

--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+WARNING: This test runs in both single-node (4 GPUs) and multi-node
+ (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
+ important to set the distributed backend to "mp" to avoid Ray scheduling
+ all workers in a node other than the head node, which can cause the test
+ to fail.
+"""
+import json
+import os
+from dataclasses import dataclass
+from typing import Literal, NamedTuple, Optional
+import pytest
+from vllm.config import TaskOption
+from vllm.logger import init_logger
+from ..models.registry import HF_EXAMPLE_MODELS
+from ..utils import compare_two_settings, create_new_process_for_each_test
+logger = init_logger("test_sequence_parallel")
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+class ParallelSetup(NamedTuple):
+    """One parallel configuration exercised by the sequence-parallel tests."""
+    # Value passed as --tensor-parallel-size.
+    tp_size: int
+    # Value placed in the compilation pass_config for sequence parallelism.
+    sp_enabled: bool
+    # When true, --enforce-eager is added to the server arguments.
+    eager_mode: bool
+    # When true, --enable-chunked-prefill is added to the server arguments.
+    chunked_prefill: bool
+class SPTestOptions(NamedTuple):
+    """Per-model options that control skipping and weight loading."""
+    # When true, the test is skipped unless VLLM_MULTI_NODE is enabled.
+    multi_node_only: bool
+    # Forwarded as --load-format when set; "dummy" also shrinks the model
+    # through hf_overrides.
+    load_format: Optional[str] = None
+@dataclass
+class SPTestSettings:
+    """Test matrix for one model: parallel setups crossed with backends.
+    Use the ``detailed`` or ``fast`` factories to build an instance and
+    ``iter_params`` to expand it into pytest parameter tuples.
+    """
+    parallel_setups: list[ParallelSetup]
+    # NOTE: the length of distributed_backends and
+    # vllm_major_versions should be the same, and they
+    # are first zipped together to iterate over all
+    # test settings.
+    distributed_backends: list[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: list[str]
+    task: TaskOption
+    test_options: SPTestOptions
+    def __post_init__(self):
+        """Reject settings whose backend and version lists differ in length.
+        Raises:
+            ValueError: if the two lists cannot be zipped one-to-one.
+        """
+        if len(self.distributed_backends) != len(self.vllm_major_versions):
+            raise ValueError(
+                f"Length mismatch: distributed_backends "
+                f"({len(self.distributed_backends)}) != "
+                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+    @staticmethod
+    def detailed(
+        *,
+        tp_base: int = 2,
+        multi_node_only: bool = False,
+        task: TaskOption = "auto",
+        load_format: Optional[str] = None,
+    ):
+        """Build settings covering all four eager/chunked-prefill combinations.
+        Every setup uses ``tp_base`` as the TP size with SP enabled, and is
+        paired with the "mp" and "ray" backends, both on major version "1".
+        """
+        return SPTestSettings(
+            parallel_setups=[
+                ParallelSetup(tp_size=tp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base,
+                              sp_enabled=True,
+                              eager_mode=True,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              sp_enabled=True,
+                              eager_mode=True,
+                              chunked_prefill=True)
+            ],
+            distributed_backends=["mp", "ray"],
+            vllm_major_versions=["1", "1"],
+            task=task,
+            test_options=SPTestOptions(multi_node_only=multi_node_only,
+                                       load_format=load_format),
+        )
+    @staticmethod
+    def fast(
+        *,
+        tp_base: int = 2,
+        task: TaskOption = "auto",
+        multi_node_only: bool = False,
+        load_format: Optional[str] = None,
+    ):
+        """Build settings with a single setup: SP on, no eager, no chunking.
+        Uses the same backend/version pairs as ``detailed``.
+        """
+        return SPTestSettings(
+            parallel_setups=[
+                ParallelSetup(tp_size=tp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=False),
+            ],
+            distributed_backends=["mp", "ray"],
+            vllm_major_versions=["1", "1"],
+            task=task,
+            test_options=SPTestOptions(multi_node_only=multi_node_only,
+                                       load_format=load_format),
+        )
+    def iter_params(self, model_id: str):
+        """Yield one parameter tuple per (parallel setup, backend/version).
+        Each tuple is ``(model_id, parallel_setup, backend,
+        vllm_major_version, task, test_options)``.
+        """
+        opts = self.test_options
+        for parallel_setup in self.parallel_setups:
+            # Backends and versions are paired positionally, not crossed.
+            for backend, vllm_major_version in zip(self.distributed_backends,
+                                                   self.vllm_major_versions):
+                yield (model_id, parallel_setup, backend, vllm_major_version,
+                       self.task, opts)
+def _compare_sp(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    task: TaskOption,
+    test_options: SPTestOptions,
+    num_gpus_available: int,
+    *,
+    method: Literal["generate", "encode"],
+    is_multimodal: bool,
+):
+    """Compare TP with the sequence-parallel compile config against plain TP.
+    Two server argument lists are built for ``model_id``: one that adds the
+    compilation config (including the sequence-parallelism pass setting) and
+    uses ``distributed_backend``, and one that only sets tensor parallelism
+    on the "mp" backend. Both are handed to ``compare_two_settings``.
+    The test is skipped when fewer than ``tp_size`` GPUs are available, when
+    the "mp" backend is requested in a multi-node run, or when the options
+    are multi-node only and VLLM_MULTI_NODE is not enabled.
+    """
+    (
+        tp_size,
+        sp_enabled,
+        eager_mode,
+        chunked_prefill,
+    ) = parallel_setup
+    multi_node_only, load_format = test_options
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+    trust_remote_code = model_info.trust_remote_code
+    tokenizer_mode = model_info.tokenizer_mode
+    # Work on a copy so the overrides added below do not mutate the object
+    # returned by the registry lookup.
+    hf_overrides = dict(model_info.hf_overrides or {})
+    if load_format == "dummy":
+        # Avoid OOM
+        text_overrides = {
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
+        }
+        if is_multimodal:
+            hf_overrides.update({"text_config": text_overrides})
+        else:
+            hf_overrides.update(text_overrides)
+    else:
+        model_info.check_available_online(on_fail="skip")
+    # This test does not use pipeline parallelism.
+    pp_size = 1
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+    if VLLM_MULTI_NODE and distributed_backend == "mp":
+        pytest.skip("Skipping multi-node sequence parallel test for "
+                    "multiprocessing distributed backend")
+    if multi_node_only and not VLLM_MULTI_NODE:
+        pytest.skip("Not in multi-node setting")
+    common_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
+    if eager_mode:
+        common_args.append("--enforce-eager")
+    if task != "auto":
+        common_args.extend(["--task", task])
+    if trust_remote_code:
+        common_args.append("--trust-remote-code")
+    if tokenizer_mode:
+        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+    if load_format:
+        common_args.extend(["--load-format", load_format])
+    if hf_overrides:
+        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    compilation_config = {
+        'level': 3,
+        'custom_ops': ["+rms_norm"],
+        'compile_sizes': [4, 8],
+        'splitting_ops': [],
+        'pass_config': {
+            'enable_sequence_parallism': sp_enabled,
+            'enable_noop': True,
+            'enable_fusion': True,
+        },
+    }
+    # Both runs select the same engine version; each gets its own dict.
+    tp_sp_env = {
+        "VLLM_USE_V1": vllm_major_version,
+    }
+    tp_sp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+        "--compilation_config",
+        # The config is passed as the string form of the dict.
+        str(compilation_config),
+    ]
+    tp_env = {
+        "VLLM_USE_V1": vllm_major_version,
+    }
+    # The baseline always runs on the "mp" backend without the compile config.
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+    try:
+        compare_two_settings(model_id,
+                             tp_sp_args,
+                             tp_args,
+                             tp_sp_env,
+                             tp_env,
+                             method=method)
+    except Exception:
+        if vllm_major_version == "0":
+            # Ray Compiled Graph tests are flaky for V0,
+            # so we don't want to fail the test
+            logger.exception("Ray Compiled Graph tests failed")
+        else:
+            raise
+# Maps each model id to the settings expanded into test parameters.
+SP_TEXT_GENERATION_MODELS = {
+    # [Decoder-only]
+    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(),
+}
+# Only models listed here are parametrized; entries of the mapping above
+# that are missing from this list are filtered out.
+SP_TEST_MODELS = [
+    # TODO support other models
+    # [LANGUAGE GENERATION]
+    "meta-llama/Llama-3.2-1B-Instruct",
+]
+@pytest.mark.parametrize(
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "task", "test_options"),
+    [
+        params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in SP_TEST_MODELS
+    ],
+)
+@create_new_process_for_each_test()
+def test_tp_sp_generation(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    task: TaskOption,
+    test_options: SPTestOptions,
+    num_gpus_available,
+):
+    """Run the TP+SP vs. TP comparison for text generation models.
+    Delegates to ``_compare_sp`` with ``method="generate"`` and
+    ``is_multimodal=False``.
+    """
+    _compare_sp(model_id,
+                parallel_setup,
+                distributed_backend,
+                vllm_major_version,
+                task,
+                test_options,
+                num_gpus_available,
+                method="generate",
+                is_multimodal=False)
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
 # SPDX-License-Identifier: Apache-2.0
+import json
 from argparse import ArgumentError, ArgumentTypeError
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from typing import Literal, Optional
 import pytest
-from vllm.config import PoolerConfig
+from vllm.config import PoolerConfig, config
-from vllm.engine.arg_utils import EngineArgs, nullable_kvs
+from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
+                                   get_type, is_not_builtin, is_type,
+                                   nullable_kvs, optional_type)
 from vllm.utils import FlexibleArgumentParser
+@pytest.mark.parametrize(("type", "value", "expected"), [
+    (int, "42", 42),
+    (int, "None", None),
+    (float, "3.14", 3.14),
+    (float, "None", None),
+    (str, "Hello World!", "Hello World!"),
+    (str, "None", None),
+    (json.loads, '{"foo":1,"bar":2}', {
+        "foo": 1,
+        "bar": 2
+    }),
+    (json.loads, "foo=1,bar=2", {
+        "foo": 1,
+        "bar": 2
+    }),
+    (json.loads, "None", None),
+])
+def test_optional_type(type, value, expected):
+    """Check that the parser built by ``optional_type`` yields ``expected``.
+    The string "None" is expected to parse to ``None`` for every type.
+    """
+    optional_type_func = optional_type(type)
+    context = nullcontext()
+    # The key=value form is expected to emit a DeprecationWarning.
+    if value == "foo=1,bar=2":
+        context = pytest.warns(DeprecationWarning)
+    with context:
+        assert optional_type_func(value) == expected
+@pytest.mark.parametrize(("type_hint", "type", "expected"), [
+    (int, int, True),
+    (int, float, False),
+    (list[int], list, True),
+    (list[int], tuple, False),
+    (Literal[0, 1], Literal, True),
+])
+def test_is_type(type_hint, type, expected):
+    """Check ``is_type`` for plain, parametrized and Literal type hints."""
+    assert is_type(type_hint, type) == expected
+@pytest.mark.parametrize(("type_hints", "type", "expected"), [
+    ({float, int}, int, True),
+    ({int, tuple[int]}, int, True),
+    ({int, tuple[int]}, float, False),
+    ({str, Literal["x", "y"]}, Literal, True),
+])
+def test_contains_type(type_hints, type, expected):
+    """Check whether ``contains_type`` finds ``type`` in a set of hints."""
+    assert contains_type(type_hints, type) == expected
+@pytest.mark.parametrize(("type_hints", "type", "expected"), [
+    ({int, float}, int, int),
+    ({int, float}, str, None),
+    ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]),
+])
+def test_get_type(type_hints, type, expected):
+    """Check that ``get_type`` returns the matching hint, or None if absent."""
+    assert get_type(type_hints, type) == expected
+# Minimal config dataclass used as input for the is_not_builtin and
+# get_kwargs tests below; each field covers one kind of type hint.
+@config
+@dataclass
+class DummyConfigClass:
+    regular_bool: bool = True
+    """Regular bool with default True"""
+    optional_bool: Optional[bool] = None
+    """Optional bool with default None"""
+    optional_literal: Optional[Literal["x", "y"]] = None
+    """Optional literal with default None"""
+    tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
+    """Tuple with default (1, 2, 3)"""
+    tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2))
+    """Tuple with default (1, 2)"""
+    list_n: list[int] = field(default_factory=lambda: [1, 2, 3])
+    """List with default [1, 2, 3]"""
+@pytest.mark.parametrize(("type_hint", "expected"), [
+    (int, False),
+    (DummyConfigClass, True),
+])
+def test_is_not_builtin(type_hint, expected):
+    """Check ``is_not_builtin`` on a builtin type and on a config class."""
+    assert is_not_builtin(type_hint) == expected
+def test_get_kwargs():
+    """Check the per-field kwargs ``get_kwargs`` builds for DummyConfigClass."""
+    kwargs = get_kwargs(DummyConfigClass)
+    print(kwargs)
+    # bools should not have their type set
+    assert kwargs["regular_bool"].get("type") is None
+    assert kwargs["optional_bool"].get("type") is None
+    # optional literals should have None as a choice
+    assert kwargs["optional_literal"]["choices"] == ["x", "y", "None"]
+    # tuples should have the correct nargs
+    assert kwargs["tuple_n"]["nargs"] == "+"
+    assert kwargs["tuple_2"]["nargs"] == 2
+    # lists should work
+    assert kwargs["list_n"]["type"] is int
+    assert kwargs["list_n"]["nargs"] == "+"
 @pytest.mark.parametrize(("arg", "expected"), [
-    (None, None),
+    (None, dict()),
    ("image=16", {
        "image": 16
    }),
@@ -24,6 +128,10 @@ from vllm.utils import FlexibleArgumentParser
    }),
 ])
 def test_limit_mm_per_prompt_parser(arg, expected):
+    """This functionality is deprecated and will be removed in the future.
+    This argument should be passed as JSON string instead.
+    TODO: Remove with nullable_kvs."""
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    if arg is None:
        args = parser.parse_args([])
@@ -53,12 +161,20 @@ def test_compilation_config():
    assert args.compilation_config.level == 3
    # set to string form of a dict
-    args = parser.parse_args(["--compilation-config", "{'level': 3}"])
+    args = parser.parse_args([
-    assert args.compilation_config.level == 3
+        "--compilation-config",
+        "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}",
+    ])
+    assert (args.compilation_config.level == 3 and
+            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8])
    # set to string form of a dict
-    args = parser.parse_args(["--compilation-config={'level': 3}"])
+    args = parser.parse_args([
-    assert args.compilation_config.level == 3
+        "--compilation-config="
+        "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}",
+    ])
+    assert (args.compilation_config.level == 3 and
+            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8])
 def test_prefix_cache_default():

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -91,3 +91,31 @@ def test_chat_multi_image(image_urls: list[str]):
    }]
    outputs = llm.chat(messages)
    assert len(outputs) >= 0
+def test_llm_chat_tokenization_no_double_bos():
+    """
+    LLM.chat() should not add special tokens when using chat templates.
+    Check we get a single BOS token for llama chat.
+    """
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "Hello!"
+        },
+    ]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
+    # getattr with a default turns a missing attribute into the assertion
+    # failure below instead of an AttributeError.
+    prompt_token_ids = getattr(outputs[0], "prompt_token_ids", None)
+    assert prompt_token_ids is not None
+    bos_token = llm.get_tokenizer().bos_token_id
+    # Ensure we have a single BOS
+    assert prompt_token_ids[0] == bos_token
+    assert prompt_token_ids[1] != bos_token, "Double BOS"
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -308,7 +308,7 @@ def test_disable_guided_decoding_fallback(sample_regex, llm):
    with pytest.raises(
            ValueError,
            match="xgrammar does not support advanced JSON schema features "
-            "like enums, patterns or numeric ranges."):
+            "like string length, item limits, or property bounds."):
        llm.generate(prompts="This should fail",
                     sampling_params=sampling_params,
                     use_tqdm=True)
@@ -386,4 +386,118 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=json_schema)
\ No newline at end of file
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_number_range_json_completion(llm,
+                                             guided_decoding_backend: str):
+    """Generate JSON under a schema with numeric ranges and a regex pattern.
+    Each output must parse as JSON, validate against the schema, and keep
+    ``age``, ``score`` and ``zipcode`` within the declared constraints.
+    """
+    sample_output_schema = {
+        "type": "object",
+        "properties": {
+            "age": {
+                "type": "integer",
+                "minimum": 18,
+                "maximum": 99
+            },
+            "score": {
+                "type": "number",
+                "minimum": 0.0,
+                "maximum": 100.0
+            },
+            "zipcode": {
+                "type": "string",
+                "pattern": r"^\d{5}(-\d{4})?$"
+            },
+        },
+        "required": ["age", "score", "zipcode"],
+    }
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=sample_output_schema,
+                                             backend=guided_decoding_backend),
+    )
+    # The same prompt is submitted twice in one batch.
+    outputs = llm.generate(
+        prompts=[
+            "Create a JSON object for a user with age, score, and zipcode."
+        ] * 2,
+        sampling_params=sampling_params,
+        use_tqdm=True,
+    )
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json, schema=sample_output_schema)
+        # Explicit checks of the same constraints the schema declares.
+        assert 18 <= output_json["age"] <= 99
+        assert 0.0 <= output_json["score"] <= 100.0
+        assert (re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"])
+                is not None)
+@pytest.mark.skip_global_cleanup
+def test_guidance_no_additional_properties(llm):
+    """Compare guidance output with and without no-additional-properties.
+    The schema declares only a1..a3 while the prompt asks for a1..a20. The
+    plain backend is expected to emit extra keys (a4..a6 are checked); with
+    ``no-additional-properties`` those keys must be absent.
+    """
+    schema = {
+        'type': 'object',
+        'properties': {
+            'a1': {
+                'type': 'string'
+            },
+            'a2': {
+                'type': 'string'
+            },
+            'a3': {
+                'type': 'string'
+            }
+        },
+        'required': ['a1', 'a2', 'a3'],
+    }
+    prompt = (
+        "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a "
+        "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a "
+        "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20"
+        "<|im_end|>\n<|im_start|>assistant\n")
+    def generate_with_backend(backend):
+        # Generate once with the given backend string and return the parsed,
+        # schema-validated JSON object.
+        guided_params = GuidedDecodingParams(json=schema, backend=backend)
+        sampling_params = SamplingParams(temperature=0,
+                                         max_tokens=256,
+                                         guided_decoding=guided_params)
+        outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
+        assert outputs is not None
+        generated_text = outputs[0].outputs[0].text
+        assert generated_text is not None
+        parsed_json = json.loads(generated_text)
+        assert isinstance(parsed_json, dict)
+        jsonschema.validate(instance=parsed_json, schema=schema)
+        return parsed_json
+    base_generated = generate_with_backend('guidance:disable-any-whitespace')
+    assert "a1" in base_generated
+    assert "a2" in base_generated
+    assert "a3" in base_generated
+    # by default additional keys are generated
+    assert "a4" in base_generated
+    assert "a5" in base_generated
+    assert "a6" in base_generated
+    generated = generate_with_backend(
+        'guidance:no-additional-properties,disable-any-whitespace')
+    assert "a1" in generated
+    assert "a2" in generated
+    assert "a3" in generated
+    assert "a4" not in generated
+    assert "a5" not in generated
+    assert "a6" not in generated
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -150,6 +150,7 @@ def test_wer_correctness(model_name,
                         expected_wer,
                         n_examples=-1,
                         max_concurrent_request=None):
+    # TODO refactor to use `ASRDataset`
    with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
 # SPDX-License-Identifier: Apache-2.0
+import json
 import openai
 import pytest
 import os
@@ -27,7 +29,7 @@ def server():
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
-        f"audio={MAXIMUM_AUDIOS}",
+        json.dumps({"audio": MAXIMUM_AUDIOS}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -102,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
+async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI,
+                                               model_name: str,
+                                               audio_url: str):
+    """A chat request whose ``audio_url`` is a bare string must be rejected.
+    The request is expected to raise ``openai.BadRequestError``.
+    """
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "audio_url",
+                "audio_url": audio_url
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+    # audio_url should be a dict {"url": "some url"}, not directly a string
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.chat.completions.create(model=model_name,
+                                                 messages=messages,
+                                                 max_completion_tokens=10,
+                                                 temperature=0.0)
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -12,11 +12,13 @@ import requests
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from ...models.embedding.utils import check_embeddings_close
+from ...models.embedding.utils import correctness_test
 from ...utils import RemoteOpenAIServer, models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
+DTYPE = "bfloat16"
 @pytest.fixture(scope="module")
@@ -26,7 +28,7 @@ def server():
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "bfloat16",
+        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
@@ -44,9 +46,17 @@ async def client(server):
        yield async_client
+@pytest.fixture(scope="module")
+def hf_model(hf_runner):
+    """Module-scoped fixture yielding an ``hf_runner`` model for MODEL_NAME.
+    The model is opened with ``dtype=DTYPE`` and
+    ``is_sentence_transformer=True`` and is closed by the context manager
+    once the module's tests are done.
+    """
+    with hf_runner(MODEL_NAME, dtype=DTYPE,
+                   is_sentence_transformer=True) as hf_model:
+        yield hf_model
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
+async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
+                                model_name: str):
    input_texts = [
        "The chef prepared a delicious meal.",
    ]
@@ -67,6 +77,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
    assert embeddings.usage.prompt_tokens == 11
    assert embeddings.usage.total_tokens == 11
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    correctness_test(hf_model, input_texts, vllm_outputs)
    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embedding_response = await client.embeddings.create(
@@ -87,7 +100,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
+async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
+                               model_name: str):
    # test list[str]
    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
@@ -108,6 +122,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
    assert embeddings.usage.prompt_tokens == 33
    assert embeddings.usage.total_tokens == 33
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    correctness_test(hf_model, input_texts, vllm_outputs)
    # test list[list[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
@@ -182,7 +199,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
+async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                      model_name: str):
    input_texts = [
        "Hello my name is",
@@ -193,6 +210,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                                     model=model_name,
                                                     encoding_format="float")
    float_data = [d.embedding for d in responses_float.data]
+    correctness_test(hf_model, input_texts, float_data)
    responses_base64 = await client.embeddings.create(input=input_texts,
                                                      model=model_name,
@@ -203,24 +221,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
            np.frombuffer(base64.b64decode(data.embedding),
                          dtype="float32").tolist())
-    check_embeddings_close(
+    correctness_test(hf_model, input_texts, base64_data)
-        embeddings_0_lst=float_data,
-        embeddings_1_lst=base64_data,
-        name_0="float",
-        name_1="base64",
-    )
    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(input=input_texts,
                                                       model=model_name)
    default_data = [d.embedding for d in responses_default.data]
+    correctness_test(hf_model, input_texts, default_data)
-    check_embeddings_close(
-        embeddings_0_lst=float_data,
-        embeddings_1_lst=default_data,
-        name_0="float",
-        name_1="default",
-    )
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -3,80 +3,121 @@
 Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
 """
-from typing import NamedTuple
+from typing import Optional
 import openai
 import pytest
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
+from ...conftest import HfRunner
+from ...models.embedding.utils import EmbedModelInfo, correctness_test
 from ...utils import RemoteOpenAIServer
-class ModelInfo(NamedTuple):
-    name: str
-    is_matryoshka: bool
 MODELS = [
-    ModelInfo(name="BAAI/bge-m3", is_matryoshka=False),
+    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
-    ModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True),
+    EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
+                   is_matryoshka=True,
+                   matryoshka_dimensions=[256]),
 ]
 input_texts = [
    "The chef prepared a delicious meal.",
-] * 3
+]
-@pytest.mark.asyncio
+@pytest.fixture(scope="module", params=MODELS)
-@pytest.mark.parametrize("model", MODELS)
+def model_info(request):
-async def test_validating_dimensions(model: ModelInfo):
+    return request.param
+@pytest.fixture(scope="module", params=["bfloat16"])
+def dtype(request):
+    return request.param
+@pytest.fixture(scope="module")
+def server(model_info, dtype: str):
    args = [
        "--task",
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "bfloat16",
+        dtype,
        "--enforce-eager",
        "--max-model-len",
-        "512",
+        "512"
-        "--trust_remote_code"
    ]
-    with RemoteOpenAIServer(model.name, args) as remote_server:
-        client = remote_server.get_async_client()
-        async def make_request(dimensions):
-            embedding_response = await client.embeddings.create(
-                model=model.name,
-                input=input_texts,
-                dimensions=dimensions,
-                encoding_format="float",
-            )
-            embeddings = EmbeddingResponse.model_validate(
-                embedding_response.model_dump(mode="json"))
-            assert embeddings.id is not None
-            assert len(embeddings.data) == 3
-            assert len(embeddings.data[0].embedding) > 0
-            assert embeddings.usage.completion_tokens == 0
-            assert embeddings.usage.prompt_tokens > 0
-            assert embeddings.usage.total_tokens > 0
-            if dimensions is not None:
-                assert len(embeddings.data[0].embedding) == dimensions
-        if model.is_matryoshka:
-            for dimensions in [None, 16]:
-                await make_request(dimensions)
+    if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5":
+        # Manually enable Matryoshka Embeddings
+        args.extend([
+            "--trust_remote_code", "--hf_overrides",
+            '{"matryoshka_dimensions":[256]}'
+        ])
+    with RemoteOpenAIServer(model_info.name, args) as remote_server:
+        yield remote_server
+@pytest.fixture(scope="module")
+def hf_model(hf_runner, model_info, dtype: str):
+    with hf_runner(model_info.name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        yield hf_model
+@pytest.mark.asyncio
+async def test_matryoshka(model_info: EmbedModelInfo,
+                          server: RemoteOpenAIServer, hf_model: HfRunner):
+    client = server.get_async_client()
+    async def make_request_and_correctness_test(dimensions):
+        prompts = input_texts * 3
+        embedding_response = await client.embeddings.create(
+            model=model_info.name,
+            input=prompts,
+            dimensions=dimensions,
+            encoding_format="float",
+        )
+        embeddings = EmbeddingResponse.model_validate(
+            embedding_response.model_dump(mode="json"))
+        assert embeddings.id is not None
+        assert len(embeddings.data) == 3
+        assert len(embeddings.data[0].embedding) > 0
+        assert embeddings.usage.completion_tokens == 0
+        assert embeddings.usage.prompt_tokens > 0
+        assert embeddings.usage.total_tokens > 0
+        if dimensions is not None:
+            assert len(embeddings.data[0].embedding) == dimensions
+        vllm_outputs = [d.embedding for d in embeddings.data]
+        correctness_test(hf_model, prompts, vllm_outputs, dimensions)
+    if model_info.is_matryoshka:
+        valid_dimensions: list[Optional[int]] = [None]
+        if model_info.matryoshka_dimensions is not None:
+            valid_dimensions += model_info.matryoshka_dimensions[:2]
+        for dimensions in valid_dimensions:
+            await make_request_and_correctness_test(dimensions)
+        invalid_dimensions: list[Optional[int]] = [-1]
+        if model_info.matryoshka_dimensions is not None:
+            assert 5 not in model_info.matryoshka_dimensions
+            invalid_dimensions.append(5)
+        for dimensions in invalid_dimensions:
            with pytest.raises(openai.BadRequestError):
-                for dimensions in [-1]:
+                await make_request_and_correctness_test(dimensions)
-                    await make_request(dimensions)
-        else:
+    else:
-            for dimensions in [None]:
+        for dimensions in [None]:
-                await make_request(dimensions)
+            await make_request_and_correctness_test(dimensions)
+        for dimensions in [-1, 16]:
            with pytest.raises(openai.BadRequestError):
-                for dimensions in [-1, 16]:
+                await make_request_and_correctness_test(dimensions)
-                    await make_request(dimensions)