Merge tag 'v0.8.4' into v0.8.4-dev

31330101 · zhuwenwen · e8933c34 · dc1b4a6f · 31330101 · 31330101
Commit 31330101 authored Apr 16, 2025 by zhuwenwen
20 changed files
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -15,3 +15,6 @@ torchaudio==2.6.0; platform_machine == "ppc64le"
 torchvision; platform_machine != "ppc64le"  and platform_machine != "s390x"
 torchvision==0.21.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts
+
+# cpu cannot use triton 3.3.0
+triton==3.2.0; platform_machine != "ppc64le"
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -2,7 +2,7 @@
 -r common.txt

 numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61; python_version > '3.9'
+numba == 0.61.2; python_version > '3.9'

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.

--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
@@ -5,6 +5,7 @@
 ray
 triton==3.1.0
 pandas
+numpy==1.26.4
 tabulate
 setuptools>=61
 setuptools-scm>=8

--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -2,7 +2,7 @@
 -r common.txt

 numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61; python_version > '3.9'
+numba == 0.61.2; python_version > '3.9'

 # Dependencies for hcus
 awscli

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -5,6 +5,7 @@ pytest-forked
 pytest-asyncio
 pytest-rerunfailures
 pytest-shard
+pytest-timeout

 # testing utils
 awscli
@@ -27,10 +28,11 @@ torchvision==0.21.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.4 # required for pixtral test
+num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
-transformers==4.51.0
+transformers==4.51.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
 # quantization
 bitsandbytes>=0.45.3
@@ -40,7 +42,7 @@ genai_perf==0.0.8
 tritonclient==2.51.0

 numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
-numba == 0.61; python_version > '3.9'
+numba == 0.61.2; python_version > '3.9'
 numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -101,6 +101,8 @@ dill==0.3.8
    #   multiprocess
 dnspython==2.7.0
    # via email-validator
+docopt==0.6.2
+    # via num2words
 docutils==0.16
    # via awscli
 einops==0.8.0
@@ -263,7 +265,9 @@ networkx==3.2.1
    # via torch
 nltk==3.9.1
    # via rouge-score
-numba==0.61.0
+num2words==0.5.14
+    # via -r requirements/test.in
+numba==0.61.2
    # via
    #   -r requirements/test.in
    #   librosa
@@ -444,6 +448,7 @@ pytest==8.3.3
    #   pytest-mock
    #   pytest-rerunfailures
    #   pytest-shard
+    #   pytest-timeout
 pytest-asyncio==0.24.0
    # via -r requirements/test.in
 pytest-forked==1.6.0
@@ -454,6 +459,8 @@ pytest-rerunfailures==14.0
    # via -r requirements/test.in
 pytest-shard==0.1.2
    # via -r requirements/test.in
+pytest-timeout==2.3.1
+    # via -r requirements/test.in
 python-dateutil==2.9.0.post0
    # via
    #   botocore
@@ -645,7 +652,7 @@ tqdm==4.66.6
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.51.0
+transformers==4.51.1
    # via
    #   -r requirements/test.in
    #   genai-perf

--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -17,10 +17,10 @@ ray[data]
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250403-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

--- a/setup.py
+++ b/setup.py
@@ -563,9 +563,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    
    new_version_content = f"""
 try:
-    __version__ = "0.8.3"
-    __version_tuple__ = (0, 8, 3)
-    __hcu_version__ = f'0.8.3+{version}' 
+    __version__ = "0.8.4"
+    __version_tuple__ = (0, 8, 4)
+    __hcu_version__ = f'0.8.4+{version}' 
    
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-from typing import Any, Union
+from typing import Any, Optional, Union

 import pytest
 import torch
@@ -15,7 +15,7 @@ from vllm.platforms import current_platform
 from ..utils import create_new_process_for_each_test


-def models_list(all: bool):
+def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -32,47 +32,50 @@ def models_list(all: bool):
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

-    if not all:
-        return TEST_MODELS
+    if all:
+        if is_quant_method_supported("aqlm"):
+            TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+                "quantization": "aqlm"
+            }))
+
+        # TODO: figure out why this fails.
+        if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+                "quantization": "gguf"
+            }))
+
+        if is_quant_method_supported("gptq"):
+            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+                "quantization": "gptq"
+            }))
+
+        if is_quant_method_supported("gptq_marlin"):
+            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+                "quantization": "gptq_marlin"
+            }))

-    if is_quant_method_supported("aqlm"):
-        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-            "quantization": "aqlm"
-        }))
-
-    # TODO: figure out why this fails.
-    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-            "quantization": "gguf"
-        }))
-
-    if is_quant_method_supported("gptq"):
-        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-            "quantization": "gptq"
-        }))
-
-    if is_quant_method_supported("gptq_marlin"):
-        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-            "quantization": "gptq_marlin"
-        }))
-
-    if is_quant_method_supported("gptq_marlin_24"):
-        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-            "quantization": "gptq_marlin_24"
-        }))
-
-    if is_quant_method_supported("marlin"):
-        TEST_MODELS.append(
-            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-                "quantization": "marlin"
+        if is_quant_method_supported("gptq_marlin_24"):
+            TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+                "quantization": "gptq_marlin_24"
            }))

-    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-            "quantization": "AWQ"
-        }))
+        if is_quant_method_supported("marlin"):
+            TEST_MODELS.append(
+                ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                    "quantization": "marlin"
+                }))

-    return TEST_MODELS
+        if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+            TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+                "quantization": "AWQ"
+            }))
+
+    if keywords is None:
+        return TEST_MODELS
+
+    # filter by keywords
+    pred = lambda model: any(keyword in model[0] for keyword in keywords)
+    return list(filter(pred, TEST_MODELS))


 @pytest.mark.parametrize(
@@ -96,20 +99,30 @@ def test_full_graph(
        run_model(optimization_level, model, model_kwargs)


+PassConfig = CompilationConfig.PassConfig
+
+
 # TODO(luka) add other supported compilation config scenarios here
 @pytest.mark.parametrize(
-    "compilation_config",
-    # additional compile sizes
+    "compilation_config, model_info",
    [
-        CompilationConfig(level=CompilationLevel.PIECEWISE,
-                          compile_sizes=[1, 2])
+        # additional compile sizes, only some of the models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           compile_sizes=[1, 2]), model)
+        for model in models_list(all=False)
+    ] + [
+        # RMSNorm + quant fusion, only 8-bit quant models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           custom_ops=["+rms_norm"],
+                           pass_config=PassConfig(enable_fusion=True,
+                                                  enable_noop=True)), model)
+        for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
    ])
 # only test some of the models
-@pytest.mark.parametrize("model_info", models_list(all=False))
 @create_new_process_for_each_test()
 def test_custom_compile_config(
-    model_info: tuple[str, dict[str, Any]],
    compilation_config: CompilationConfig,
+    model_info: tuple[str, dict[str, Any]],
 ):
    model, model_kwargs = model_info
    print(f"MODEL={model}")

--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -44,12 +44,17 @@ class TestModel(torch.nn.Module):
        resid = torch.sqrt(x)
        y = self.norm[0](x)

-        x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
+        x2 = self.fp8_linear.apply(y,
+                                   self.w[0],
+                                   self.wscale[0],
+                                   input_scale=self.scale[0])
        # make sure resid is used for replacement to work
        y2, resid = self.norm[1](x2, resid)

-        x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
-                                   self.scale[1])
+        x3 = self.fp8_linear.apply(y2,
+                                   self.w[1],
+                                   self.wscale[1],
+                                   input_scale=self.scale[1])
        y3, resid = self.norm[2](x3, resid)  # use resid here
        return y3


--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -676,8 +676,9 @@ class HfRunner:
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

-    def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
-        return self.model.encode(prompts)
+    def encode(self, prompts: list[str], *args,
+               **kwargs) -> list[list[torch.Tensor]]:
+        return self.model.encode(prompts, *args, **kwargs)

    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        return self.model.predict(prompts, convert_to_tensor=True)
@@ -964,19 +965,19 @@ class VllmRunner:
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

-    def encode(
-        self,
-        prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[list[float]]:
+    def encode(self,
+               prompts: list[str],
+               images: Optional[PromptImageInput] = None,
+               videos: Optional[PromptVideoInput] = None,
+               audios: Optional[PromptAudioInput] = None,
+               *args,
+               **kwargs) -> list[list[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

-        req_outputs = self.model.embed(inputs)
+        req_outputs = self.model.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def score(

--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
 # SPDX-License-Identifier: Apache-2.0

-from argparse import ArgumentTypeError
+from argparse import ArgumentError, ArgumentTypeError

 import pytest

@@ -142,3 +142,39 @@ def test_composite_arg_parser(arg, expected, option):
    else:
        args = parser.parse_args([f"--{option}", arg])
    assert getattr(args, option.replace("-", "_")) == expected
+
+
+def test_human_readable_model_len():
+    # `exit_on_error` disabled to test invalid values below
+    parser = EngineArgs.add_cli_args(
+        FlexibleArgumentParser(exit_on_error=False))
+
+    args = parser.parse_args([])
+    assert args.max_model_len is None
+
+    args = parser.parse_args(["--max-model-len", "1024"])
+    assert args.max_model_len == 1024
+
+    # Lower
+    args = parser.parse_args(["--max-model-len", "1m"])
+    assert args.max_model_len == 1_000_000
+    args = parser.parse_args(["--max-model-len", "10k"])
+    assert args.max_model_len == 10_000
+
+    # Capital
+    args = parser.parse_args(["--max-model-len", "3K"])
+    assert args.max_model_len == 1024 * 3
+    args = parser.parse_args(["--max-model-len", "10M"])
+    assert args.max_model_len == 2**20 * 10
+
+    # Decimal values
+    args = parser.parse_args(["--max-model-len", "10.2k"])
+    assert args.max_model_len == 10200
+    # ..truncated to the nearest int
+    args = parser.parse_args(["--max-model-len", "10.212345k"])
+    assert args.max_model_len == 10212
+
+    # Invalid (do not allow decimals with binary multipliers)
+    for invalid in ["1a", "pwd", "10.24", "1.23M"]:
+        with pytest.raises(ArgumentError):
+            args = parser.parse_args(["--max-model-len", invalid])
--- a/tests/engine/test_short_mm_context.py
+++ b/tests/engine/test_short_mm_context.py
@@ -19,7 +19,8 @@ models = [os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")]
 def test_context_length_too_short(vllm_runner, image_assets, model):
    images = [asset.pil_image for asset in image_assets]

-    with pytest.raises(ValueError, match="too long to fit into the model"):
+    with pytest.raises(ValueError,
+                       match="longer than the maximum model length"):
        vllm_model = vllm_runner(
            model,
            max_model_len=128,  # LLaVA has a feature size of 576

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -3,10 +3,12 @@
 import json
 import re
 import weakref
+from enum import Enum

 import jsonschema
 import pytest
 import os
+from pydantic import BaseModel

 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
@@ -287,15 +289,26 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):

 @pytest.mark.skip_global_cleanup
 def test_disable_guided_decoding_fallback(sample_regex, llm):
+    # see has_xgrammar_unsupported_json_features()
+    unsupported_json = {
+        "type": "object",
+        "properties": {
+            "example": {
+                "type": "string",
+                "minLength": 5  # unsupported by xgrammar
+            }
+        }
+    }
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     guided_decoding=GuidedDecodingParams(
-                                         regex=sample_regex,
+                                         json=unsupported_json,
                                         backend="xgrammar:no-fallback"))

    with pytest.raises(
            ValueError,
-            match="xgrammar does not support regex guided decoding"):
+            match="xgrammar does not support advanced JSON schema features "
+            "like enums, patterns or numeric ranges."):
        llm.generate(prompts="This should fail",
                     sampling_params=sampling_params,
                     use_tqdm=True)
@@ -333,3 +346,44 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
            # Parse to verify it is valid JSON
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)
+
+
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"
+
+
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType
+
+
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
+    json_schema = CarDescription.model_json_schema()
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=json_schema,
+                                         backend=guided_decoding_backend))
+    outputs = llm.generate(
+        prompts="Generate a JSON with the brand, model and car_type of "
+        "the most iconic car from the 90's",
+        sampling_params=sampling_params,
+        use_tqdm=True)
+
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json, schema=json_schema)
\ No newline at end of file
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -16,8 +16,8 @@ def v1(run_with_both_engines):


 def test_empty_prompt():
-    llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
-    with pytest.raises(ValueError, match='Prompt cannot be empty'):
+    llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
+    with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
         llm.generate([""])



--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -12,8 +12,10 @@ from ...utils import RemoteOpenAIServer, models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
 TEST_AUDIO_URLS = [
-    "http://localhost:8000/winning_call.ogg"
+    AudioAsset("winning_call").url,
+    AudioAsset("mary_had_lamb").url,
 ]
+MAXIMUM_AUDIOS = 2

 @pytest.fixture(scope="module")
 def server():
@@ -24,6 +26,8 @@ def server():
        "5",
        "--enforce-eager",
        "--trust-remote-code",
+        "--limit-mm-per-prompt",
+        f"audio={MAXIMUM_AUDIOS}",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -46,7 +50,7 @@ def base64_encoded_audio() -> dict[str, str]:

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
                                         model_name: str, audio_url: str):
    messages = [{
@@ -100,7 +104,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_audio_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: dict[str, str]):
@@ -158,7 +162,7 @@ async def test_single_chat_session_audio_base64encoded(

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_input_audio(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: dict[str, str]):
@@ -330,28 +334,21 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+@pytest.mark.parametrize(
+    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]])
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str,
-                                 base64_encoded_audio: dict[str, str]):
+                                 audio_urls: list[str]):

    messages = [{
        "role":
        "user",
        "content": [
-            {
+            *({
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
-            },
-            {
-                "type": "input_audio",
-                "input_audio": {
-                    "data": base64_encoded_audio[audio_url],
-                    "format": "wav"
-                }
-            },
+            } for audio_url in audio_urls),
            {
                "type": "text",
                "text": "What's happening in this audio?"
@@ -359,20 +356,30 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
        ],
    }]

-    with pytest.raises(openai.BadRequestError):  # test multi-audio input
-        await client.chat.completions.create(
+    if len(audio_urls) > MAXIMUM_AUDIOS:
+        with pytest.raises(openai.BadRequestError):  # test multi-audio input
+            await client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                max_completion_tokens=10,
+                temperature=0.0,
+            )
+
+        # the server should still work afterwards
+        completion = await client.completions.create(
+            model=model_name,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+        )
+        completion = completion.choices[0].text
+        assert completion is not None and len(completion) >= 0
+    else:
+        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
-
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    completion = completion.choices[0].text
-    assert completion is not None and len(completion) >= 0
+        message = chat_completion.choices[0].message
+        assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -21,8 +21,6 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")

-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-

 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -492,20 +490,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
    assert last_completion_tokens == 10


-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                  sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -520,8 +507,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in sample_guided_choice

@@ -535,22 +521,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in sample_guided_choice
    assert choice1 != choice2


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                sample_json_schema):

-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -565,8 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
@@ -583,8 +562,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
@@ -594,13 +572,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):

    messages = [{
        "role": "system",
@@ -615,8 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None
@@ -627,8 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
@@ -657,15 +627,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                           sample_guided_choice):

-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -681,8 +645,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))

    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
@@ -694,14 +657,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -733,7 +689,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                "name": "dummy_function_name"
            }
        },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
    message = chat_completion.choices[0].message
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
@@ -768,7 +724,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                "name": "dummy_function_name"
            }
        },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
        stream=True)

    output = []
@@ -893,7 +848,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
        model=model_name,
        tools=tools,
        tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
    )

    assert chat_completion.choices[0].message.tool_calls is not None
@@ -905,7 +859,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
        model=model_name,
        tools=tools,
        tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
        stream=True,
    )

@@ -919,12 +872,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                  sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+# SPDX-License-Identifier: Apache-2.0
+
+import openai
+import pytest
+import pytest_asyncio
+
+from vllm.config import ModelConfig
+
+from ...utils import RemoteOpenAIServer
+
+# Model launched by the `server` fixture; its vocabulary size is used to pick
+# the in-range and out-of-range token ids for the logit_bias tests.
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+
+
+def get_vocab_size(model_name):
+    """Return the vocabulary size that ModelConfig reports for a model.
+
+    The same name is used for both the model and its tokenizer.
+    """
+    return ModelConfig(
+        model=model_name,
+        task="auto",
+        tokenizer=model_name,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="bfloat16",
+    ).get_vocab_size()
+
+
+@pytest.fixture(scope="module")
+def server():
+    """Launch one RemoteOpenAIServer for MODEL_NAME, shared by the module."""
+    server_flags = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "1024",
+        "--enforce-eager",
+    ]
+
+    # Leaving the `with` block (after the last test) tears the server down.
+    with RemoteOpenAIServer(MODEL_NAME, server_flags) as running_server:
+        yield running_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield an async client obtained from the module-scoped server."""
+    async with server.get_async_client() as openai_client:
+        yield openai_client
+
+
+@pytest.mark.asyncio
+async def test_chat_logit_bias_valid(client):
+    """Test that valid logit_bias values are accepted in chat completions."""
+    # Highest token id this test treats as valid: vocab_size - 1.
+    last_token_id = get_vocab_size(MODEL_NAME) - 1
+    conversation = [{
+        "role": "user",
+        "content": "Testing valid logit bias"
+    }]
+
+    response = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=conversation,
+        max_tokens=5,
+        logit_bias={str(last_token_id): 1.0},
+    )
+
+    assert response.choices[0].message.content is not None
+
+
+@pytest.mark.asyncio
+async def test_chat_logit_bias_invalid(client):
+    """Test that invalid logit_bias values are rejected in chat completions."""
+    vocab_size = get_vocab_size(MODEL_NAME)
+    out_of_range_id = vocab_size + 1
+    bad_bias = {str(out_of_range_id): 1.0}
+    conversation = [{
+        "role": "user",
+        "content": "Testing invalid logit bias"
+    }]
+
+    with pytest.raises(openai.BadRequestError) as raised:
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=conversation,
+            max_tokens=5,
+            logit_bias=bad_bias,
+        )
+
+    rejection = raised.value
+    rejection_text = str(rejection)
+
+    assert rejection.status_code == 400
+    # The error text must name both the offending id and the vocab size.
+    assert str(out_of_range_id) in rejection_text
+    assert str(vocab_size) in rejection_text
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -12,6 +12,7 @@ import requests
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer

+from ...models.embedding.utils import check_embeddings_close
 from ...utils import RemoteOpenAIServer, models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
@@ -191,30 +192,35 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
    responses_float = await client.embeddings.create(input=input_texts,
                                                     model=model_name,
                                                     encoding_format="float")
+    float_data = [d.embedding for d in responses_float.data]

    responses_base64 = await client.embeddings.create(input=input_texts,
                                                      model=model_name,
                                                      encoding_format="base64")
-
-    decoded_responses_base64_data = []
+    base64_data = []
    for data in responses_base64.data:
-        decoded_responses_base64_data.append(
+        base64_data.append(
            np.frombuffer(base64.b64decode(data.embedding),
                          dtype="float32").tolist())

-    assert responses_float.data[0].embedding == decoded_responses_base64_data[
-        0]
-    assert responses_float.data[1].embedding == decoded_responses_base64_data[
-        1]
+    check_embeddings_close(
+        embeddings_0_lst=float_data,
+        embeddings_1_lst=base64_data,
+        name_0="float",
+        name_1="base64",
+    )

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(input=input_texts,
                                                       model=model_name)
+    default_data = [d.embedding for d in responses_default.data]

-    assert responses_float.data[0].embedding == responses_default.data[
-        0].embedding
-    assert responses_float.data[1].embedding == responses_default.data[
-        1].embedding
+    check_embeddings_close(
+        embeddings_0_lst=float_data,
+        embeddings_1_lst=default_data,
+        name_0="float",
+        name_1="default",
+    )


 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
+"""
+
+from typing import NamedTuple
+
+import openai
+import pytest
+
+from vllm.entrypoints.openai.protocol import EmbeddingResponse
+
+from ...utils import RemoteOpenAIServer
+
+
+class ModelInfo(NamedTuple):
+    """An embedding model under test and how it should treat `dimensions`."""
+    # Model identifier: passed to the server and sent as `model` in requests.
+    name: str
+    # Selects which `dimensions` values the test expects to be accepted:
+    # True -> an explicit 16 must succeed; False -> it must be rejected.
+    is_matryoshka: bool
+
+
+# Models the dimension-validation test is parametrized over: one flagged as
+# non-matryoshka and one flagged as matryoshka.
+MODELS = [
+    ModelInfo(name="BAAI/bge-m3", is_matryoshka=False),
+    ModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True),
+]
+
+# The same sentence three times; the test asserts that three embeddings
+# come back per request.
+input_texts = [
+    "The chef prepared a delicious meal.",
+] * 3
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+async def test_validating_dimensions(model: ModelInfo):
+    """Check that the `dimensions` request field is validated per model.
+
+    Every model must serve a request with `dimensions=None`. A model flagged
+    `is_matryoshka` must also accept `dimensions=16` and return embeddings of
+    exactly that length; a model without the flag must reject it. A negative
+    value must be rejected for every model. Each rejection is expected to
+    surface as `openai.BadRequestError`.
+    """
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--enforce-eager",
+        "--max-model-len",
+        "512",
+        "--trust_remote_code"
+    ]
+
+    if model.is_matryoshka:
+        accepted, rejected = [None, 16], [-1]
+    else:
+        accepted, rejected = [None], [-1, 16]
+
+    with RemoteOpenAIServer(model.name, args) as remote_server:
+        # Close the client's connections before the server is torn down.
+        async with remote_server.get_async_client() as client:
+
+            async def make_request(dimensions):
+                embedding_response = await client.embeddings.create(
+                    model=model.name,
+                    input=input_texts,
+                    dimensions=dimensions,
+                    encoding_format="float",
+                )
+                embeddings = EmbeddingResponse.model_validate(
+                    embedding_response.model_dump(mode="json"))
+
+                assert embeddings.id is not None
+                assert len(embeddings.data) == 3
+                assert len(embeddings.data[0].embedding) > 0
+                assert embeddings.usage.completion_tokens == 0
+                assert embeddings.usage.prompt_tokens > 0
+                assert embeddings.usage.total_tokens > 0
+
+                if dimensions is not None:
+                    assert len(embeddings.data[0].embedding) == dimensions
+
+            for dimensions in accepted:
+                await make_request(dimensions)
+
+            for dimensions in rejected:
+                # One `raises` per value: a single context around the whole
+                # loop exits on the first error, so later values would never
+                # be sent or checked.
+                with pytest.raises(openai.BadRequestError):
+                    await make_request(dimensions)