Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/tests/v1/entrypoints/llm/__init__.py
+++ b/tests/v1/entrypoints/llm/__init__.py
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -6,8 +6,8 @@ import json

 import pytest

-from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
-from .conftest import add_attention_backend
+from tests.entrypoints.openai.conftest import add_attention_backend
+from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer

 MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
            model_name,
            foscolo,
            language="it",
-            expected_text="ove il mio corpo fanciulletto giacque",
+            expected_text="ove il mio corpo fanciulletto",
        )
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -13,7 +13,7 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer

 MODEL_NAME = "openai/whisper-large-v3-turbo"


--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -14,8 +14,8 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from ...utils import RemoteOpenAIServer
-from .conftest import add_attention_backend
+from tests.entrypoints.openai.conftest import add_attention_backend
+from tests.utils import RemoteOpenAIServer

 SERVER_ARGS = ["--enforce-eager"]


--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -291,3 +291,32 @@ def test_served_model_name_parsing(tmp_path, vllm_parser, args, raises):
    else:
        with pytest.raises(raises):
            vllm_parser.parse_args(args=args)
+
+
+### Tests for LoRA target modules parsing
+def test_lora_target_modules_single(serve_parser):
+    """Test parsing single lora-target-modules argument"""
+    args = serve_parser.parse_args(
+        args=["--enable-lora", "--lora-target-modules", "o_proj"]
+    )
+    assert args.lora_target_modules == ["o_proj"]
+
+
+def test_lora_target_modules_multiple(serve_parser):
+    """Test parsing multiple lora-target-modules arguments"""
+    args = serve_parser.parse_args(
+        args=[
+            "--enable-lora",
+            "--lora-target-modules",
+            "o_proj",
+            "qkv_proj",
+            "down_proj",
+        ]
+    )
+    assert args.lora_target_modules == ["o_proj", "qkv_proj", "down_proj"]
+
+
+def test_lora_target_modules_default_none(serve_parser):
+    """Test that lora-target-modules defaults to None"""
+    args = serve_parser.parse_args(args=[])
+    assert args.lora_target_modules is None
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
    ]
 )

-MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
+MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
 INPUT_TRANSCRIPTION_BATCH = (
    json.dumps(
        {

--- a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
@@ -5,7 +5,7 @@ import json

 import pytest

-from tests.entrypoints.openai.tool_parsers.utils import (
+from tests.tool_parsers.utils import (
    run_tool_extraction,
    run_tool_extraction_streaming,
 )
@@ -13,6 +13,13 @@ from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

+MSG_SEP_TOKEN = "<|message_sep|>\n\n"
+ROLE_SEP_TOKEN = "<|role_sep|>\n"
+EOS_TOKEN = "</s>"
+TOOL_HEADER_GIGACHAT3 = f"function call{ROLE_SEP_TOKEN}"
+TOOL_HEADER_GIGACHAT31 = "<|function_call|>"
+
+
 SIMPLE_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
@@ -24,7 +31,10 @@ SIMPLE_FUNCTION_JSON = json.dumps(
    },
    ensure_ascii=False,
 )
-SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON
+SIMPLE_FUNCTION_OUTPUT_GIGACHAT3 = (
+    f"{MSG_SEP_TOKEN}{TOOL_HEADER_GIGACHAT3}{SIMPLE_FUNCTION_JSON}"
+)
+SIMPLE_FUNCTION_OUTPUT_GIGACHAT31 = f"{TOOL_HEADER_GIGACHAT31}{SIMPLE_FUNCTION_JSON}"
 SIMPLE_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False),
@@ -38,7 +48,12 @@ PARAMETERLESS_FUNCTION_JSON = json.dumps(
    },
    ensure_ascii=False,
 )
-PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON
+PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT3 = (
+    f"{MSG_SEP_TOKEN}{TOOL_HEADER_GIGACHAT3}{PARAMETERLESS_FUNCTION_JSON}"
+)
+PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT31 = (
+    f"{TOOL_HEADER_GIGACHAT31}{PARAMETERLESS_FUNCTION_JSON}"
+)
 PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps({}, ensure_ascii=False),
@@ -62,17 +77,38 @@ COMPLEX_FUNCTION_JSON = json.dumps(
    },
    ensure_ascii=False,
 )
-COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON
+COMPLEX_FUNCTION_OUTPUT_GIGACHAT3 = (
+    f"{MSG_SEP_TOKEN}{TOOL_HEADER_GIGACHAT3}{COMPLEX_FUNCTION_JSON}"
+)
+COMPLEX_FUNCTION_OUTPUT_GIGACHAT31 = f"{TOOL_HEADER_GIGACHAT31}{COMPLEX_FUNCTION_JSON}"
 COMPLEX_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False),
 )


+CONTENT_TEXT = "I'll check that for you."
+MIXED_OUTPUT_GIGACHAT3 = f"{CONTENT_TEXT}{SIMPLE_FUNCTION_OUTPUT_GIGACHAT3}"
+MIXED_OUTPUT_GIGACHAT31 = f"{CONTENT_TEXT}{SIMPLE_FUNCTION_OUTPUT_GIGACHAT31}"
+
+
+@pytest.fixture(name="gigachat_tokenizer")
+def fixture_gigachat_tokenizer(default_tokenizer: TokenizerLike):
+    default_tokenizer.add_tokens(
+        [
+            MSG_SEP_TOKEN,
+            ROLE_SEP_TOKEN,
+            TOOL_HEADER_GIGACHAT31,
+            EOS_TOKEN,
+        ]
+    )
+    return default_tokenizer
+
+
 @pytest.mark.parametrize("streaming", [True, False])
-def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
+def test_no_tool_call(streaming: bool, gigachat_tokenizer: TokenizerLike):
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
-        default_tokenizer
+        gigachat_tokenizer
    )
    model_output = "How can I help you today?"
    content, tool_calls = run_tool_extraction(
@@ -85,45 +121,143 @@ def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
 TEST_CASES = [
    pytest.param(
        True,
-        SIMPLE_FUNCTION_OUTPUT,
+        SIMPLE_FUNCTION_OUTPUT_GIGACHAT3,
+        [SIMPLE_FUNCTION_CALL],
+        None,
+        id="simple_streaming_gigachat3",
+    ),
+    pytest.param(
+        False,
+        SIMPLE_FUNCTION_OUTPUT_GIGACHAT3,
+        [SIMPLE_FUNCTION_CALL],
+        None,
+        id="simple_nonstreaming_gigachat3",
+    ),
+    pytest.param(
+        True,
+        PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT3,
+        [PARAMETERLESS_FUNCTION_CALL],
+        None,
+        id="parameterless_streaming_gigachat3",
+    ),
+    pytest.param(
+        False,
+        PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT3,
+        [PARAMETERLESS_FUNCTION_CALL],
+        None,
+        id="parameterless_nonstreaming_gigachat3",
+    ),
+    pytest.param(
+        True,
+        COMPLEX_FUNCTION_OUTPUT_GIGACHAT3,
+        [COMPLEX_FUNCTION_CALL],
+        None,
+        id="complex_streaming_gigachat3",
+    ),
+    pytest.param(
+        False,
+        COMPLEX_FUNCTION_OUTPUT_GIGACHAT3,
+        [COMPLEX_FUNCTION_CALL],
+        None,
+        id="complex_nonstreaming_gigachat3",
+    ),
+    pytest.param(
+        True,
+        MIXED_OUTPUT_GIGACHAT3,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_streaming_gigachat3",
+    ),
+    pytest.param(
+        False,
+        MIXED_OUTPUT_GIGACHAT3,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_nonstreaming_gigachat3",
+    ),
+    pytest.param(
+        True,
+        MIXED_OUTPUT_GIGACHAT3 + EOS_TOKEN,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_streaming_with_eos_gigachat3",
+    ),
+    pytest.param(
+        False,
+        MIXED_OUTPUT_GIGACHAT3 + EOS_TOKEN,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_nonstreaming_with_eos_gigachat3",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_FUNCTION_OUTPUT_GIGACHAT31,
        [SIMPLE_FUNCTION_CALL],
        None,
-        id="simple_streaming",
+        id="simple_streaming_gigachat31",
    ),
    pytest.param(
        False,
-        SIMPLE_FUNCTION_OUTPUT,
+        SIMPLE_FUNCTION_OUTPUT_GIGACHAT31,
        [SIMPLE_FUNCTION_CALL],
        None,
-        id="simple_nonstreaming",
+        id="simple_nonstreaming_gigachat31",
    ),
    pytest.param(
        True,
-        PARAMETERLESS_FUNCTION_OUTPUT,
+        PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT31,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
-        id="parameterless_streaming",
+        id="parameterless_streaming_gigachat31",
    ),
    pytest.param(
        False,
-        PARAMETERLESS_FUNCTION_OUTPUT,
+        PARAMETERLESS_FUNCTION_OUTPUT_GIGACHAT31,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
-        id="parameterless_nonstreaming",
+        id="parameterless_nonstreaming_gigachat31",
    ),
    pytest.param(
        True,
-        COMPLEX_FUNCTION_OUTPUT,
+        COMPLEX_FUNCTION_OUTPUT_GIGACHAT31,
        [COMPLEX_FUNCTION_CALL],
        None,
-        id="complex_streaming",
+        id="complex_streaming_gigachat31",
    ),
    pytest.param(
        False,
-        COMPLEX_FUNCTION_OUTPUT,
+        COMPLEX_FUNCTION_OUTPUT_GIGACHAT31,
        [COMPLEX_FUNCTION_CALL],
        None,
-        id="complex_nonstreaming",
+        id="complex_nonstreaming_gigachat31",
+    ),
+    pytest.param(
+        True,
+        MIXED_OUTPUT_GIGACHAT31,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_streaming_gigachat31",
+    ),
+    pytest.param(
+        False,
+        MIXED_OUTPUT_GIGACHAT31,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_nonstreaming_gigachat31",
+    ),
+    pytest.param(
+        True,
+        MIXED_OUTPUT_GIGACHAT31 + EOS_TOKEN,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_streaming_with_eos_gigachat31",
+    ),
+    pytest.param(
+        False,
+        MIXED_OUTPUT_GIGACHAT31 + EOS_TOKEN,
+        [SIMPLE_FUNCTION_CALL],
+        CONTENT_TEXT,
+        id="mixed_content_nonstreaming_with_eos_gigachat31",
    ),
 ]

@@ -136,14 +270,16 @@ def test_tool_call(
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    expected_content: str | None,
-    default_tokenizer: TokenizerLike,
+    gigachat_tokenizer: TokenizerLike,
 ):
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
-        default_tokenizer
+        gigachat_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=streaming
    )
+    if content == "":
+        content = None
    assert content == expected_content
    assert len(tool_calls) == len(expected_tool_calls)
    for actual, expected in zip(tool_calls, expected_tool_calls):
@@ -154,15 +290,46 @@ def test_tool_call(
        assert actual_args == expected_args


-def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
+@pytest.mark.parametrize(
+    "model_output_deltas",
+    [
+        pytest.param(
+            [
+                CONTENT_TEXT[:3],
+                CONTENT_TEXT[3:5],
+                CONTENT_TEXT[5:],
+                MSG_SEP_TOKEN,
+                TOOL_HEADER_GIGACHAT3,
+                COMPLEX_FUNCTION_JSON[:40],
+                COMPLEX_FUNCTION_JSON[40:-1],
+                COMPLEX_FUNCTION_JSON[-1],
+            ],
+            id="gigachat3",
+        ),
+        pytest.param(
+            [
+                CONTENT_TEXT[:3],
+                CONTENT_TEXT[3:5],
+                CONTENT_TEXT[5:],
+                TOOL_HEADER_GIGACHAT31,
+                COMPLEX_FUNCTION_JSON[:40],
+                COMPLEX_FUNCTION_JSON[40:-1],
+                COMPLEX_FUNCTION_JSON[-1],
+            ],
+            id="gigachat31",
+        ),
+    ],
+)
+def test_streaming_tool_call_with_large_steps(
+    model_output_deltas: list[str],
+    gigachat_tokenizer: TokenizerLike,
+):
+    """
+    Test that the closing braces are streamed correctly.
+    """
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
-        default_tokenizer
+        gigachat_tokenizer
    )
-    model_output_deltas = [
-        "function call",
-        COMPLEX_FUNCTION_JSON[:40],
-        COMPLEX_FUNCTION_JSON[40:],
-    ]
    reconstructor = run_tool_extraction_streaming(
        tool_parser,
        model_output_deltas,

--- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
@@ -7,7 +7,7 @@ from unittest.mock import MagicMock

 import pytest

-from tests.entrypoints.openai.tool_parsers.utils import (
+from tests.tool_parsers.utils import (
    run_tool_extraction,
    run_tool_extraction_streaming,
 )

--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch

 import pytest

-from tests.entrypoints.openai.tool_parsers.utils import (
+from tests.tool_parsers.utils import (
    run_tool_extraction,
    run_tool_extraction_streaming,
 )

--- a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch

 import pytest

-from tests.entrypoints.openai.tool_parsers.utils import (
+from tests.tool_parsers.utils import (
    run_tool_extraction,
    run_tool_extraction_streaming,
 )

--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch

 import pytest

-from tests.entrypoints.openai.tool_parsers.utils import (
+from tests.tool_parsers.utils import (
    run_tool_extraction,
    run_tool_extraction_streaming,
 )

--- a/tests/entrypoints/pooling/embed/test_cohere_online.py
+++ b/tests/entrypoints/pooling/embed/test_cohere_online.py
@@ -7,10 +7,10 @@ embedding models, covering text embedding, embedding type conversions,
 response structure, batching, normalisation, and semantic similarity.
 """

-import base64
 import struct

 import numpy as np
+import pybase64 as base64
 import pytest
 import requests


--- a/tests/entrypoints/pooling/embed/test_cohere_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_cohere_online_vision.py
@@ -6,11 +6,11 @@ Validates image embedding, batching, normalisation, and embedding type
 conversions through the /v2/embed endpoint.
 """

-import base64
 import struct
 import zlib

 import numpy as np
+import pybase64 as base64
 import pytest
 import requests


--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import base64
 import json

 import numpy as np
 import openai
+import pybase64 as base64
 import pytest
 import pytest_asyncio
 import requests

--- a/tests/entrypoints/pooling/embed/test_protocol.py
+++ b/tests/entrypoints/pooling/embed/test_protocol.py
@@ -3,10 +3,10 @@
 """Unit tests for Cohere embed protocol: build_typed_embeddings and its
 underlying packing helpers, plus Cohere-specific serving helpers."""

-import base64
 import struct

 import numpy as np
+import pybase64 as base64
 import pytest

 from vllm.entrypoints.pooling.embed.protocol import (

--- a/tests/entrypoints/pooling/pooling/test_online.py
+++ b/tests/entrypoints/pooling/pooling/test_online.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import base64
 import json

 import numpy as np
+import pybase64 as base64
 import pytest
 import requests
 import torch

--- a/tests/v1/entrypoints/openai/serving_responses/__init__.py
+++ b/tests/v1/entrypoints/openai/serving_responses/__init__.py
--- a/tests/entrypoints/serve/disagg/__init__.py
+++ b/tests/entrypoints/serve/disagg/__init__.py
--- a/tests/entrypoints/openai/test_serving_tokens.py
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@@ -8,12 +8,11 @@ import pytest
 import pytest_asyncio
 from transformers import AutoTokenizer

+from tests.utils import RemoteOpenAIServer
 from vllm.config import ModelConfig
 from vllm.config.utils import getattr_iter
 from vllm.v1.engine.detokenizer import check_stop_strings

-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "Qwen/Qwen3-0.6B"
 GEN_ENDPOINT = "/inference/v1/generate"