feat(v4): cherry-pick #8665 onto release/deepseekv4 (#8709)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat(v4): cherry-pick #8665 onto release/deepseekv4 (#8709)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
35fa7129 · Keiven C · GitHub · 01002df7 · 35fa7129 · 35fa7129
Unverified Commit 35fa7129 authored Apr 24, 2026 by Keiven C Committed by GitHub Apr 24, 2026
20 changed files
--- a/.github/workflows/copyright-check.ps1
+++ b/.github/workflows/copyright-check.ps1
@@ -84,7 +84,7 @@ $global:copyright_results = @{

 $ignored_files = @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml', 'codespell.txt', 'exclusions.txt')
 write-debug "<copyright-check> ignored_files = ['$($ignored_files -join "','")']."
-$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'lib/llm/tests/data/sample-models', 'lib/llm/tests/data/deepseek-v3.2')
+$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'lib/llm/tests/data/sample-models', 'lib/llm/tests/data/deepseek-v3.2', 'lib/llm/tests/data/deepseek-v4')
 write-debug "<copyright-check> ignored_paths = ['$($ignored_paths -join "','")']."
 $ignored_types = @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md', '.json')
 write-debug "<copyright-check> ignored_types = ['$($ignored_types -join "', '")']."

--- a/components/src/dynamo/frontend/sglang_prepost.py
+++ b/components/src/dynamo/frontend/sglang_prepost.py
@@ -3,9 +3,12 @@

 from __future__ import annotations

+import copy
+import inspect
 import json
 import logging
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Any, TypeAlias

 from sglang.srt.entrypoints.openai.protocol import Function as SglangFunction
@@ -129,6 +132,127 @@ def _is_named_tool_choice(tool_choice: Any) -> bool:
    )


+def _normalize_deepseek_v4_hint(value: Any) -> str:
+    """Canonicalize a model or parser name for DeepSeek-V4 substring matching.
+
+    Falsy values (``None``, ``""``) become the empty string. Anything else is
+    stringified, lower-cased, and stripped of ``-`` and ``_`` so spellings such
+    as ``DeepSeek-V4``, ``deepseek_v4`` and ``deepseekv4`` normalize alike.
+    """
+    text = (str(value) if value else "").lower()
+    for separator in ("-", "_"):
+        text = text.replace(separator, "")
+    return text
+
+
+def _should_use_deepseek_v4_encoding(
+    request: dict[str, Any],
+    *,
+    tokenizer,
+    tool_call_parser_name: str | None,
+    reasoning_parser_name: str | None,
+) -> bool:
+    """Decide whether the prompt should go through the DeepSeek-V4 encoder.
+
+    A tokenizer exposing a non-``None`` ``chat_template`` always wins, so the
+    custom encoder is only a fallback. Otherwise the request's ``model`` and
+    both parser names are normalized and checked for a ``deepseekv4`` marker.
+    """
+    has_chat_template = getattr(tokenizer, "chat_template", None) is not None
+    if has_chat_template:
+        return False
+
+    hints = (request.get("model"), tool_call_parser_name, reasoning_parser_name)
+    for hint in hints:
+        if "deepseekv4" in _normalize_deepseek_v4_hint(hint):
+            return True
+    return False
+
+
+def _filter_template_tools(
+    request: dict[str, Any],
+    *,
+    exclude_tools_when_tool_choice_none: bool,
+) -> list[dict[str, Any]] | None:
+    """Select the tool definitions that should be rendered into the prompt.
+
+    Returns ``None`` when the request carries no tools, or when
+    ``tool_choice`` is ``"none"`` and the caller asked for tools to be hidden
+    in that case. A named tool choice narrows the list to the tools whose
+    function name matches. Every returned tool is a deep copy, so later edits
+    never reach the caller's request.
+    """
+    request_tools = request.get("tools") or []
+    selection = request.get("tool_choice", "auto")
+
+    hide_tools = exclude_tools_when_tool_choice_none and selection == "none"
+    if not request_tools or hide_tools:
+        return None
+
+    if not _is_named_tool_choice(selection):
+        return copy.deepcopy(request_tools)
+
+    wanted = selection["function"]["name"]
+    matching = []
+    for tool in request_tools:
+        if tool.get("function", {}).get("name") == wanted:
+            matching.append(copy.deepcopy(tool))
+    return matching
+
+
+def _render_deepseek_v4_prompt_token_ids(
+    request: dict[str, Any],
+    *,
+    messages: list[dict[str, Any]],
+    tokenizer,
+    template_tools: list[dict[str, Any]] | None,
+) -> list[int]:
+    """Render a chat request with SGLang's DeepSeek-V4 encoder and tokenize it.
+
+    The messages are deep-copied first, so the caller's request is never
+    mutated. On the copy, ``None``/missing ``content`` becomes ``""`` and any
+    ``template_tools`` are attached to the leading system message (an empty
+    one is inserted when the conversation does not start with one).
+
+    Thinking mode comes from ``chat_template_kwargs``: an explicit ``thinking``
+    flag wins, otherwise ``thinking_mode == "thinking"`` enables it, and the
+    default is chat mode. ``reasoning_effort`` is read from the request, then
+    from ``chat_template_kwargs``; anything other than ``"max"`` or ``"high"``
+    reaches the encoder as ``None``.
+
+    Raises:
+        ValueError: If the installed SGLang has no ``encoding_dsv4`` module.
+    """
+    try:
+        from sglang.srt.entrypoints.openai.encoding_dsv4 import encode_messages
+    except ImportError as exc:
+        raise ValueError(
+            "DeepSeek-V4 preprocessing requires SGLang's "
+            "sglang.srt.entrypoints.openai.encoding_dsv4 encoder. "
+            "Install an SGLang build that includes the DeepSeek-V4 integration."
+        ) from exc
+
+    encoding_messages = copy.deepcopy(messages)
+    for msg in encoding_messages:
+        if msg.get("content") is None:
+            msg["content"] = ""
+
+    if template_tools:
+        if not encoding_messages or encoding_messages[0].get("role") != "system":
+            encoding_messages.insert(0, {"role": "system", "content": ""})
+        encoding_messages[0]["tools"] = template_tools
+
+    chat_template_kwargs = request.get("chat_template_kwargs") or {}
+    # Accept both request spellings: the boolean "thinking" flag takes
+    # precedence, and "thinking_mode" is consulted only when it is absent, so
+    # a request that sends just thinking_mode="thinking" is not rendered in
+    # chat mode.
+    if "thinking" in chat_template_kwargs:
+        thinking_enabled = bool(chat_template_kwargs["thinking"])
+    else:
+        thinking_enabled = chat_template_kwargs.get("thinking_mode") == "thinking"
+    thinking_mode = "thinking" if thinking_enabled else "chat"
+
+    reasoning_effort = (
+        request.get("reasoning_effort")
+        or chat_template_kwargs.get("reasoning_effort")
+        or None
+    )
+    # Unsupported effort levels are dropped rather than rejected.
+    if reasoning_effort not in ("max", "high", None):
+        reasoning_effort = None
+
+    prompt = encode_messages(
+        encoding_messages,
+        thinking_mode=thinking_mode,
+        reasoning_effort=reasoning_effort,
+    )
+    return _normalize_prompt_token_ids(tokenizer.encode(prompt))
+
+
+@lru_cache(maxsize=64)
+def _callable_accepts_kwarg(func: Any, kwarg: str) -> bool:
+    """Report whether ``func`` can be called with ``kwarg`` passed by keyword.
+
+    True when the signature has a ``**kwargs`` catch-all, or a parameter named
+    ``kwarg`` that is not positional-only. Callables whose signature cannot be
+    inspected are treated as not accepting it. Results are memoized per
+    ``(func, kwarg)`` pair.
+    """
+    try:
+        signature = inspect.signature(func)
+    except (TypeError, ValueError):
+        return False
+
+    nameable_kinds = (
+        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        inspect.Parameter.KEYWORD_ONLY,
+    )
+    return any(
+        param.kind == inspect.Parameter.VAR_KEYWORD
+        or (name == kwarg and param.kind in nameable_kinds)
+        for name, param in signature.parameters.items()
+    )
+
+
+def _call_with_optional_parallel_tool_calls(
+    func: Any,
+    *args: Any,
+    parallel_tool_calls: Any,
+) -> Any:
+    """Call SGLang helpers across versions with/without parallel_tool_calls.
+
+    ``func`` is invoked with ``*args``; ``parallel_tool_calls`` is forwarded as
+    a keyword only when the callable's signature can accept it.
+    """
+    # Inspect the underlying function of a bound method rather than the bound
+    # method itself: a bound method is keyed by its instance as well, so a
+    # method taken from a freshly built parser would never hit the lru_cache
+    # and would keep that parser alive inside it.
+    signature_source = getattr(func, "__func__", func)
+    try:
+        accepts = _callable_accepts_kwarg(signature_source, "parallel_tool_calls")
+    except TypeError:
+        # Unhashable callables cannot be an lru_cache key; inspect uncached.
+        accepts = _callable_accepts_kwarg.__wrapped__(
+            signature_source, "parallel_tool_calls"
+        )
+
+    if accepts:
+        return func(*args, parallel_tool_calls=parallel_tool_calls)
+    return func(*args)
+
+
 def build_tool_call_guided_decoding(
    request: dict[str, Any],
    *,
@@ -161,7 +285,8 @@ def build_tool_call_guided_decoding(
            )
        constraint = (
            "json_schema",
-            get_json_schema_constraint(
+            _call_with_optional_parallel_tool_calls(
+                get_json_schema_constraint,
                sglang_tools,
                sglang_tool_choice,
                parallel_tool_calls=parallel_tool_calls,
@@ -172,7 +297,8 @@ def build_tool_call_guided_decoding(
            tools=sglang_tools,
            tool_call_parser=tool_call_parser_name,
        )
-        constraint = parser.get_structure_constraint(
+        constraint = _call_with_optional_parallel_tool_calls(
+            parser.get_structure_constraint,
            tool_choice,
            parallel_tool_calls=parallel_tool_calls,
        )
@@ -239,30 +365,38 @@ def preprocess_chat_request(
                f"present in tools (available: {sorted(available_names) or 'none'})"
            )

-    # Build template kwargs -- single call for rendering + tokenization
-    template_kwargs: dict[str, Any] = {
-        "add_generation_prompt": True,
-        "tokenize": True,
-    }
-    # Strip tools from template when tool_choice=none so the model doesn't
-    # see them and generate raw XML tool calls in its response.
-    # When tool_choice names a specific function, only include that tool
-    # in the template so the model doesn't see irrelevant definitions.
-    if sglang_tools and not (
-        exclude_tools_when_tool_choice_none and tool_choice == "none"
-    ):
-        if _is_named_tool_choice(tool_choice):
-            chosen_name = tool_choice["function"]["name"]
-            template_kwargs["tools"] = [
-                t.model_dump() for t in sglang_tools if t.function.name == chosen_name
-            ]
-        else:
-            template_kwargs["tools"] = [t.model_dump() for t in sglang_tools]
-
-    prompt_token_ids = _normalize_prompt_token_ids(
-        tokenizer.apply_chat_template(messages, **template_kwargs)
+    template_tools = _filter_template_tools(
+        request,
+        exclude_tools_when_tool_choice_none=exclude_tools_when_tool_choice_none,
    )

+    if _should_use_deepseek_v4_encoding(
+        request,
+        tokenizer=tokenizer,
+        tool_call_parser_name=tool_call_parser_name,
+        reasoning_parser_name=reasoning_parser_name,
+    ):
+        prompt_token_ids = _render_deepseek_v4_prompt_token_ids(
+            request,
+            messages=messages,
+            tokenizer=tokenizer,
+            template_tools=template_tools,
+        )
+    else:
+        # Build template kwargs -- single call for rendering + tokenization
+        template_kwargs: dict[str, Any] = {
+            "add_generation_prompt": True,
+            "tokenize": True,
+        }
+        if template_tools:
+            template_kwargs["tools"] = template_tools
+
+        prompt_token_ids = _normalize_prompt_token_ids(
+            tokenizer.apply_chat_template(messages, **template_kwargs)
+        )
+
+    # Build parsers after rendering, so DeepSeek-V4 can use its custom encoder
+    # while still sharing the existing Dynamo parser/guided-decoding behavior.
    tool_call_parser, reasoning_parser = create_parsers(
        request,
        tool_call_parser_name=tool_call_parser_name,

--- a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py
+++ b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py
@@ -11,12 +11,15 @@ Parallels test_vllm_unit.py for the vLLM backend.


 import json
+import sys
+import types

 import pytest
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.function_call.json_array_parser import JsonArrayParser
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer

+import dynamo.frontend.sglang_prepost as sglang_prepost_module
 import dynamo.frontend.sglang_processor as sglang_processor_module
 from dynamo.frontend.sglang_prepost import (
    SglangPreprocessResult,
@@ -445,6 +448,85 @@ class TestBuildToolCallGuidedDecoding:
        assert isinstance(guided, dict)
        assert "json" in guided

+    def test_required_tool_choice_supports_older_sglang_constraint_signature(
+        self, monkeypatch
+    ):
+        """tool_choice="required" works when the schema helper lacks parallel_tool_calls."""
+        tools = convert_tools(
+            [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"city": {"type": "string"}},
+                        },
+                    },
+                }
+            ]
+        )
+
+        # Stand-in with a two-positional signature: passing parallel_tool_calls
+        # to it would raise TypeError.
+        def old_get_json_schema_constraint(sglang_tools, tool_choice):
+            assert sglang_tools == tools
+            assert tool_choice == "required"
+            return {"type": "array", "items": {"type": "object"}}
+
+        monkeypatch.setattr(
+            sglang_prepost_module,
+            "get_json_schema_constraint",
+            old_get_json_schema_constraint,
+        )
+
+        guided = build_tool_call_guided_decoding(
+            {"tool_choice": "required", "parallel_tool_calls": False},
+            tool_call_parser_name=None,
+            sglang_tools=tools,
+        )
+
+        # The stand-in's return value must surface unchanged under "json".
+        assert guided == {"json": {"type": "array", "items": {"type": "object"}}}
+
+    def test_auto_tool_choice_supports_older_structure_constraint_signature(
+        self, monkeypatch
+    ):
+        """tool_choice="auto" works when get_structure_constraint lacks parallel_tool_calls."""
+        tools = convert_tools(
+            [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "strict": True,
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"city": {"type": "string"}},
+                        },
+                    },
+                }
+            ]
+        )
+
+        # Stand-in parser whose get_structure_constraint takes only tool_choice.
+        class OldFunctionCallParser:
+            def __init__(self, *, tools, tool_call_parser):
+                self.tools = tools
+                self.tool_call_parser = tool_call_parser
+
+            def get_structure_constraint(self, tool_choice):
+                assert tool_choice == "auto"
+                return "structural_tag", {"type": "object"}
+
+        monkeypatch.setattr(
+            sglang_prepost_module,
+            "FunctionCallParser",
+            OldFunctionCallParser,
+        )
+
+        guided = build_tool_call_guided_decoding(
+            {"tool_choice": "auto", "parallel_tool_calls": False},
+            tool_call_parser_name="kimi_k2",
+            sglang_tools=tools,
+        )
+
+        # The (kind, payload) pair from the stand-in becomes a one-entry dict.
+        assert guided == {"structural_tag": {"type": "object"}}
+
    def test_auto_strict_tools_can_build_structural_tag_guidance(self):
        tools = convert_tools(
            [
@@ -990,6 +1072,240 @@ class TestPreprocessChatRequest:
        )
        assert len(with_system.prompt_token_ids) > len(without_system.prompt_token_ids)

+    def test_deepseek_v4_uses_sglang_encoder_when_chat_template_missing(
+        self, monkeypatch
+    ):
+        """DeepSeek-V4 uses SGLang's encoder instead of HF chat_template."""
+        captured = {}
+        # Register a fake encoder module so the import inside the code under
+        # test resolves without a DeepSeek-V4-capable SGLang build.
+        fake_module = types.ModuleType("sglang.srt.entrypoints.openai.encoding_dsv4")
+
+        def fake_encode_messages(messages, *, thinking_mode, reasoning_effort=None):
+            captured["messages"] = messages
+            captured["thinking_mode"] = thinking_mode
+            captured["reasoning_effort"] = reasoning_effort
+            return "<dsv4-prompt>"
+
+        fake_module.encode_messages = fake_encode_messages
+        monkeypatch.setitem(
+            sys.modules,
+            "sglang.srt.entrypoints.openai.encoding_dsv4",
+            fake_module,
+        )
+
+        # Tokenizer without a chat template; apply_chat_template must stay unused.
+        class NoTemplateTokenizer:
+            chat_template = None
+
+            def apply_chat_template(self, *args, **kwargs):
+                raise AssertionError("apply_chat_template should not be called")
+
+            def encode(self, prompt):
+                assert prompt == "<dsv4-prompt>"
+                return [1, 2, 3]
+
+        request = {
+            "model": "deepseek-ai/DeepSeek-V4-Pro",
+            "messages": [{"role": "user", "content": "Hello"}],
+            "chat_template_kwargs": {
+                "thinking": True,
+                "reasoning_effort": "max",
+            },
+            "tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "description": "Get weather",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"city": {"type": "string"}},
+                        },
+                    },
+                }
+            ],
+        }
+
+        result = preprocess_chat_request(
+            request,
+            tokenizer=NoTemplateTokenizer(),
+            tool_call_parser_name=None,
+            reasoning_parser_name="deepseek_v4",
+        )
+
+        assert result.prompt_token_ids == [1, 2, 3]
+        assert captured["thinking_mode"] == "thinking"
+        assert captured["reasoning_effort"] == "max"
+        # Tools ride on a system message inserted ahead of the user turn.
+        assert captured["messages"][0]["role"] == "system"
+        assert captured["messages"][0]["tools"][0]["function"]["name"] == "get_weather"
+        assert captured["messages"][1]["role"] == "user"
+
+    def test_deepseek_v4_named_tool_choice_filters_encoder_tools(self, monkeypatch):
+        """A named tool_choice passes only the chosen tool to the V4 encoder."""
+        captured = {}
+        # Fake encoder module that records the messages it receives.
+        fake_module = types.ModuleType("sglang.srt.entrypoints.openai.encoding_dsv4")
+
+        def fake_encode_messages(messages, *, thinking_mode, reasoning_effort=None):
+            captured["messages"] = messages
+            return "<dsv4-prompt>"
+
+        fake_module.encode_messages = fake_encode_messages
+        monkeypatch.setitem(
+            sys.modules,
+            "sglang.srt.entrypoints.openai.encoding_dsv4",
+            fake_module,
+        )
+
+        class NoTemplateTokenizer:
+            chat_template = None
+
+            def encode(self, prompt):
+                return [1]
+
+        request = {
+            "model": "deepseek-ai/DeepSeek-V4-Pro",
+            "messages": [{"role": "user", "content": "Hello"}],
+            "tools": [
+                {
+                    "type": "function",
+                    "function": {"name": "get_weather", "parameters": {}},
+                },
+                {
+                    "type": "function",
+                    "function": {"name": "get_time", "parameters": {}},
+                },
+            ],
+            "tool_choice": {
+                "type": "function",
+                "function": {"name": "get_time"},
+            },
+        }
+
+        preprocess_chat_request(
+            request,
+            tokenizer=NoTemplateTokenizer(),
+            tool_call_parser_name=None,
+            reasoning_parser_name="deepseek_v4",
+        )
+
+        # Only the tool named by tool_choice survives on the system message.
+        tools = captured["messages"][0]["tools"]
+        assert [tool["function"]["name"] for tool in tools] == ["get_time"]
+
+    def test_deepseek_v4_respects_existing_chat_template(self, monkeypatch):
+        """A tokenizer with a chat_template bypasses the V4 encoder entirely."""
+        fake_module = types.ModuleType("sglang.srt.entrypoints.openai.encoding_dsv4")
+
+        # The fake encoder fails the test if it is ever reached.
+        def fake_encode_messages(messages, *, thinking_mode, reasoning_effort=None):
+            raise AssertionError("encoding_dsv4 should not be called")
+
+        fake_module.encode_messages = fake_encode_messages
+        monkeypatch.setitem(
+            sys.modules,
+            "sglang.srt.entrypoints.openai.encoding_dsv4",
+            fake_module,
+        )
+
+        class TemplateTokenizer:
+            chat_template = (
+                "{% for message in messages %}{{ message.content }}{% endfor %}"
+            )
+
+            def apply_chat_template(self, messages, **kwargs):
+                assert kwargs["add_generation_prompt"] is True
+                assert kwargs["tokenize"] is True
+                return [4, 5, 6]
+
+            def encode(self, prompt):
+                raise AssertionError("encode should not be called")
+
+        # The model name alone would select the V4 path; the template overrides it.
+        result = preprocess_chat_request(
+            {
+                "model": "deepseek-ai/DeepSeek-V4-Pro",
+                "messages": [{"role": "user", "content": "Hello"}],
+            },
+            tokenizer=TemplateTokenizer(),
+            tool_call_parser_name=None,
+            reasoning_parser_name=None,
+        )
+
+        assert result.prompt_token_ids == [4, 5, 6]
+
+    def test_deepseek_v4_normalizes_none_content_without_mutating_request(
+        self, monkeypatch
+    ):
+        """None content reaches the encoder as "" while the request keeps None."""
+        captured = {}
+        fake_module = types.ModuleType("sglang.srt.entrypoints.openai.encoding_dsv4")
+
+        def fake_encode_messages(messages, *, thinking_mode, reasoning_effort=None):
+            captured["messages"] = messages
+            return "<dsv4-prompt>"
+
+        fake_module.encode_messages = fake_encode_messages
+        monkeypatch.setitem(
+            sys.modules,
+            "sglang.srt.entrypoints.openai.encoding_dsv4",
+            fake_module,
+        )
+
+        class NoTemplateTokenizer:
+            chat_template = None
+
+            def encode(self, prompt):
+                return [7]
+
+        request = {
+            "model": "deepseek-ai/DeepSeek-V4-Pro",
+            "messages": [{"role": "assistant", "content": None}],
+        }
+
+        # Both parser names are None, so the V4 path is selected by model name.
+        result = preprocess_chat_request(
+            request,
+            tokenizer=NoTemplateTokenizer(),
+            tool_call_parser_name=None,
+            reasoning_parser_name=None,
+        )
+
+        assert result.prompt_token_ids == [7]
+        assert captured["messages"] == [{"role": "assistant", "content": ""}]
+        # The caller's request object must be left untouched.
+        assert request["messages"] == [{"role": "assistant", "content": None}]
+
+    def test_deepseek_v4_tool_choice_none_strips_encoder_tools(self, monkeypatch):
+        """tool_choice="none" with the exclude flag keeps tools away from the encoder."""
+        captured = {}
+        fake_module = types.ModuleType("sglang.srt.entrypoints.openai.encoding_dsv4")
+
+        def fake_encode_messages(messages, *, thinking_mode, reasoning_effort=None):
+            captured["messages"] = messages
+            return "<dsv4-prompt>"
+
+        fake_module.encode_messages = fake_encode_messages
+        monkeypatch.setitem(
+            sys.modules,
+            "sglang.srt.entrypoints.openai.encoding_dsv4",
+            fake_module,
+        )
+
+        class NoTemplateTokenizer:
+            chat_template = None
+
+            def encode(self, prompt):
+                return [8]
+
+        preprocess_chat_request(
+            {
+                "model": "deepseek-ai/DeepSeek-V4-Pro",
+                "messages": [{"role": "system", "content": "Stay terse."}],
+                "tools": [
+                    {
+                        "type": "function",
+                        "function": {"name": "get_weather", "parameters": {}},
+                    }
+                ],
+                "tool_choice": "none",
+            },
+            tokenizer=NoTemplateTokenizer(),
+            tool_call_parser_name=None,
+            reasoning_parser_name=None,
+            exclude_tools_when_tool_choice_none=True,
+        )
+
+        # The existing system message must not gain a "tools" entry.
+        assert "tools" not in captured["messages"][0]
+

 # ---------------------------------------------------------------------------
 # SglangStreamingPostProcessor: incremental detokenization

--- a/components/src/dynamo/sglang/_compat.py
+++ b/components/src/dynamo/sglang/_compat.py
@@ -15,13 +15,96 @@ removed. When the old version falls outside the support window, delete the
 fallback and any associated polyfills.
 """

+import inspect
 import ipaddress
 import logging
 import socket
+from functools import lru_cache
 from typing import Any

 logger = logging.getLogger(__name__)

+
+# ---------------------------------------------------------------------------
+# Top-level sglang exports: Engine, ServerArgs
+#
+# Some SGLang dev builds (including 0.5.x snapshots) do not re-export these
+# from sglang/__init__.py, while Dynamo historically uses `import sglang as sgl`
+# followed by `sgl.Engine(...)` throughout this backend.
+# ---------------------------------------------------------------------------
+def ensure_sglang_top_level_exports() -> None:
+    """Make ``sglang.Engine`` and ``sglang.ServerArgs`` available.
+
+    Each name is imported from its defining submodule and attached to the
+    top-level ``sglang`` package only when the package does not already expose
+    it, so an existing export is never overwritten.
+    """
+    import sglang as sgl
+
+    engine_missing = not hasattr(sgl, "Engine")
+    if engine_missing:
+        from sglang.srt.entrypoints.engine import Engine
+
+        setattr(sgl, "Engine", Engine)
+
+    server_args_missing = not hasattr(sgl, "ServerArgs")
+    if server_args_missing:
+        from sglang.srt.server_args import ServerArgs
+
+        setattr(sgl, "ServerArgs", ServerArgs)
+
+
+# Run once at import time so `sgl.Engine` / `sgl.ServerArgs` resolve for any
+# module that imports this compat layer.
+ensure_sglang_top_level_exports()
+
+
+@lru_cache(maxsize=32)
+def _get_async_generate_supported_kwarg_names(
+    async_generate: Any,
+) -> frozenset[str] | None:
+    """Return supported async_generate keyword names, or None for **kwargs.
+
+    ``None`` means the callable has a ``**kwargs`` catch-all and accepts any
+    keyword. Otherwise the result holds every parameter name that may be
+    passed by keyword. When the signature cannot be inspected, an empty set is
+    returned, which makes callers drop all optional kwargs.
+    """
+    try:
+        signature = inspect.signature(async_generate)
+    except (TypeError, ValueError):
+        logger.debug(
+            "Could not inspect SGLang Engine.async_generate signature; "
+            "dropping optional compatibility kwargs"
+        )
+        return frozenset()
+
+    parameters = signature.parameters.values()
+    if any(param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters):
+        return None
+
+    keyword_kinds = (
+        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        inspect.Parameter.KEYWORD_ONLY,
+    )
+    return frozenset(param.name for param in parameters if param.kind in keyword_kinds)
+
+
+def filter_supported_async_generate_kwargs(
+    engine: Any, kwargs: dict[str, Any]
+) -> dict[str, Any]:
+    """Return only async_generate kwargs accepted by this SGLang engine.
+
+    Callers pass just the kwargs they consider optional; this helper drops
+    the ones the installed engine's ``async_generate`` cannot take. When the
+    method has a ``**kwargs`` catch-all, ``kwargs`` is returned unchanged (the
+    same object); otherwise a new filtered dict is returned.
+    """
+    bound = engine.async_generate
+    # Key the cache on the underlying function so that every instance of the
+    # same engine class shares one cache entry.
+    inspect_target = getattr(bound, "__func__", bound)
+
+    lookup = _get_async_generate_supported_kwarg_names
+    try:
+        accepted = lookup(inspect_target)
+    except TypeError:
+        # Unhashable callables cannot be cached; inspect them directly.
+        accepted = lookup.__wrapped__(inspect_target)
+
+    if accepted is None:
+        return kwargs
+
+    filtered: dict[str, Any] = {}
+    for key, value in kwargs.items():
+        if key in accepted:
+            filtered[key] = value
+    return filtered
+
+
 # ---------------------------------------------------------------------------
 # Network utilities: NetworkAddress, get_local_ip_auto, get_zmq_socket
 #
@@ -201,6 +284,8 @@ def enable_disjoint_streaming_output(server_args: Any) -> None:
 __all__ = [
    "NetworkAddress",
    "enable_disjoint_streaming_output",
+    "ensure_sglang_top_level_exports",
+    "filter_supported_async_generate_kwargs",
    "get_local_ip_auto",
    "get_scheduler_info",
    "get_zmq_socket",

--- a/components/src/dynamo/sglang/publisher.py
+++ b/components/src/dynamo/sglang/publisher.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+from __future__ import annotations
+
 import asyncio
 import json
 import logging

--- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
@@ -13,6 +13,7 @@ from dynamo._core import Context
 from dynamo.common.constants import DisaggregationMode
 from dynamo.common.utils.engine_response import normalize_finish_reason
 from dynamo.common.utils.otel_tracing import build_trace_headers
+from dynamo.sglang._compat import filter_supported_async_generate_kwargs
 from dynamo.sglang.args import Config
 from dynamo.sglang.publisher import DynamoSglangPublisher
 from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
@@ -275,6 +276,9 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        return_routed_experts = getattr(
            self.config.server_args, "enable_return_routed_experts", False
        )
+        routed_experts_kwargs = filter_supported_async_generate_kwargs(
+            self.engine, {"return_routed_experts": return_routed_experts}
+        )
        priority = (request.get("routing") or {}).get("priority")
        logprob_kwargs = self._build_logprob_kwargs(request)

@@ -308,7 +312,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                **input_param,
                sampling_params=sampling_params,
                stream=True,
-                return_routed_experts=return_routed_experts,
+                **routed_experts_kwargs,
                bootstrap_host=bootstrap_info["bootstrap_host"],
                bootstrap_port=bootstrap_info["bootstrap_port"],
                bootstrap_room=bootstrap_info["bootstrap_room"],
@@ -346,7 +350,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                video_data=video_data,
                sampling_params=sampling_params,
                stream=True,
-                return_routed_experts=return_routed_experts,
+                **routed_experts_kwargs,
                external_trace_header=trace_header,
                rid=trace_id,
                data_parallel_rank=dp_rank,

--- a/components/src/dynamo/sglang/tests/test_sglang_unit.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_unit.py
@@ -11,6 +11,11 @@ import pytest
 import yaml
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST

+import dynamo.sglang._compat as sglang_compat
+from dynamo.sglang._compat import (
+    ensure_sglang_top_level_exports,
+    filter_supported_async_generate_kwargs,
+)
 from dynamo.sglang.args import parse_args
 from dynamo.sglang.health_check import (
    SglangDisaggHealthCheckPayload,
@@ -38,6 +43,99 @@ pytestmark = [
 mock_sglang_cli = make_cli_args_fixture("dynamo.sglang")


+def test_compat_restores_sglang_top_level_exports():
+    """Dynamo supports SGLang builds that omit top-level Engine/ServerArgs."""
+    import sglang as sgl
+    from sglang.srt.entrypoints.engine import Engine
+    from sglang.srt.server_args import ServerArgs
+
+    # Sentinel distinguishing "attribute absent" from any real attribute value.
+    missing = object()
+    original_engine = getattr(sgl, "Engine", missing)
+    original_server_args = getattr(sgl, "ServerArgs", missing)
+
+    try:
+        # Simulate an install flavor without the top-level re-exports.
+        if hasattr(sgl, "Engine"):
+            delattr(sgl, "Engine")
+        if hasattr(sgl, "ServerArgs"):
+            delattr(sgl, "ServerArgs")
+
+        ensure_sglang_top_level_exports()
+
+        assert sgl.Engine is Engine
+        assert sgl.ServerArgs is ServerArgs
+    finally:
+        # Put the package back exactly as it was found, including absence.
+        if original_engine is missing:
+            if hasattr(sgl, "Engine"):
+                delattr(sgl, "Engine")
+        else:
+            sgl.Engine = original_engine
+
+        if original_server_args is missing:
+            if hasattr(sgl, "ServerArgs"):
+                delattr(sgl, "ServerArgs")
+        else:
+            sgl.ServerArgs = original_server_args
+
+
+def test_compat_filters_async_generate_kwargs_for_older_engines():
+    """Kwargs absent from an explicit async_generate signature are dropped."""
+
+    class OldEngine:
+        async def async_generate(self, input_ids=None, sampling_params=None):
+            return None
+
+    requested = {"input_ids": [1, 2, 3], "return_routed_experts": True}
+    filtered = filter_supported_async_generate_kwargs(OldEngine(), requested)
+
+    assert filtered == {"input_ids": [1, 2, 3]}
+
+
+def test_compat_keeps_async_generate_kwargs_for_newer_engines():
+    """A kwarg named in the async_generate signature is passed through."""
+
+    class NewEngine:
+        async def async_generate(self, return_routed_experts=False):
+            return None
+
+    requested = {"return_routed_experts": True}
+    filtered = filter_supported_async_generate_kwargs(NewEngine(), requested)
+
+    assert filtered == requested
+
+
+def test_compat_keeps_async_generate_kwargs_for_variadic_engines():
+    """An async_generate taking **kwargs receives every kwarg unfiltered."""
+
+    class VariadicEngine:
+        async def async_generate(self, **kwargs):
+            return None
+
+    requested = {"return_routed_experts": True}
+    filtered = filter_supported_async_generate_kwargs(VariadicEngine(), requested)
+
+    assert filtered == requested
+
+
+def test_compat_caches_async_generate_signature_inspection(monkeypatch):
+    """Two engines of the same class trigger only one signature inspection."""
+
+    class CachedEngine:
+        async def async_generate(self, return_routed_experts=False):
+            return None
+
+    # Start from an empty cache so the call count below is deterministic.
+    sglang_compat._get_async_generate_supported_kwarg_names.cache_clear()
+    calls = 0
+    original_signature = sglang_compat.inspect.signature
+
+    def counting_signature(obj):
+        nonlocal calls
+        calls += 1
+        return original_signature(obj)
+
+    monkeypatch.setattr(sglang_compat.inspect, "signature", counting_signature)
+
+    kwargs = {"return_routed_experts": True}
+    # Two distinct instances: the second lookup must be served from the cache.
+    assert filter_supported_async_generate_kwargs(CachedEngine(), kwargs) == kwargs
+    assert filter_supported_async_generate_kwargs(CachedEngine(), kwargs) == kwargs
+    assert calls == 1
+
+    # Do not leave the test-local class cached for later tests.
+    sglang_compat._get_async_generate_supported_kwarg_names.cache_clear()
+
+
 @pytest.mark.asyncio
 async def test_custom_jinja_template_invalid_path(mock_sglang_cli):
    """Test that invalid file path raises FileNotFoundError."""

--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -1227,8 +1227,9 @@ impl OpenAIPreprocessor {
    /// For kimi_k25: disabled when chat_template_args contains "thinking": false.
    /// For nemotron_nano: disabled when chat_template_args contains "enable_thinking": false
    ///   or "force_nonempty_content": true.
-    /// For deepseek_r1: disabled when chat_template_args contains "thinking": false
-    ///   or "thinking_mode": "chat".
+    /// For deepseek_r1 / deepseek_v4: disabled when chat_template_args contains
+    ///   "thinking": false or "thinking_mode": "chat" — matches the V4 formatter's
+    ///   `resolve_thinking_mode` convention, so the parser and the prompt stay in sync.
    fn is_reasoning_disabled_by_request(
        reasoning_parser: Option<&str>,
        chat_template_args: Option<&std::collections::HashMap<String, serde_json::Value>>,
@@ -1257,7 +1258,8 @@ impl OpenAIPreprocessor {
                }
                false
            }
-            Some("deepseek_r1") => {
+            Some("deepseek_r1") | Some("deepseek_v4") | Some("deepseek-v4")
+            | Some("deepseekv4") => {
                if let Some(args) = chat_template_args {
                    if let Some(thinking) = args.get("thinking") {
                        return thinking == &serde_json::Value::Bool(false);
@@ -1829,6 +1831,50 @@ mod tests {
                false,
                "nemotron_nano + empty args → enabled",
            ),
+            // deepseek_v4 — same convention as deepseek_r1; verify all three aliases
+            // (deepseek_v4 / deepseek-v4 / deepseekv4) plus both signal keys.
+            (
+                Some("deepseek_v4"),
+                Some(&thinking_false),
+                true,
+                "deepseek_v4 + thinking=false → disabled",
+            ),
+            (
+                Some("deepseek_v4"),
+                Some(&thinking_true),
+                false,
+                "deepseek_v4 + thinking=true → enabled",
+            ),
+            (
+                Some("deepseek_v4"),
+                Some(&thinking_mode_chat),
+                true,
+                "deepseek_v4 + thinking_mode=chat → disabled",
+            ),
+            (
+                Some("deepseek_v4"),
+                Some(&thinking_mode_thinking),
+                false,
+                "deepseek_v4 + thinking_mode=thinking → enabled",
+            ),
+            (
+                Some("deepseek_v4"),
+                None,
+                false,
+                "deepseek_v4 + no args → enabled",
+            ),
+            (
+                Some("deepseek-v4"),
+                Some(&thinking_false),
+                true,
+                "deepseek-v4 (hyphen alias) + thinking=false → disabled",
+            ),
+            (
+                Some("deepseekv4"),
+                Some(&thinking_mode_chat),
+                true,
+                "deepseekv4 (joined alias) + thinking_mode=chat → disabled",
+            ),
        ];

        for (parser, args, expected, desc) in cases {

--- a/lib/llm/src/preprocessor/prompt.rs
+++ b/lib/llm/src/preprocessor/prompt.rs
@@ -26,6 +26,7 @@ use std::sync::Arc;
 use crate::preprocessor::media::MediaDecoder;

 pub mod deepseek_v32;
+pub mod deepseek_v4;
 mod template;

 pub use template::{ChatTemplate, ContextMixins};

--- a/lib/llm/src/preprocessor/prompt/deepseek_v4.rs
+++ b/lib/llm/src/preprocessor/prompt/deepseek_v4.rs
--- a/lib/llm/src/preprocessor/prompt/template.rs
+++ b/lib/llm/src/preprocessor/prompt/template.rs
@@ -19,12 +19,35 @@ use tokcfg::ChatTemplateValue;

 impl PromptFormatter {
    pub fn from_mdc(mdc: &ModelDeploymentCard) -> Result<PromptFormatter> {
-        // Special handling for DeepSeek-V3.2(-Speciale) which doesn't provide Jinja chat_template
-        let name_lower = mdc.display_name.to_lowercase();
-        if name_lower.contains("deepseek")
-            && name_lower.contains("v3.2")
-            && !name_lower.contains("exp")
-        {
+        // Special handling for DeepSeek models whose HF repos don't ship a Jinja chat_template.
+        //
+        // Prefer the authoritative `model_type` from config.json — it's set by
+        // the model author and survives any `--served-model-name` rename. Fall
+        // back to a tight substring match on `display_name` only when config.json
+        // is absent (e.g., tokenizer-only MDCs) or unreadable.
+        //
+        // An empty `model_type` string (rare but legal in the JSON) carries
+        // no signal — normalize it to `None` so the display-name fallback
+        // still runs instead of being silently suppressed.
+        let model_type_lower = mdc
+            .model_info
+            .as_ref()
+            .and_then(|info| info.get_model_info().ok())
+            .map(|info| info.model_type().to_lowercase())
+            .filter(|s| !s.is_empty());
+        let display_name_lower = mdc.display_name.to_lowercase();
+
+        if is_deepseek_v4(&model_type_lower, &display_name_lower) {
+            tracing::info!(
+                model_type = ?model_type_lower,
+                display_name = %mdc.display_name,
+                "Detected DeepSeek V4 model, using native Rust formatter",
+            );
+            return Ok(Self::OAI(Arc::new(
+                super::deepseek_v4::DeepSeekV4Formatter::new_thinking(),
+            )));
+        }
+        if is_deepseek_v3_2_non_exp(&model_type_lower, &display_name_lower) {
            tracing::info!("Detected DeepSeek V3.2 model (non-Exp), using native Rust formatter");
            return Ok(Self::OAI(Arc::new(
                super::deepseek_v32::DeepSeekV32Formatter::new_thinking(),
@@ -187,3 +210,158 @@ struct HfTokenizerConfigJsonFormatter {
 pub struct ContextMixins {
    context_mixins: HashSet<PromptContextMixin>,
 }
+
+/// Returns `true` when the DeepSeek-V4 native formatter should be used.
+///
+/// Two signals are consulted, in strict priority order:
+///
+/// 1. `model_type_lower` — the lowercased config.json `model_type`. DeepSeek-V4-Pro
+///    and V4-Flash both ship `"model_type": "deepseek_v4"`; the value is set by
+///    the model author, so it survives any `--served-model-name` rename. When a
+///    value is present it is authoritative: `deepseek_v4` selects V4, anything
+///    else rejects V4 and the display name is never looked at.
+/// 2. `display_name_lower` — consulted only when no `model_type` is available
+///    (tokenizer-only MDCs, unreadable config.json). It is tight-matched against
+///    `^deepseek(?:[-_.])?v4(?:[-_.]|$)` via `is_deepseek_v4_name`.
+fn is_deepseek_v4(model_type_lower: &Option<String>, display_name_lower: &str) -> bool {
+    if let Some(model_type) = model_type_lower.as_deref() {
+        // config.json gave a concrete answer — trust it either way.
+        return model_type == "deepseek_v4";
+    }
+    // No config signal at all: fall back to the display name.
+    is_deepseek_v4_name(display_name_lower)
+}
+
+/// Decides whether to activate the DeepSeek-V3.2 (non-Exp) native formatter.
+/// Same config-primary / name-fallback rule as V4.
+///
+/// * `model_type_lower` — lowercased config.json `model_type`, or `None` when
+///   config.json is absent or unreadable. Both `deepseek_v32` and
+///   `deepseek_v3_2` are accepted as the V3.2 family.
+///   NOTE(review): published DeepSeek-V3.2 checkpoints appear to ship
+///   `"model_type": "deepseek_v32"` — confirm against the model's config.json.
+///   Accepting only `deepseek_v3_2` would route such checkpoints into the
+///   `Some(_)` arm and disable the display-name detection that used to work.
+/// * `display_name_lower` — lowercased display name. A name containing `exp`
+///   always rejects, because V3.2-Exp is treated as a separate model family.
+///
+/// Any other concrete `model_type` is authoritative and returns `false`
+/// regardless of the display name.
+fn is_deepseek_v3_2_non_exp(model_type_lower: &Option<String>, display_name_lower: &str) -> bool {
+    // The "-exp" exclusion applies to both the config path and the name path.
+    let is_exp = display_name_lower.contains("exp");
+    match model_type_lower.as_deref() {
+        Some("deepseek_v32") | Some("deepseek_v3_2") => !is_exp,
+        Some(_) => false,
+        // No config signal: original display-name heuristic, evaluated only
+        // when it is actually needed.
+        None => {
+            !is_exp
+                && display_name_lower.contains("deepseek")
+                && display_name_lower.contains("v3.2")
+        }
+    }
+}
+
+/// Anchored display-name test for DeepSeek-V4, operating on an
+/// already-lowercased string. Behaves like the regex
+/// `^deepseek(?:[-_.])?v4(?:[-_.]|$)`, but is spelled with plain string
+/// operations so the `regex` crate is not needed.
+///
+/// Names that are deliberately rejected:
+/// - `deepseek-v3.2-v4-foo` (the `v3.2` variant is the real one)
+/// - `deepseek-v40` / `deepseek-v4pro` (no separator after `v4`)
+/// - `my-deepseek-v4` (prefix must be at the start)
+fn is_deepseek_v4_name(name_lower: &str) -> bool {
+    // Characters allowed between "deepseek" and "v4", and directly after "v4".
+    const SEPARATORS: [char; 3] = ['-', '_', '.'];
+
+    let Some(tail) = name_lower.strip_prefix("deepseek") else {
+        return false;
+    };
+    // At most one separator may sit between "deepseek" and "v4".
+    let tail = tail.strip_prefix(SEPARATORS).unwrap_or(tail);
+
+    match tail.strip_prefix("v4") {
+        // `v4` has to close the name or be followed by a separator; `v40`
+        // or `v4pro` belong to a different model family.
+        Some(suffix) => suffix.is_empty() || suffix.starts_with(SEPARATORS),
+        None => false,
+    }
+}
+
+#[cfg(test)]
+mod detection_tests {
+    use super::{is_deepseek_v3_2_non_exp, is_deepseek_v4, is_deepseek_v4_name};
+
+    /// Spellings the anchored display-name matcher must accept.
+    #[test]
+    fn v4_name_matches_canonical_variants() {
+        let accepted = [
+            "deepseek-v4",
+            "deepseek_v4",
+            "deepseek.v4",
+            "deepseekv4",
+            "deepseek-v4-pro",
+            "deepseek-v4-flash",
+            "deepseek-v4-flash-2507",
+            "deepseek-v4.1",
+            "deepseek_v4_thinking",
+        ];
+        for name in accepted {
+            assert!(is_deepseek_v4_name(name), "expected {name} to match V4");
+        }
+    }
+
+    /// Names the matcher must refuse, including composite names that used to
+    /// short-circuit to V4 ahead of the V3.2 branch.
+    #[test]
+    fn v4_name_rejects_non_v4() {
+        let rejected = [
+            "deepseek-v3.2-v4-foo",
+            "my-deepseek-v4",
+            "deepseek-v40",
+            "deepseek-v4pro",
+            "deepseekv40",
+            "deepseek-v3",
+            "deepseek-v3.2",
+            "deepseek-r1",
+            "qwen3-v4", // only deepseek-prefixed names qualify
+            "dsflash",
+            "",
+        ];
+        for name in rejected {
+            assert!(
+                !is_deepseek_v4_name(name),
+                "expected {name} to NOT match V4",
+            );
+        }
+    }
+
+    /// config.json `model_type` outranks the display name for V4 detection.
+    #[test]
+    fn v4_detection_prefers_config_model_type() {
+        let config_v4 = Some(String::from("deepseek_v4"));
+        let config_llama = Some(String::from("llama"));
+        // An empty `"model_type": ""` is normalized to None at the call site
+        // before is_deepseek_v4 runs, so it is represented as None here.
+        let normalized_empty: Option<String> = None;
+
+        // `model_type = "deepseek_v4"` wins no matter what the operator passes
+        // as --served-model-name.
+        for display in ["dsflash", "my-pet-model", "llama-3-8b", ""] {
+            assert!(
+                is_deepseek_v4(&config_v4, display),
+                "config says deepseek_v4, display {display:?} — expected V4",
+            );
+        }
+
+        // A concrete non-V4 model_type suppresses the display-name fallback:
+        // a `llama` model served as "deepseek-v4" is still not DeepSeek-V4.
+        for display in ["deepseek-v4", "deepseek-v4-flash", "anything"] {
+            assert!(
+                !is_deepseek_v4(&config_llama, display),
+                "config says llama, display {display:?} — expected NOT V4",
+            );
+        }
+
+        // Without a config signal the display name decides. The same holds
+        // for the normalized-empty case, which pins the call-site contract.
+        for (display, expected) in [("deepseek-v4-flash", true), ("dsflash", false)] {
+            assert_eq!(is_deepseek_v4(&None, display), expected);
+            assert_eq!(is_deepseek_v4(&normalized_empty, display), expected);
+        }
+    }
+
+    /// config.json `model_type` outranks the display name for V3.2 detection,
+    /// except that an "exp" display name always rejects.
+    #[test]
+    fn v3_2_detection_prefers_config_model_type() {
+        let config_v3_2 = Some(String::from("deepseek_v3_2"));
+        let config_other = Some(String::from("deepseek_v4"));
+        let no_config: Option<String> = None;
+
+        let cases = [
+            // config says deepseek_v3_2: any non-"exp" display name triggers.
+            (&config_v3_2, "whatever", true),
+            (&config_v3_2, "deepseek-v3.2", true),
+            // V3.2-Exp is a separate model family; suppressed even via config.
+            (&config_v3_2, "deepseek-v3.2-exp", false),
+            // Other config types lose regardless of display name.
+            (&config_other, "deepseek-v3.2", false),
+            // No config: the original display-name heuristic applies.
+            (&no_config, "deepseek-v3.2-pro", true),
+            (&no_config, "deepseek-v3.2-exp", false),
+            (&no_config, "deepseek-v4", false),
+        ];
+        for (model_type, display, expected) in cases {
+            assert_eq!(
+                is_deepseek_v3_2_non_exp(model_type, display),
+                expected,
+                "model_type {model_type:?}, display {display:?}",
+            );
+        }
+    }
+}
--- a/lib/llm/tests/data/deepseek-v4/test_input_1.json
+++ b/lib/llm/tests/data/deepseek-v4/test_input_1.json
+{"tools":[{"type":"function","function":{"name":"get_weather","description":"Get the weather for a specific location","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city name"},"unit":{"type":"string","enum":["celsius","fahrenheit"],"description":"Temperature unit"}},"required":["location"]}}},{"type":"function","function":{"name":"search","description":"Search the web for information","parameters":{"type":"object","properties":{"query":{"type":"string","description":"Search query"},"num_results":{"type":"integer","description":"Number of results to return"}},"required":["query"]}}}],"messages":[{"role":"system","content":"You are a helpful assistant."},{"role":"user","content":"What's the weather in Beijing?"},{"role":"assistant","reasoning_content":"The user wants to know the weather in Beijing. I should use the get_weather tool.","tool_calls":[{"id":"call_001","type":"function","function":{"name":"get_weather","arguments":"{\"location\": \"Beijing\", \"unit\": \"celsius\"}"}}]},{"role":"tool","tool_call_id":"call_001","content":"{\"temperature\": 22, \"condition\": \"sunny\", \"humidity\": 45}"},{"role":"assistant","reasoning_content":"Got the weather data. Let me format a nice response.","content":"The weather in Beijing is currently sunny with a temperature of 22°C and 45% humidity."}]}
--- a/lib/llm/tests/data/deepseek-v4/test_input_2.json
+++ b/lib/llm/tests/data/deepseek-v4/test_input_2.json
+[{"role":"system","content":"You are a helpful assistant."},{"role":"user","content":"Hello"},{"role":"assistant","reasoning_content":"The user said hello, I should greet back.","content":"Hi there! How can I help you?"},{"role":"user","content":"What is the capital of France?"},{"role":"assistant","reasoning_content":"The user asks about the capital of France. It is Paris.","content":"The capital of France is Paris."}]
--- a/lib/llm/tests/data/deepseek-v4/test_input_3.json
+++ b/lib/llm/tests/data/deepseek-v4/test_input_3.json
+[{"role":"system","content":"该助手为DeepSeek，由深度求索公司创造。"},{"role":"latest_reminder","content":"2026-02-21,星期六,广州,App,中文"},{"role":"developer","content":"小柴胡冲剂和布洛芬能一起吃吗？\n\nCITATION FORMAT: 【{cursor_id}†L{start_line_id}(-L{end_line_id})?】","tools":[{"type":"function","function":{"name":"search","description":"Web search. Split multiple queries with '||'.","parameters":{"type":"object","properties":{"queries":{"type":"string","description":"query1||query2"}},"required":["queries"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}}},{"type":"function","function":{"name":"open","description":"Batch open IDs (format 【{id}†...】) or URLs.","parameters":{"type":"object","properties":{"open_list":{"type":"array","items":{"type":"object","properties":{"id":{"description":"ID or URL","anyOf":[{"type":"integer"},{"type":"string"}],"default":-1},"cursor":{"type":"integer","description":"","default":-1},"loc":{"type":"integer","description":"Start line","default":-1},"num_lines":{"type":"integer","description":"","default":-1},"view_source":{"type":"boolean","description":"","default":false}},"additionalProperties":false},"description":""}},"required":["open_list"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}}},{"type":"function","function":{"name":"find","description":"Find exact text pattern in pages.","parameters":{"type":"object","properties":{"find_list":{"type":"array","items":{"type":"object","properties":{"pattern":{"type":"string","description":""},"cursor":{"type":"integer","description":"","default":-1}},"required":["pattern"],"additionalProperties":false},"description":""}},"required":["find_list"],"additionalProperties":false,"$schema":"http://json-schema.org/draft-07/schema#"}}}]},{"role":"assistant","content":"","reasoning_content":"用户想知道小柴胡冲剂和布洛芬能否一起服用。","tool_calls":[{"type":"function","function":{"name":"search","arguments":"{\"queries\": \"小柴胡冲剂 布洛芬 相互作用 
一起吃\"}"}}]},{"role":"tool","content":"[0]"},{"role":"assistant","content":"请及时就医。","reasoning_content":"现在开始组织回答。","tool_calls":[]}]
--- a/lib/llm/tests/data/deepseek-v4/test_input_4.json
+++ b/lib/llm/tests/data/deepseek-v4/test_input_4.json
+[{"role":"system","content":"该助手为DeepSeek-V3，由深度求索公司创造。\n今天是2025年10月17日，星期五。"},{"role":"latest_reminder","content":"2024-11-15,上海市,App,中文"},{"role":"user","content":"热海大滚锅是世界著名温泉吗"},{"role":"assistant","content":"关于热海大滚锅是否是世界著名温泉，可以这样总结：它在**中国乃至全球的地热奇观中占有重要地位**，但“世界著名”的称号更侧重于它作为**独特的地质现象和旅游景点**，而非普遍的温泉疗养体验。\n\n为了让你快速了解，我整理了一个简要的表格：\n\n| 维度 | 热海大滚锅的地位与特点 |\n| :--- | :--- |\n| **地质奇观** | **世界罕见**的高温地热系统，泉眼多、水温高、形态多样。 |\n| **国内声誉** | **中国三大地热区之一**，**国家5A级旅游景区**，自古闻名（徐霞客曾游历并记载）。 |\n| **国际知名度** | 在一些旅游平台被提及为“世界六大温泉”之一，但此说法流传不广，其国际声誉更多建立在地质独特性上。 |\n| **核心体验** | **观赏地热奇观**（如97℃沸腾的“大滚锅”）、**体验温泉煮鸡蛋**。 |\n\n### 💡 游玩攻略与温馨提示\n\n如果你计划前往热海大滚锅，这里有一些实用信息供你参考：\n\n- **门票与开放时间**：\n    - **门票**：景区门票约为**50元/人**。如果选择包含温泉沐浴的套餐，价格会更高，例如约**288元**。\n    - **开放时间**：景区一般**08:00-18:00**开放，但具体时间可能变动，建议提前核实。\n\n- **特色体验**：\n    - **温泉煮鸡蛋**：这几乎是必试项目。可以在景区门口购买用草绳串起的生鸡蛋（约5-8元/串），然后到“大滚锅”旁的指定区域蒸煮，几分钟便可熟食，趣味十足。\n    - **金汤足浴**：可以直接用从“大滚锅”流出的温泉水泡脚，缓解旅途疲劳。\n\n- **注意事项**：\n    - **安全第一**：“大滚锅”水温极高，务必遵守游览规则，在指定区域内观赏，切勿随意触碰泉水。\n    - **规划行程**：建议为热海景区预留**3-4小时**的游览时间。景区内步道不走回头路，出入口有观光车接送。\n\n希望这些信息能帮助你更好地了解热海大滚锅。如果你对腾冲的其他景点或者行程规划有更多疑问，我很乐意提供进一步的信息。","mask":1},{"role":"user","content":"世界著名温泉有哪些","task":"action"},{"role":"assistant","content":"Search"}]
--- a/lib/llm/tests/data/deepseek-v4/test_output_1.txt
+++ b/lib/llm/tests/data/deepseek-v4/test_output_1.txt
+<｜begin▁of▁sentence｜>You are a helpful assistant.
+
+## Tools
+
+You have access to a set of tools to help answer the user's question. You can invoke tools by writing a "<｜DSML｜tool_calls>" block like the following:
+
+<｜DSML｜tool_calls>
+<｜DSML｜invoke name="$TOOL_NAME">
+<｜DSML｜parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</｜DSML｜parameter>
+...
+</｜DSML｜invoke>
+<｜DSML｜invoke name="$TOOL_NAME2">
+...
+</｜DSML｜invoke>
+</｜DSML｜tool_calls>
+
+String parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.
+
+If thinking_mode is enabled (triggered by <think>), you MUST output your complete reasoning inside <think>...</think> BEFORE any tool calls or final response.
+
+Otherwise, output directly after </think> with tool calls or final response.
+
+### Available Tool Schemas
+
+{"name": "get_weather", "description": "Get the weather for a specific location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location"]}}
+{"name": "search", "description": "Search the web for information", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query"}, "num_results": {"type": "integer", "description": "Number of results to return"}}, "required": ["query"]}}
+
+You MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.
+<｜User｜>What's the weather in Beijing?<｜Assistant｜><think>The user wants to know the weather in Beijing. I should use the get_weather tool.</think>
+
+<｜DSML｜tool_calls>
+<｜DSML｜invoke name="get_weather">
+<｜DSML｜parameter name="location" string="true">Beijing</｜DSML｜parameter>
+<｜DSML｜parameter name="unit" string="true">celsius</｜DSML｜parameter>
+</｜DSML｜invoke>
+</｜DSML｜tool_calls><｜end▁of▁sentence｜><｜User｜><tool_result>{"temperature": 22, "condition": "sunny", "humidity": 45}</tool_result><｜Assistant｜><think>Got the weather data. Let me format a nice response.</think>The weather in Beijing is currently sunny with a temperature of 22°C and 45% humidity.<｜end▁of▁sentence｜>
\ No newline at end of file
--- a/lib/llm/tests/data/deepseek-v4/test_output_2.txt
+++ b/lib/llm/tests/data/deepseek-v4/test_output_2.txt
+<｜begin▁of▁sentence｜>You are a helpful assistant.<｜User｜>Hello<｜Assistant｜></think>Hi there! How can I help you?<｜end▁of▁sentence｜><｜User｜>What is the capital of France?<｜Assistant｜><think>The user asks about the capital of France. It is Paris.</think>The capital of France is Paris.<｜end▁of▁sentence｜>
\ No newline at end of file
--- a/lib/llm/tests/data/deepseek-v4/test_output_3.txt
+++ b/lib/llm/tests/data/deepseek-v4/test_output_3.txt
+<｜begin▁of▁sentence｜>该助手为DeepSeek，由深度求索公司创造。<｜latest_reminder｜>2026-02-21,星期六,广州,App,中文<｜User｜>小柴胡冲剂和布洛芬能一起吃吗？
+
+CITATION FORMAT: 【{cursor_id}†L{start_line_id}(-L{end_line_id})?】
+
+## Tools
+
+You have access to a set of tools to help answer the user's question. You can invoke tools by writing a "<｜DSML｜tool_calls>" block like the following:
+
+<｜DSML｜tool_calls>
+<｜DSML｜invoke name="$TOOL_NAME">
+<｜DSML｜parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</｜DSML｜parameter>
+...
+</｜DSML｜invoke>
+<｜DSML｜invoke name="$TOOL_NAME2">
+...
+</｜DSML｜invoke>
+</｜DSML｜tool_calls>
+
+String parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.
+
+If thinking_mode is enabled (triggered by <think>), you MUST output your complete reasoning inside <think>...</think> BEFORE any tool calls or final response.
+
+Otherwise, output directly after </think> with tool calls or final response.
+
+### Available Tool Schemas
+
+{"name": "search", "description": "Web search. Split multiple queries with '||'.", "parameters": {"type": "object", "properties": {"queries": {"type": "string", "description": "query1||query2"}}, "required": ["queries"], "additionalProperties": false, "$schema": "http://json-schema.org/draft-07/schema#"}}
+{"name": "open", "description": "Batch open IDs (format 【{id}†...】) or URLs.", "parameters": {"type": "object", "properties": {"open_list": {"type": "array", "items": {"type": "object", "properties": {"id": {"description": "ID or URL", "anyOf": [{"type": "integer"}, {"type": "string"}], "default": -1}, "cursor": {"type": "integer", "description": "", "default": -1}, "loc": {"type": "integer", "description": "Start line", "default": -1}, "num_lines": {"type": "integer", "description": "", "default": -1}, "view_source": {"type": "boolean", "description": "", "default": false}}, "additionalProperties": false}, "description": ""}}, "required": ["open_list"], "additionalProperties": false, "$schema": "http://json-schema.org/draft-07/schema#"}}
+{"name": "find", "description": "Find exact text pattern in pages.", "parameters": {"type": "object", "properties": {"find_list": {"type": "array", "items": {"type": "object", "properties": {"pattern": {"type": "string", "description": ""}, "cursor": {"type": "integer", "description": "", "default": -1}}, "required": ["pattern"], "additionalProperties": false}, "description": ""}}, "required": ["find_list"], "additionalProperties": false, "$schema": "http://json-schema.org/draft-07/schema#"}}
+
+You MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.
+<｜Assistant｜><think>用户想知道小柴胡冲剂和布洛芬能否一起服用。</think>
+
+<｜DSML｜tool_calls>
+<｜DSML｜invoke name="search">
+<｜DSML｜parameter name="queries" string="true">小柴胡冲剂 布洛芬 相互作用 一起吃</｜DSML｜parameter>
+</｜DSML｜invoke>
+</｜DSML｜tool_calls><｜end▁of▁sentence｜><｜User｜><tool_result>[0]</tool_result><｜Assistant｜><think>现在开始组织回答。</think>请及时就医。<｜end▁of▁sentence｜>
\ No newline at end of file
--- a/lib/llm/tests/data/deepseek-v4/test_output_4.txt
+++ b/lib/llm/tests/data/deepseek-v4/test_output_4.txt
+<｜begin▁of▁sentence｜>该助手为DeepSeek-V3，由深度求索公司创造。
+今天是2025年10月17日，星期五。<｜latest_reminder｜>2024-11-15,上海市,App,中文<｜User｜>热海大滚锅是世界著名温泉吗<｜Assistant｜></think>关于热海大滚锅是否是世界著名温泉，可以这样总结：它在**中国乃至全球的地热奇观中占有重要地位**，但“世界著名”的称号更侧重于它作为**独特的地质现象和旅游景点**，而非普遍的温泉疗养体验。
+
+为了让你快速了解，我整理了一个简要的表格：
+
+| 维度 | 热海大滚锅的地位与特点 |
+| :--- | :--- |
+| **地质奇观** | **世界罕见**的高温地热系统，泉眼多、水温高、形态多样。 |
+| **国内声誉** | **中国三大地热区之一**，**国家5A级旅游景区**，自古闻名（徐霞客曾游历并记载）。 |
+| **国际知名度** | 在一些旅游平台被提及为“世界六大温泉”之一，但此说法流传不广，其国际声誉更多建立在地质独特性上。 |
+| **核心体验** | **观赏地热奇观**（如97℃沸腾的“大滚锅”）、**体验温泉煮鸡蛋**。 |
+
+### 💡 游玩攻略与温馨提示
+
+如果你计划前往热海大滚锅，这里有一些实用信息供你参考：
+
+- **门票与开放时间**：
+    - **门票**：景区门票约为**50元/人**。如果选择包含温泉沐浴的套餐，价格会更高，例如约**288元**。
+    - **开放时间**：景区一般**08:00-18:00**开放，但具体时间可能变动，建议提前核实。
+
+- **特色体验**：
+    - **温泉煮鸡蛋**：这几乎是必试项目。可以在景区门口购买用草绳串起的生鸡蛋（约5-8元/串），然后到“大滚锅”旁的指定区域蒸煮，几分钟便可熟食，趣味十足。
+    - **金汤足浴**：可以直接用从“大滚锅”流出的温泉水泡脚，缓解旅途疲劳。
+
+- **注意事项**：
+    - **安全第一**：“大滚锅”水温极高，务必遵守游览规则，在指定区域内观赏，切勿随意触碰泉水。
+    - **规划行程**：建议为热海景区预留**3-4小时**的游览时间。景区内步道不走回头路，出入口有观光车接送。
+
+希望这些信息能帮助你更好地了解热海大滚锅。如果你对腾冲的其他景点或者行程规划有更多疑问，我很乐意提供进一步的信息。<｜end▁of▁sentence｜><｜User｜>世界著名温泉有哪些<｜Assistant｜></think><｜action｜>Search<｜end▁of▁sentence｜>
\ No newline at end of file
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_content_before_tool.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_content_before_tool.json
+{
+  "request_id": "deepseek-v4-content-before-tool-test",
+  "expected_output": {"normal_content": "Let me check the forecast for Tokyo right now.", "reasoning_content": "The user wants today's weather in Tokyo. I'll acknowledge the request, then call get_weather.", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"Tokyo\", \"unit\": \"celsius\"}"}}]},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"<think>The user wants today's weather in Tokyo. I'll acknowledge the request, then call get_weather.</think>","role":"assistant","reasoning_content":"The user wants today's weather in Tokyo. I'll acknowledge the request, then call get_weather."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"Let me check the forecast for Tokyo right now.","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜tool_calls>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"get_weather\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"location\" string=\"true\">Tokyo</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"unit\" string=\"true\">celsius</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜tool_calls>","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-content-before-tool","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"tool_calls"}]}}
+  ]
+}