feat: add SGLang chat processor for frontend pre/post processing (#6834)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

feat: add SGLang chat processor for frontend pre/post processing (#6834)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
51dfd760 · ishandhanani · GitHub · 63d7c01c · 51dfd760 · 51dfd760
Unverified Commit 51dfd760 authored Mar 09, 2026 by ishandhanani Committed by GitHub Mar 09, 2026
14 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -71,6 +71,7 @@ class FrontendConfig(KvRouterConfigBase):
    event_plane: str
    chat_processor: str
    enable_anthropic_api: bool
+    debug_perf: bool
    preprocess_workers: int
    def validate(self) -> None:
@@ -350,10 +351,25 @@ class FrontendArgGroup(ArgGroup):
            default="dynamo",
            dest="chat_processor",
            help=(
-                "[EXPERIMENTAL] When set to 'vllm', use local vllm for the pre and post "
+                "[EXPERIMENTAL] Chat pre/post processor backend. 'dynamo' uses the Rust "
-                "processor."
+                "preprocessor. 'vllm' uses local vLLM for pre and post processing. "
+                "'sglang' uses SGLang APIs for chat template rendering, tool call "
+                "parsing, and reasoning parsing."
+            ),
+            choices=["dynamo", "vllm", "sglang"],
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--dyn-debug-perf",
+            env_var="DYN_DEBUG_PERF",
+            default=False,
+            dest="debug_perf",
+            help=(
+                "[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
+                "Logs per-function timing, request concurrency, and hot-path section durations. "
+                "Supported with '--dyn-chat-processor vllm' and '--dyn-chat-processor sglang'."
            ),
-            choices=["dynamo", "vllm"],
        )
        add_argument(
@@ -366,7 +382,8 @@ class FrontendArgGroup(ArgGroup):
                "[EXPERIMENTAL] Number of worker processes for preprocessing and output processing. "
                "When > 0, offloads CPU-bound work (tokenization, template rendering, "
                "detokenization) to a ProcessPoolExecutor with N workers, each with its "
-                "own GIL. 0 (default) keeps all processing on the main event loop. '--dyn-chat-processor vllm' only."
+                "own GIL. 0 (default) keeps all processing on the main event loop. "
+                "Supported with '--dyn-chat-processor vllm' and '--dyn-chat-processor sglang'."
            ),
            arg_type=int,
        )
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -63,11 +63,36 @@ def setup_engine_factory(
    return EngineFactory(runtime, router_config, config, vllm_flags)
-def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
+def setup_sglang_engine_factory(
+    runtime: DistributedRuntime,
+    router_config: RouterConfig,
+    config: FrontendConfig,
+    sglang_flags: Optional[Namespace] = None,
+):
+    """Build the ``SglangEngineFactory`` used with the SGLang pre/post processor.
+    Args:
+        runtime: Distributed runtime handed through to the factory.
+        router_config: Router configuration handed through to the factory.
+        config: Frontend configuration; ``config.debug_perf`` is forwarded.
+        sglang_flags: Parsed SGLang-specific flags, or ``None``.  Missing
+            attributes (or a ``None`` namespace) yield ``None`` parser names.
+    Returns:
+        The constructed ``SglangEngineFactory``.
+    """
+    # Imported inside the function -- presumably so SGLang is only required
+    # when this processor is actually selected (TODO confirm).
+    from .sglang_processor import SglangEngineFactory
+    parser_names = {
+        "tool_call_parser_name": getattr(sglang_flags, "tool_call_parser", None),
+        "reasoning_parser_name": getattr(sglang_flags, "reasoning_parser", None),
+    }
+    return SglangEngineFactory(
+        runtime,
+        router_config,
+        config,
+        debug_perf=config.debug_perf,
+        **parser_names,
+    )
+def parse_args() -> tuple[FrontendConfig, Optional[Namespace], Optional[Namespace]]:
    """Parse command-line arguments for the Dynamo frontend.
    Returns:
-        FrontendConfig: Parsed configuration object.
+        Tuple of (FrontendConfig, vllm_flags, sglang_flags).
    """
    parser = argparse.ArgumentParser(
@@ -83,6 +108,7 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
    config.validate()
    vllm_flags = None
+    sglang_flags = None
    # parse extra vllm flags using vllm native parser.
    if config.chat_processor == "vllm":
@@ -108,11 +134,19 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
        vllm_parser = AsyncEngineArgs.add_cli_args(vllm_parser)
        # the result is returned as Namespace object rather than AsyncEngineArgs object to avoid import error for non-vllm users.
        vllm_flags = vllm_parser.parse_args(unknown)
+    elif config.chat_processor == "sglang":
+        sglang_parser = argparse.ArgumentParser(add_help=False)
+        sglang_parser.add_argument("--tool-call-parser", default=None)
+        sglang_parser.add_argument("--reasoning-parser", default=None)
+        sglang_flags, remaining = sglang_parser.parse_known_args(unknown)
+        if remaining:
+            logger.error(f"Unknown arguments specified: {remaining}")
+            sys.exit(1)
    else:
        if unknown:
            logger.error(f"Unknown arguments specified: {unknown}")
            sys.exit(1)
-    return config, vllm_flags
+    return config, vllm_flags, sglang_flags
 async def async_main():
@@ -128,7 +162,7 @@ async def async_main():
    # bind that port before the worker, causing port conflicts and/or scraping the
    # wrong metrics endpoint.
    os.environ.pop("DYN_SYSTEM_PORT", None)
-    config, vllm_flags = parse_args()
+    config, vllm_flags, sglang_flags = parse_args()
    dump_config(config.dump_config_to, config)
    os.environ["DYN_EVENT_PLANE"] = config.event_plane
    logger.info(
@@ -233,6 +267,11 @@ async def async_main():
            runtime, router_config, config, vllm_flags
        ).chat_engine_factory
        kwargs["chat_engine_factory"] = chat_engine_factory
+    elif config.chat_processor == "sglang":
+        chat_engine_factory = setup_sglang_engine_factory(
+            runtime, router_config, config, sglang_flags
+        ).chat_engine_factory
+        kwargs["chat_engine_factory"] = chat_engine_factory
    e = EntrypointArgs(EngineType.Dynamic, **kwargs)
    engine = await make_engine(runtime, e)

--- a/components/src/dynamo/frontend/sglang_prepost.py
+++ b/components/src/dynamo/frontend/sglang_prepost.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+from sglang.srt.entrypoints.openai.protocol import Function as SglangFunction
+from sglang.srt.entrypoints.openai.protocol import Tool as SglangTool
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from .utils import random_call_id
+@dataclass
+class SglangPreprocessResult:
+    """Result of SGLang preprocessing."""
+    # Token ids returned by the tokenizer's chat template for the request.
+    prompt_token_ids: list[int]
+    # Per-request tool call parser, or None when no tool parsing applies.
+    tool_call_parser: FunctionCallParser | None
+    # Per-request reasoning parser, or None when none is configured.
+    reasoning_parser: ReasoningParser | None
+    # The request dict that was preprocessed, passed through as-is.
+    request: dict[str, Any]
+def convert_tools(tools: list[dict[str, Any]] | None) -> list[SglangTool] | None:
+    """Convert OpenAI tool dicts to SGLang Tool objects.
+    Args:
+        tools: Tool definitions as plain dicts, or ``None``.
+    Returns:
+        A list of ``SglangTool`` objects, or ``None`` when ``tools`` is
+        ``None`` or empty.
+    """
+    if not tools:
+        return None
+    sglang_tools = []
+    for tool in tools:
+        # ``or {}`` covers both a missing key and an explicit
+        # ``"function": null``, which would otherwise raise AttributeError.
+        func = tool.get("function") or {}
+        sglang_tools.append(
+            SglangTool(
+                type=tool.get("type", "function"),
+                function=SglangFunction(
+                    name=func.get("name", ""),
+                    description=func.get("description"),
+                    parameters=func.get("parameters"),
+                    strict=func.get("strict", False),
+                ),
+            )
+        )
+    return sglang_tools
+def _materialize_messages(messages: list[Any]) -> list[dict[str, Any]]:
+    """Return ``messages`` as plain dicts suitable for apply_chat_template.
+    Objects exposing ``model_dump`` are dumped (keeping ``None`` fields),
+    dicts are passed through unchanged, and anything else goes through
+    ``dict(...)``.
+    """
+    def _as_dict(message: Any) -> dict[str, Any]:
+        if hasattr(message, "model_dump"):
+            return message.model_dump(exclude_none=False)
+        return message if isinstance(message, dict) else dict(message)
+    return [_as_dict(message) for message in messages]
+def create_parsers(
+    request: dict[str, Any],
+    *,
+    tool_call_parser_name: str | None,
+    reasoning_parser_name: str | None,
+    sglang_tools: list[SglangTool] | None = None,
+) -> tuple[FunctionCallParser | None, ReasoningParser | None]:
+    """Build the per-request tool call and reasoning parsers.
+    Used by the single-process preprocessing path and by the pool path
+    (which must recreate non-picklable parsers in the main process).
+    Args:
+        request: Chat request dict; ``tools`` and ``tool_choice`` are read.
+        tool_call_parser_name: SGLang tool call parser name, or ``None``.
+        reasoning_parser_name: SGLang reasoning parser name, or ``None``.
+        sglang_tools: Already-converted tools to reuse; when ``None`` they
+            are converted from the request's ``tools`` field.
+    Returns:
+        ``(tool_call_parser, reasoning_parser)``; either may be ``None``.
+    """
+    tools = convert_tools(request.get("tools")) if sglang_tools is None else sglang_tools
+    # A tool parser is only built when one is configured, the request carries
+    # tools, and the caller did not opt out with tool_choice="none".
+    wants_tool_parsing = bool(
+        tool_call_parser_name
+        and tools
+        and request.get("tool_choice", "auto") != "none"
+    )
+    tool_call_parser = (
+        FunctionCallParser(tools=tools, tool_call_parser=tool_call_parser_name)
+        if wants_tool_parsing
+        else None
+    )
+    reasoning_parser = (
+        ReasoningParser(model_type=reasoning_parser_name, stream_reasoning=True)
+        if reasoning_parser_name
+        else None
+    )
+    return tool_call_parser, reasoning_parser
+def preprocess_chat_request(
+    request: dict[str, Any],
+    *,
+    tokenizer,
+    tool_call_parser_name: str | None,
+    reasoning_parser_name: str | None,
+) -> SglangPreprocessResult:
+    """Render and tokenize a chat request and build its parsers.
+    Synchronous -- suitable for both main-process and worker-process execution.
+    Args:
+        request: Chat request dict; ``messages`` and ``tools`` are read.
+        tokenizer: Object exposing ``apply_chat_template``.
+        tool_call_parser_name: SGLang tool call parser name, or ``None``.
+        reasoning_parser_name: SGLang reasoning parser name, or ``None``.
+    Returns:
+        ``SglangPreprocessResult`` with the prompt token ids, both parsers
+        and the original request.
+    """
+    chat_messages = _materialize_messages(request.get("messages", []))
+    # Tools are converted once and handed to create_parsers below.
+    tools = convert_tools(request.get("tools"))
+    # One apply_chat_template call does both rendering and tokenization.
+    render_kwargs: dict[str, Any] = dict(add_generation_prompt=True, tokenize=True)
+    if tools:
+        render_kwargs["tools"] = [tool.model_dump() for tool in tools]
+    token_ids = tokenizer.apply_chat_template(chat_messages, **render_kwargs)
+    prompt_token_ids = token_ids if isinstance(token_ids, list) else list(token_ids)
+    tool_parser, reasoning = create_parsers(
+        request,
+        tool_call_parser_name=tool_call_parser_name,
+        reasoning_parser_name=reasoning_parser_name,
+        sglang_tools=tools,
+    )
+    return SglangPreprocessResult(
+        prompt_token_ids=prompt_token_ids,
+        tool_call_parser=tool_parser,
+        reasoning_parser=reasoning,
+        request=request,
+    )
+def _random_call_id() -> str:
+    """Return a new tool call id by delegating to ``utils.random_call_id``."""
+    return random_call_id()
+class SglangStreamingPostProcessor:
+    """Streaming post-processor using SGLang parsers and HF tokenizer detokenization.
+    Handles:
+    - Incremental detokenization via sliding-window decode (6-token lookback)
+    - Holding back chunks that end in the middle of a multi-byte character
+    - Reasoning content extraction via SGLang ReasoningParser
+    - Tool call parsing via SGLang FunctionCallParser (parameter deltas)
+    """
+    # Lookback window size for incremental detokenization.  UTF-8 characters
+    # can span up to 4 bytes, each potentially its own token.  A lookback of
+    # 6 covers the worst case (4-token char) plus margin for BPE merges that
+    # cross the old/new boundary.
+    LOOKBACK = 6
+    # Unicode replacement character.  A decoded window ending with it is
+    # treated as "the last character is not complete yet".
+    _REPLACEMENT_CHAR = "\ufffd"
+    def __init__(
+        self,
+        *,
+        tokenizer,
+        tool_call_parser: FunctionCallParser | None,
+        reasoning_parser: ReasoningParser | None,
+    ) -> None:
+        """Store the tokenizer and per-request parsers and reset stream state.
+        Args:
+            tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
+            tool_call_parser: Tool call parser for this request, or ``None``.
+            reasoning_parser: Reasoning parser for this request, or ``None``.
+        """
+        self.tokenizer = tokenizer
+        self.tool_call_parser = tool_call_parser
+        self.reasoning_parser = reasoning_parser
+        # With no parsers, process_output can return decoded text directly.
+        self._fast_plain_text = tool_call_parser is None and reasoning_parser is None
+        self._all_token_ids: list[int] = []
+        # Number of trailing tokens in _all_token_ids whose text has been
+        # held back because the decoded window ended mid-character.
+        self._pending_count = 0
+        # Tool call accumulation.  SGLang's streaming parser returns
+        # deltas (name in one chunk, argument fragments across subsequent
+        # chunks).  However, when the complete tool-call JSON arrives in a
+        # single chunk the parser emits the name but never streams
+        # arguments (a chunking-sensitivity issue in the base detector).
+        # We accumulate names + arg fragments from streaming deltas and,
+        # on finish, fall back to parse_non_stream on the detector buffer
+        # for any tool call whose arguments are still missing.
+        self._tool_call_ids: dict[int, str] = {}  # tool_index -> call_id
+        self._tool_call_names: dict[int, str] = {}  # tool_index -> name
+        self._tool_call_args: dict[int, list[str]] = {}  # tool_index -> arg chunks
+    def _incremental_decode(
+        self, new_token_ids: list[int], *, flush: bool = False
+    ) -> str:
+        """Decode new tokens with lookback window for multi-byte char boundaries.
+        Re-decodes a small window of already-emitted tokens alongside the
+        not-yet-emitted ones and returns only the text past the window's
+        already-emitted prefix.
+        If the decoded window ends with U+FFFD the chunk stopped in the middle
+        of a character: nothing is emitted and the tokens are kept pending so
+        the next call can decode the completed character.  Emitting right away
+        would send a stray U+FFFD and then lose the real character, because
+        the lookback prefix would itself decode to U+FFFD on the next call.
+        The token history is trimmed once it exceeds ``LOOKBACK * 16`` entries,
+        keeping only the lookback window plus the not-yet-emitted tokens.
+        Args:
+            new_token_ids: Token ids produced since the previous call.
+            flush: When true, emit pending text even if it ends with U+FFFD
+                (used at end of stream, when no further tokens will arrive).
+        Returns:
+            The newly decoded text, possibly empty.
+        """
+        self._all_token_ids.extend(new_token_ids)
+        # Tokens whose text has not been emitted: new ones plus held-back ones.
+        unemitted = self._pending_count + len(new_token_ids)
+        # Trim to avoid unbounded growth -- only the tail matters for decoding
+        if len(self._all_token_ids) > self.LOOKBACK * 16:
+            self._all_token_ids = self._all_token_ids[
+                -(self.LOOKBACK + unemitted) :
+            ]
+        prev_count = len(self._all_token_ids) - unemitted
+        start = max(0, prev_count - self.LOOKBACK)
+        # Decode lookback-only prefix (text that was already emitted)
+        prefix_tokens = self._all_token_ids[start:prev_count]
+        prefix_text = (
+            self.tokenizer.decode(prefix_tokens, skip_special_tokens=True)
+            if prefix_tokens
+            else ""
+        )
+        # Decode lookback + unemitted tokens together
+        window_tokens = self._all_token_ids[start:]
+        window_text = self.tokenizer.decode(window_tokens, skip_special_tokens=True)
+        if not flush and window_text.endswith(self._REPLACEMENT_CHAR):
+            self._pending_count = unemitted
+            return ""
+        self._pending_count = 0
+        return window_text[len(prefix_text) :]
+    def _accumulate_tool_calls(self, calls, *, replace_args: bool = False) -> None:
+        """Record ids, names and argument text for parsed tool call items.
+        Args:
+            calls: Items exposing ``tool_index``, ``name`` and ``parameters``.
+            replace_args: When true, ``parameters`` replaces the fragments
+                collected so far instead of being appended to them.
+        """
+        for tc in calls:
+            idx = tc.tool_index
+            if idx not in self._tool_call_ids:
+                self._tool_call_ids[idx] = _random_call_id()
+            if tc.name:
+                self._tool_call_names[idx] = tc.name
+            if tc.parameters:
+                if replace_args:
+                    self._tool_call_args[idx] = [tc.parameters]
+                else:
+                    self._tool_call_args.setdefault(idx, []).append(tc.parameters)
+    def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | None:
+        """Process a single engine response chunk into an OpenAI SSE choice dict.
+        Args:
+            engine_response: Dict with ``token_ids`` and optional ``finish_reason``.
+        Returns:
+            OpenAI choice dict or ``None`` if nothing to emit yet.
+        """
+        raw_ids = engine_response.get("token_ids")
+        token_ids = raw_ids if isinstance(raw_ids, list) else list(raw_ids or [])
+        finish_reason = engine_response.get("finish_reason")
+        # Decode when there are new tokens, or when the stream is finishing
+        # while text is still being held back.
+        delta_text = ""
+        if token_ids or (finish_reason and self._pending_count):
+            delta_text = self._incremental_decode(
+                token_ids, flush=bool(finish_reason)
+            )
+        if self._fast_plain_text:
+            if delta_text:
+                return {
+                    "index": 0,
+                    "delta": {"role": "assistant", "content": delta_text},
+                    "finish_reason": finish_reason,
+                    "logprobs": None,
+                }
+            elif finish_reason:
+                return {
+                    "index": 0,
+                    "delta": {},
+                    "finish_reason": finish_reason,
+                    "logprobs": None,
+                }
+            return None
+        # -- Reasoning parsing --
+        reasoning_text = None
+        normal_text = delta_text
+        if self.reasoning_parser and delta_text:
+            r_text, n_text = self.reasoning_parser.parse_stream_chunk(delta_text)
+            reasoning_text = r_text or None
+            normal_text = n_text or ""
+        # -- Tool call parsing (accumulate deltas) --
+        content_text = normal_text
+        if self.tool_call_parser and normal_text:
+            parsed_text, tool_calls = self.tool_call_parser.parse_stream_chunk(
+                normal_text
+            )
+            content_text = parsed_text
+            self._accumulate_tool_calls(tool_calls)
+        # -- Assemble delta --
+        delta: dict[str, Any] = {"role": "assistant"}
+        has_content = False
+        if content_text:
+            delta["content"] = content_text
+            has_content = True
+        if reasoning_text:
+            delta["reasoning_content"] = reasoning_text
+            has_content = True
+        # Emit complete tool calls on finish.  For any tool call whose
+        # arguments are still empty (chunking-sensitivity issue), fall
+        # back to parse_non_stream on the detector's buffer.
+        if finish_reason and self._tool_call_names:
+            missing_args = any(
+                idx not in self._tool_call_args for idx in self._tool_call_names
+            )
+            if missing_args:
+                buffer = getattr(self.tool_call_parser.detector, "_buffer", "")
+                if buffer:
+                    _, final_calls = self.tool_call_parser.parse_non_stream(buffer)
+                    self._accumulate_tool_calls(final_calls, replace_args=True)
+            tool_calls_out: list[dict[str, Any]] = [
+                {
+                    "index": idx,
+                    "id": self._tool_call_ids[idx],
+                    "type": "function",
+                    "function": {
+                        "name": self._tool_call_names[idx],
+                        "arguments": "".join(self._tool_call_args.get(idx, [])),
+                    },
+                }
+                for idx in sorted(self._tool_call_names)
+            ]
+            delta["tool_calls"] = tool_calls_out
+            has_content = True
+        if has_content or finish_reason:
+            return {
+                "index": 0,
+                "delta": delta if has_content else {},
+                "finish_reason": finish_reason,
+                "logprobs": None,
+            }
+        return None
--- a/components/src/dynamo/frontend/sglang_processor.py
+++ b/components/src/dynamo/frontend/sglang_processor.py
--- a/components/src/dynamo/frontend/tests/test_sglang_processor_api.py
+++ b/components/src/dynamo/frontend/tests/test_sglang_processor_api.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Conformance tests for the SGLang API surface used by the sglang processor.
+These tests lock down the SGLang interfaces we depend on so that SGLang
+upgrades that break our integration surface are caught immediately.
+"""
+import inspect
+import pickle
+# ---------------------------------------------------------------------------
+# Import tests -- verify all required modules and symbols exist
+# ---------------------------------------------------------------------------
+def test_get_tokenizer_importable():
+    """``get_tokenizer`` must stay importable from its current SGLang module."""
+    from sglang.srt.utils.hf_transformers_utils import get_tokenizer
+    assert callable(get_tokenizer)
+def test_function_call_parser_importable():
+    """``FunctionCallParser`` must stay importable from its current SGLang module."""
+    from sglang.srt.function_call.function_call_parser import FunctionCallParser
+    assert callable(FunctionCallParser)
+def test_tool_call_item_importable():
+    """``ToolCallItem`` must stay importable from ``function_call.core_types``."""
+    from sglang.srt.function_call.core_types import ToolCallItem
+    assert callable(ToolCallItem)
+def test_reasoning_parser_importable():
+    """``ReasoningParser`` must stay importable from its current SGLang module."""
+    from sglang.srt.parser.reasoning_parser import ReasoningParser
+    assert callable(ReasoningParser)
+def test_sglang_tool_importable():
+    """``Tool`` and ``Function`` must stay importable from the OpenAI protocol module."""
+    from sglang.srt.entrypoints.openai.protocol import Function, Tool
+    assert callable(Tool)
+    assert callable(Function)
+# ---------------------------------------------------------------------------
+# get_tokenizer signature
+# ---------------------------------------------------------------------------
+def test_get_tokenizer_accepts_tokenizer_mode():
+    """Verify ``get_tokenizer`` takes a leading positional argument and ``tokenizer_mode``."""
+    from sglang.srt.utils.hf_transformers_utils import get_tokenizer
+    params = inspect.signature(get_tokenizer).parameters
+    # The tokenizer name is passed positionally, so check that the first
+    # parameter exists and can be bound positionally.  Comparing its name
+    # against "" would always pass and so would guard nothing.
+    first = next(iter(params.values()), None)
+    assert first is not None
+    assert first.kind in (
+        inspect.Parameter.POSITIONAL_ONLY,
+        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+    )
+    assert "tokenizer_mode" in params
+# ---------------------------------------------------------------------------
+# FunctionCallParser
+# ---------------------------------------------------------------------------
+def test_function_call_parser_init():
+    """FunctionCallParser must be constructible from ``tools`` and ``tool_call_parser``."""
+    from sglang.srt.entrypoints.openai.protocol import Function, Tool
+    from sglang.srt.function_call.function_call_parser import FunctionCallParser
+    weather_fn = Function(
+        name="get_weather",
+        description="Get weather for a city",
+        parameters={
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+        },
+    )
+    weather_tool = Tool(type="function", function=weather_fn)
+    assert (
+        FunctionCallParser(tools=[weather_tool], tool_call_parser="hermes")
+        is not None
+    )
+def test_function_call_parser_enum_keys():
+    """The parser names we rely on must still be accepted by FunctionCallParser."""
+    from sglang.srt.entrypoints.openai.protocol import Function, Tool
+    from sglang.srt.function_call.function_call_parser import FunctionCallParser
+    dummy_fn = Function(
+        name="f",
+        description="d",
+        parameters={"type": "object", "properties": {}},
+    )
+    tools = [Tool(type="function", function=dummy_fn)]
+    # These parser names must remain available
+    required_names = ("hermes", "llama3", "qwen25")
+    for parser_name in required_names:
+        assert (
+            FunctionCallParser(tools=tools, tool_call_parser=parser_name)
+            is not None
+        )
+def test_parse_stream_chunk_signature():
+    """parse_stream_chunk must return a ``(str, list)`` 2-tuple."""
+    from sglang.srt.entrypoints.openai.protocol import Function, Tool
+    from sglang.srt.function_call.function_call_parser import FunctionCallParser
+    dummy_fn = Function(
+        name="f",
+        description="d",
+        parameters={"type": "object", "properties": {}},
+    )
+    parser = FunctionCallParser(
+        tools=[Tool(type="function", function=dummy_fn)],
+        tool_call_parser="hermes",
+    )
+    outcome = parser.parse_stream_chunk("Hello world")
+    assert isinstance(outcome, tuple)
+    assert len(outcome) == 2
+    text_part, call_items = outcome
+    assert isinstance(text_part, str)
+    assert isinstance(call_items, list)
+def test_tool_call_item_fields():
+    """ToolCallItem must expose ``tool_index``, ``name`` and ``parameters``."""
+    from sglang.srt.function_call.core_types import ToolCallItem
+    expected = {"tool_index": 0, "name": "test", "parameters": '{"x": 1}'}
+    item = ToolCallItem(**expected)
+    for field_name, value in expected.items():
+        assert getattr(item, field_name) == value
+# ---------------------------------------------------------------------------
+# ReasoningParser
+# ---------------------------------------------------------------------------
+def test_reasoning_parser_init():
+    """ReasoningParser must be constructible from ``model_type`` and ``stream_reasoning``."""
+    from sglang.srt.parser.reasoning_parser import ReasoningParser
+    assert ReasoningParser(model_type="deepseek-r1", stream_reasoning=True) is not None
+def test_reasoning_parser_detector_map():
+    """The reasoning detector names we rely on must still be accepted."""
+    from sglang.srt.parser.reasoning_parser import ReasoningParser
+    required_models = ("deepseek-r1", "qwen3")
+    for model_type in required_models:
+        assert ReasoningParser(model_type=model_type, stream_reasoning=True) is not None
+def test_reasoning_parser_parse_stream_chunk():
+    """ReasoningParser.parse_stream_chunk must return a 2-tuple."""
+    from sglang.srt.parser.reasoning_parser import ReasoningParser
+    outcome = ReasoningParser(
+        model_type="deepseek-r1", stream_reasoning=True
+    ).parse_stream_chunk("Hello")
+    assert isinstance(outcome, tuple)
+    assert len(outcome) == 2
+# ---------------------------------------------------------------------------
+# StreamingParseResult (function call variant)
+# ---------------------------------------------------------------------------
+def test_streaming_parse_result_fields():
+    """Function-call StreamingParseResult must expose ``normal_text`` and ``calls``."""
+    from sglang.srt.function_call.core_types import StreamingParseResult
+    parsed = StreamingParseResult(normal_text="hello", calls=[])
+    assert (parsed.normal_text, parsed.calls) == ("hello", [])
+# ---------------------------------------------------------------------------
+# Tool / Function protocol models
+# ---------------------------------------------------------------------------
+def test_sglang_tool_model_dump():
+    """Tool.model_dump() must yield the nested dict shape passed to chat templates."""
+    from sglang.srt.entrypoints.openai.protocol import Function, Tool
+    search_fn = Function(
+        name="search",
+        description="Search the web",
+        parameters={"type": "object", "properties": {"q": {"type": "string"}}},
+    )
+    dumped = Tool(type="function", function=search_fn).model_dump()
+    assert dumped["type"] == "function"
+    dumped_fn = dumped["function"]
+    assert dumped_fn["name"] == "search"
+    assert "properties" in dumped_fn["parameters"]
+# ---------------------------------------------------------------------------
+# Picklability (required for ProcessPoolExecutor worker results)
+# ---------------------------------------------------------------------------
+def test_preprocess_result_picklability():
+    """SglangPreprocessWorkerResult must survive a pickle round-trip unchanged."""
+    from dynamo.frontend.sglang_processor import SglangPreprocessWorkerResult
+    token_ids = [1, 2, 3]
+    dynamo_preproc = {
+        "model": "test",
+        "token_ids": [1, 2, 3],
+        "stop_conditions": {},
+        "sampling_options": {},
+        "output_options": {},
+        "eos_token_ids": [],
+        "annotations": [],
+    }
+    original = SglangPreprocessWorkerResult(
+        prompt_token_ids=token_ids,
+        dynamo_preproc=dynamo_preproc,
+        request={"model": "test", "messages": []},
+    )
+    restored = pickle.loads(pickle.dumps(original))
+    for attr in ("prompt_token_ids", "dynamo_preproc", "request"):
+        assert getattr(restored, attr) == getattr(original, attr)
--- a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py
+++ b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py
--- a/components/src/dynamo/frontend/tests/test_sglang_tool_calls.py
+++ b/components/src/dynamo/frontend/tests/test_sglang_tool_calls.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Tests for tool call parsing in SglangStreamingPostProcessor.
+Covers the interaction between SGLang's FunctionCallParser, ReasoningParser,
+and our post-processor's accumulate-and-emit-on-finish logic, including the
+parse_non_stream fallback for the chunking-sensitivity issue in
+BaseFormatDetector.parse_streaming_increment.
+"""
+import json
+import pytest
+from sglang.srt.entrypoints.openai.protocol import Function as SglangFunction
+from sglang.srt.entrypoints.openai.protocol import Tool as SglangTool
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
+from dynamo.frontend.sglang_prepost import SglangStreamingPostProcessor
+MODEL = "Qwen/Qwen3-0.6B"
+@pytest.fixture(scope="module")
+def tokenizer():
+    """Load the tokenizer for ``MODEL`` once and share it across this module's tests."""
+    return get_tokenizer(MODEL)
+TOOLS = [
+    SglangTool(
+        type="function",
+        function=SglangFunction(
+            name="search_gutenberg_books",
+            description="Search for books in the Project Gutenberg library",
+            parameters={
+                "type": "object",
+                "properties": {
+                    "search_terms": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of search terms to find books",
+                    }
+                },
+                "required": ["search_terms"],
+            },
+        ),
+    ),
+    SglangTool(
+        type="function",
+        function=SglangFunction(
+            name="get_weather",
+            description="Get weather for a city",
+            parameters={
+                "type": "object",
+                "properties": {"city": {"type": "string"}},
+                "required": ["city"],
+            },
+        ),
+    ),
+]
+def _run_postprocessor(tokenizer, full_text, batch_size, *, use_reasoning=True):
+    """Stream ``full_text`` through a fresh post-processor and collect its choices.
+    The text is tokenized, split into ``batch_size``-token chunks, and fed
+    to ``process_output``; the last chunk carries ``finish_reason="stop"``.
+    Chunks for which the post-processor returns nothing are skipped.
+    """
+    tool_parser = FunctionCallParser(tools=TOOLS, tool_call_parser="hermes")
+    reasoning_parser = None
+    if use_reasoning:
+        reasoning_parser = ReasoningParser(model_type="qwen3", stream_reasoning=True)
+    post = SglangStreamingPostProcessor(
+        tokenizer=tokenizer,
+        tool_call_parser=tool_parser,
+        reasoning_parser=reasoning_parser,
+    )
+    token_ids = tokenizer.encode(full_text)
+    total = len(token_ids)
+    choices = []
+    for offset in range(0, total, batch_size):
+        end = offset + batch_size
+        response = {
+            "token_ids": token_ids[offset:end],
+            "finish_reason": "stop" if end >= total else None,
+        }
+        choice = post.process_output(response)
+        if choice:
+            choices.append(choice)
+    return choices
+def _extract_tool_calls(results):
+    """Return the first non-empty ``delta.tool_calls`` among ``results``, else ``[]``."""
+    return next(
+        (
+            calls
+            for choice in results
+            if (calls := choice.get("delta", {}).get("tool_calls"))
+        ),
+        [],
+    )
+# ---------------------------------------------------------------------------
+# Single tool call
+# ---------------------------------------------------------------------------
+class TestSingleToolCall:
+    """One tool call preceded by a reasoning block, streamed at several batch sizes."""
+    TEXT = (
+        "<think>\nLet me search for books.\n</think>\n\n"
+        '<tool_call>\n{"name": "search_gutenberg_books", '
+        '"arguments": {"search_terms": ["James Joyce"]}}\n</tool_call>'
+    )
+    def _assert_single_search_call(self, tokenizer, batch_size):
+        """Stream TEXT at ``batch_size`` and check the single parsed tool call."""
+        calls = _extract_tool_calls(
+            _run_postprocessor(tokenizer, self.TEXT, batch_size)
+        )
+        assert len(calls) == 1
+        function = calls[0]["function"]
+        assert function["name"] == "search_gutenberg_books"
+        assert json.loads(function["arguments"]) == {"search_terms": ["James Joyce"]}
+    def test_large_batches(self, tokenizer):
+        """stream_interval=20 scenario -- complete JSON in one chunk."""
+        self._assert_single_search_call(tokenizer, 20)
+    def test_small_batches(self, tokenizer):
+        """Token-by-token-ish scenario -- streaming deltas work directly."""
+        self._assert_single_search_call(tokenizer, 3)
+    def test_medium_batches(self, tokenizer):
+        """Intermediate batch size."""
+        self._assert_single_search_call(tokenizer, 10)
+    def test_tool_call_has_id_and_type(self, tokenizer):
+        """Each tool call must have id and type fields."""
+        first_call = _extract_tool_calls(
+            _run_postprocessor(tokenizer, self.TEXT, 20)
+        )[0]
+        assert first_call["id"].startswith("call_")
+        assert first_call["type"] == "function"
+        assert first_call["index"] == 0
+# ---------------------------------------------------------------------------
+# No reasoning parser
+# ---------------------------------------------------------------------------
+class TestNoReasoningParser:
+    """Tool-call extraction when the post-processor runs with ``use_reasoning=False``."""
+    TEXT = (
+        '<tool_call>\n{"name": "get_weather", '
+        '"arguments": {"city": "Paris"}}\n</tool_call>'
+    )
+    def test_large_batches(self, tokenizer):
+        """Batch size 15: expect one ``get_weather`` call for Paris."""
+        chunks = _run_postprocessor(tokenizer, self.TEXT, 15, use_reasoning=False)
+        calls = _extract_tool_calls(chunks)
+        assert len(calls) == 1
+        function = calls[0]["function"]
+        assert function["name"] == "get_weather"
+        parsed = json.loads(function["arguments"])
+        assert parsed == {"city": "Paris"}
+    def test_small_batches(self, tokenizer):
+        """Batch size 3: expect the same single ``get_weather`` call."""
+        chunks = _run_postprocessor(tokenizer, self.TEXT, 3, use_reasoning=False)
+        calls = _extract_tool_calls(chunks)
+        assert len(calls) == 1
+        function = calls[0]["function"]
+        assert function["name"] == "get_weather"
+        parsed = json.loads(function["arguments"])
+        assert parsed == {"city": "Paris"}
+# ---------------------------------------------------------------------------
+# Multiple tool calls
+# ---------------------------------------------------------------------------
+class TestMultipleToolCalls:
+    """One response that contains two consecutive tool calls."""
+    TEXT = (
+        "<think>\nI'll search and check weather.\n</think>\n\n"
+        '<tool_call>\n{"name": "search_gutenberg_books", '
+        '"arguments": {"search_terms": ["Joyce"]}}\n</tool_call>\n'
+        '<tool_call>\n{"name": "get_weather", '
+        '"arguments": {"city": "London"}}\n</tool_call>'
+    )
+    def test_both_tools_present(self, tokenizer):
+        """Exactly two calls come back, one per expected function name."""
+        calls = _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 10))
+        assert len(calls) == 2
+        seen = {call["function"]["name"] for call in calls}
+        assert seen == {"search_gutenberg_books", "get_weather"}
+    def test_arguments_correct(self, tokenizer):
+        """Each call's JSON arguments decode to the payload written in ``TEXT``."""
+        calls = _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 10))
+        functions = {call["function"]["name"]: call["function"] for call in calls}
+        search_args = json.loads(functions["search_gutenberg_books"]["arguments"])
+        assert search_args == {"search_terms": ["Joyce"]}
+        weather_args = json.loads(functions["get_weather"]["arguments"])
+        assert weather_args == {"city": "London"}
+    def test_distinct_ids(self, tokenizer):
+        """No two tool calls share an id."""
+        calls = _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 10))
+        call_ids = [call["id"] for call in calls]
+        assert len(call_ids) == len(set(call_ids)), "Tool call IDs must be unique"
+# ---------------------------------------------------------------------------
+# Content alongside tool calls
+# ---------------------------------------------------------------------------
+class TestContentWithToolCalls:
+    """Reasoning content and regular content are preserved alongside tool calls."""
+    TEXT = (
+        "<think>\nThinking about it.\n</think>\n\n"
+        '<tool_call>\n{"name": "get_weather", '
+        '"arguments": {"city": "NYC"}}\n</tool_call>'
+    )
+    @staticmethod
+    def _joined(results, field):
+        """Concatenate ``delta[field]`` across every choice in *results*.
+        A missing field and an explicit ``None`` are both treated as empty;
+        ``dict.get(field, "")`` alone would hand back ``None`` for the latter
+        and make string concatenation raise ``TypeError``.
+        """
+        return "".join(r.get("delta", {}).get(field) or "" for r in results)
+    def test_reasoning_content_present(self, tokenizer):
+        """The text inside <think> shows up in the accumulated reasoning_content."""
+        results = _run_postprocessor(tokenizer, self.TEXT, 20)
+        assert "Thinking about it" in self._joined(results, "reasoning_content")
+    def test_content_is_whitespace_only(self, tokenizer):
+        """Content between </think> and <tool_call> should be whitespace only."""
+        results = _run_postprocessor(tokenizer, self.TEXT, 20)
+        assert self._joined(results, "content").strip() == ""
+# ---------------------------------------------------------------------------
+# No tool calls (plain text)
+# ---------------------------------------------------------------------------
+class TestNoToolCalls:
+    """When no tool call markup is present, no tool_calls should appear."""
+    TEXT = "<think>\nJust thinking.\n</think>\n\nHello, world!"
+    def test_no_tool_calls_emitted(self, tokenizer):
+        """Plain text yields an empty tool-call list."""
+        tc = _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 10))
+        assert tc == []
+    def test_content_preserved(self, tokenizer):
+        """The visible answer text survives in the accumulated ``content`` deltas."""
+        results = _run_postprocessor(tokenizer, self.TEXT, 10)
+        # ``or ""`` covers deltas that carry ``"content": None``; a plain
+        # ``.get("content", "")`` would return None there and break the join.
+        content = "".join(r.get("delta", {}).get("content") or "" for r in results)
+        assert "Hello, world!" in content
--- a/components/src/dynamo/frontend/utils.py
+++ b/components/src/dynamo/frontend/utils.py
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Shared utilities for frontend chat processors (vLLM, SGLang)."""
+import uuid
+from typing import Any
+_MASK_64_BITS = (1 << 64) - 1
+def random_uuid() -> str:
+    """Return a random 16-character lowercase hex identifier.
+    The value is the low 64 bits of a fresh ``uuid.uuid4()``, zero-padded to
+    16 hex digits. It is a short request identifier, not a full UUID string.
+    """
+    return f"{uuid.uuid4().int & _MASK_64_BITS:016x}"
+def random_call_id() -> str:
+    """Return a random tool call ID in OpenAI format (``call_<16 hex chars>``).
+    Reuses :func:`random_uuid` so both identifiers come from one generator.
+    """
+    return f"call_{random_uuid()}"
+def worker_warmup() -> bool:
+    """No-op task submitted to a ProcessPoolExecutor to confirm a worker is up.
+    Always returns ``True``.
+    """
+    return True
+class PreprocessError(Exception):
+    """User-facing failure raised by preprocess workers (e.g., n!=1).
+    The structured payload is kept on ``error_dict``; its ``str()`` form is
+    used as the exception message.
+    """
+    def __init__(self, error_dict: dict[str, Any]):
+        super().__init__(str(error_dict))
+        self.error_dict = error_dict
--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -9,9 +9,11 @@ import asyncio
 import logging
 import os
 import time
-import uuid
 from argparse import Namespace
 from collections.abc import AsyncGenerator
+from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import wait as _futures_wait
+from dataclasses import dataclass
 from typing import Any
 from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
@@ -36,12 +38,16 @@ from dynamo.llm import (
 )
 from dynamo.runtime import Client, DistributedRuntime
-from .prepost import StreamingPostProcessor, preprocess_chat_request
+from .prepost import (
+    StreamingPostProcessor,
+    preprocess_chat_request,
+    preprocess_chat_request_sync,
+)
+from .utils import PreprocessError, random_uuid, worker_warmup
 logger = logging.getLogger(__name__)
-_MASK_64_BITS = (1 << 64) - 1
 _FINISH_REASON_MAP: dict[str, FinishReason] = {
    "eos": FinishReason.STOP,
    "stop": FinishReason.STOP,
@@ -52,10 +58,6 @@ _FINISH_REASON_MAP: dict[str, FinishReason] = {
 }
-def random_uuid() -> str:
-    return f"{uuid.uuid4().int & _MASK_64_BITS:016x}"  # 16 hex chars
 def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
    if raw_reason is None:
        return None
@@ -72,6 +74,181 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
    return mapped
+# --- Worker process globals (initialized once per process by _init_worker) ---
+_w_input_processor: InputProcessor | None = None
+_w_tokenizer: Any = None
+_w_tool_parser_class: type[ToolParser] | None = None
+@dataclass
+class PreprocessWorkerResult:
+    """Picklable return value from the preprocess worker."""
+    # Request dict assembled by _preprocess_worker: model, token_ids,
+    # stop_conditions, sampling_options, output_options, eos_token_ids, annotations.
+    dynamo_preproc: dict[str, Any]
+    # Prompt token IDs taken from the preprocessing result.
+    tokens: list[int]
+    # Value returned by InputProcessor.process_inputs for this request.
+    vllm_preproc: EngineCoreRequest
+    # SamplingParams built in the worker before process_inputs is called.
+    sampling_params: SamplingParams
+    request_for_sampling: Any  # ChatCompletionRequest (Pydantic model, picklable)
+    # Passed through unchanged from the preprocessing result.
+    chat_template_kwargs: dict[str, Any]
+def _init_worker(
+    model_path: str,
+    tokenizer_mode: str,
+    config_format: str,
+    load_format: str,
+    tool_parser_name: str | None,
+) -> None:
+    """Initialize a worker process with its own VllmConfig and InputProcessor.
+    Stores the InputProcessor, its tokenizer and the resolved tool parser
+    class in the module-level ``_w_*`` globals so later tasks in the same
+    process can read them.
+    Args:
+        model_path: Model identifier or path handed to ``ModelConfig``.
+        tokenizer_mode: Forwarded to ``ModelConfig(tokenizer_mode=...)``.
+        config_format: Forwarded to ``ModelConfig(config_format=...)``.
+        load_format: Forwarded to ``LoadConfig(load_format=...)``.
+        tool_parser_name: Name looked up via ``ToolParserManager``; a falsy
+            value leaves the tool parser class unset (``None``).
+    """
+    # Only the globals that are actually assigned below are declared.
+    global _w_input_processor, _w_tokenizer, _w_tool_parser_class
+    model_config = ModelConfig(
+        model=model_path,
+        tokenizer_mode=tokenizer_mode,
+        config_format=config_format,
+    )
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        load_config=LoadConfig(load_format=load_format),
+        cache_config=CacheConfig(),
+    )
+    _w_input_processor = InputProcessor(vllm_config)
+    _w_tokenizer = _w_input_processor.get_tokenizer()
+    _w_tool_parser_class = (
+        ToolParserManager.get_tool_parser(tool_parser_name)
+        if tool_parser_name
+        else None
+    )
+def _preprocess_worker(
+    request: dict[str, Any],
+    request_id: str,
+    model_name: str,
+) -> PreprocessWorkerResult:
+    """Preprocess a request in a worker process and return a picklable result.
+    Uses the per-process globals set up by ``_init_worker``.
+    Args:
+        request: Raw chat request dict passed to ``preprocess_chat_request_sync``.
+        request_id: ID forwarded to ``InputProcessor.process_inputs``.
+        model_name: Value stored under ``"model"`` in the Dynamo request dict.
+    Returns:
+        A ``PreprocessWorkerResult`` bundling the Dynamo request dict, prompt
+        tokens, the vLLM engine request, sampling params, the parsed request
+        and the chat template kwargs.
+    Raises:
+        PreprocessError: If the processed sampling params have ``n != 1``.
+    """
+    assert _w_input_processor is not None
+    pre = preprocess_chat_request_sync(
+        request,
+        tokenizer=_w_tokenizer,
+        renderer=_w_input_processor.renderer,
+        tool_parser_class=_w_tool_parser_class,
+    )
+    request_for_sampling = pre.request_for_sampling
+    engine_prompt = pre.engine_prompt
+    tokens = pre.prompt_token_ids
+    # max_completion_tokens wins over max_tokens when both are set.
+    if request_for_sampling.max_completion_tokens is not None:
+        max_tokens = request_for_sampling.max_completion_tokens
+    elif request_for_sampling.max_tokens is not None:
+        max_tokens = request_for_sampling.max_tokens
+    else:
+        max_tokens = None
+    sampling_params = SamplingParams(
+        output_kind=RequestOutputKind.DELTA,
+        max_tokens=max_tokens,
+    )
+    # Layer 1: generation-config values, for attributes SamplingParams has.
+    for k, v in _w_input_processor.generation_config_fields.items():
+        if hasattr(sampling_params, k):
+            setattr(sampling_params, k, v)
+    # Layer 2: non-None request fields that share a name with a SamplingParams
+    # annotation override layer 1; max_tokens, logprobs and output_kind are
+    # excluded because they are handled explicitly in this function.
+    sampling_fields = (
+        set(getattr(SamplingParams, "__annotations__", ()))
+        & set(type(request_for_sampling).model_fields)
+    ) - {"max_tokens", "logprobs", "output_kind"}
+    for k in sorted(sampling_fields):
+        v = getattr(request_for_sampling, k, None)
+        if v is not None:
+            setattr(sampling_params, k, v)
+    # logprobs may be a bool flag or an int count; bool is checked first and
+    # excluded from the int branch because bool is a subclass of int.
+    logprobs = request_for_sampling.logprobs
+    top_logprobs = request_for_sampling.top_logprobs
+    if logprobs is True:
+        sampling_params.logprobs = top_logprobs or 1
+    elif isinstance(logprobs, int) and not isinstance(logprobs, bool):
+        sampling_params.logprobs = logprobs
+    elif top_logprobs not in (None, 0):
+        sampling_params.logprobs = top_logprobs
+    # Build the engine prompt from the token IDs, copying over only the
+    # optional keys that are actually present on the request/engine prompt.
+    prompt_inputs = TokensPrompt(prompt_token_ids=tokens)
+    if "multi_modal_data" in engine_prompt:
+        prompt_inputs["multi_modal_data"] = engine_prompt["multi_modal_data"]
+    if "multi_modal_uuids" in engine_prompt:
+        prompt_inputs["multi_modal_uuids"] = engine_prompt["multi_modal_uuids"]
+    if request_for_sampling.cache_salt is not None:
+        prompt_inputs["cache_salt"] = request_for_sampling.cache_salt
+    if request_for_sampling.mm_processor_kwargs is not None:
+        prompt_inputs["mm_processor_kwargs"] = request_for_sampling.mm_processor_kwargs
+    vllm_preproc: EngineCoreRequest = _w_input_processor.process_inputs(
+        request_id,
+        prompt_inputs,
+        sampling_params,
+    )
+    # NOTE(review): assign_request_id is opaque from here -- presumably it
+    # finalizes the request ID on the engine request; confirm against vLLM.
+    InputProcessor.assign_request_id(vllm_preproc)
+    # From here on, read values from the sampling params attached to the
+    # engine request rather than from the locally built object.
+    sp = vllm_preproc.sampling_params
+    if sp.n != 1:
+        raise PreprocessError(
+            {
+                "error": {
+                    "message": (
+                        f"Unsupported value: 'n={sp.n}'. "
+                        "This endpoint currently supports only n=1."
+                    ),
+                    "type": "invalid_request_error",
+                    "param": "n",
+                    "code": "unsupported_value",
+                }
+            }
+        )
+    dynamo_preproc = {
+        "model": model_name,
+        "token_ids": tokens,
+        "stop_conditions": {
+            "max_tokens": sp.max_tokens,
+            "stop": sp.stop,
+            "stop_token_ids": sp.stop_token_ids,
+            "min_tokens": sp.min_tokens,
+            "ignore_eos": sp.ignore_eos,
+        },
+        "sampling_options": {
+            "n": sp.n,
+            "presence_penalty": sp.presence_penalty,
+            "frequency_penalty": sp.frequency_penalty,
+            "repetition_penalty": sp.repetition_penalty,
+            "temperature": sp.temperature,
+            "top_p": sp.top_p,
+            "top_k": sp.top_k,
+            "min_p": sp.min_p,
+            "seed": sp.seed,
+        },
+        "output_options": {
+            "logprobs": sp.logprobs,
+            "prompt_logprobs": sp.prompt_logprobs,
+            "skip_special_tokens": sp.skip_special_tokens,
+        },
+        # Empty list when the engine request carries no EOS token ID.
+        "eos_token_ids": (
+            [vllm_preproc.eos_token_id] if vllm_preproc.eos_token_id is not None else []
+        ),
+        "annotations": [],
+    }
+    return PreprocessWorkerResult(
+        dynamo_preproc=dynamo_preproc,
+        tokens=tokens,
+        vllm_preproc=vllm_preproc,
+        sampling_params=sampling_params,
+        request_for_sampling=request_for_sampling,
+        chat_template_kwargs=pre.chat_template_kwargs,
+    )
 class VllmProcessor:
    def __init__(
        self,
@@ -234,9 +411,11 @@ class VllmProcessor:
                "prompt_logprobs": sp.prompt_logprobs,
                "skip_special_tokens": sp.skip_special_tokens,
            },
-            "eos_token_ids": [vllm_preproc.eos_token_id]
+            "eos_token_ids": (
-            if vllm_preproc.eos_token_id is not None
+                [vllm_preproc.eos_token_id]
-            else [],
+                if vllm_preproc.eos_token_id is not None
+                else []
+            ),
            "annotations": [],
        }
@@ -347,6 +526,77 @@ class VllmProcessor:
                    [vllm_preproc.request_id], internal=True
                )
+    async def _generator_inner_pool(
+        self, request: dict[str, Any]
+    ) -> AsyncGenerator[dict[str, Any], None]:
+        """Process a request using the worker pool.
+        Phase 1: Preprocess in a worker process (semaphore held).
+        Phase 2: Remote inference via router (no worker held).
+        Phase 3: Post-process tokens in the main process.
+        Args:
+            request: Raw chat request dict; ``request["model"]`` is forwarded
+                to the worker as the model name.
+        Yields:
+            Items produced by ``_generate_and_stream``, or a single error dict
+            when preprocessing fails.
+        """
+        request_id = random_uuid()
+        # --- Phase 1: Preprocess (semaphore held) ---
+        try:
+            assert self._worker_semaphore is not None
+            async with self._worker_semaphore:
+                assert self.preprocess_pool is not None
+                future = self.preprocess_pool.submit(
+                    _preprocess_worker, request, request_id, request["model"]
+                )
+                # Bridge the concurrent.futures future into asyncio so it can be awaited.
+                preproc_result: PreprocessWorkerResult = await asyncio.wrap_future(
+                    future
+                )
+            # Semaphore + worker released here
+        except PreprocessError as exc:
+            # User-facing error: forward the worker's structured payload as-is.
+            yield exc.error_dict
+            return
+        except Exception as exc:
+            # Anything else is logged with its traceback and reported as internal.
+            logger.exception("Worker preprocessing failed for request %s", request_id)
+            yield {
+                "error": {
+                    "message": f"Worker error: {exc}",
+                    "type": "internal_error",
+                }
+            }
+            return
+        # --- Between phases: reconstruct main-process objects ---
+        dynamo_preproc = preproc_result.dynamo_preproc
+        tokens = preproc_result.tokens
+        vllm_preproc = preproc_result.vllm_preproc
+        sampling_params = preproc_result.sampling_params
+        request_for_sampling = preproc_result.request_for_sampling
+        # A tool parser is built only when a parser class is configured, the
+        # request supplies tools, and tool_choice is not "none".
+        tool_parser = None
+        if (
+            self.tool_parser_class
+            and request_for_sampling.tools
+            and request_for_sampling.tool_choice != "none"
+        ):
+            tool_parser = self.tool_parser_class(self.tokenizer)
+        post = StreamingPostProcessor(
+            tokenizer=self.tokenizer,
+            request_for_sampling=request_for_sampling,
+            sampling_params=sampling_params,
+            prompt_token_ids=tokens,
+            tool_parser=tool_parser,
+            reasoning_parser_class=self.reasoning_parser_class,
+            chat_template_kwargs=preproc_result.chat_template_kwargs,
+        )
+        # Generation and streaming are delegated to _generate_and_stream;
+        # its items are re-yielded unchanged.
+        async for item in self._generate_and_stream(
+            request_id,
+            request,
+            dynamo_preproc,
+            tokens,
+            vllm_preproc,
+            post,
+        ):
+            yield item
 class EngineFactory:
    def __init__(
@@ -439,7 +689,7 @@ class EngineFactory:
        else:
            reasoning_parser_class = None
-        (namespace_name, component_name, endpoint_name) = instance_id.triple()
+        namespace_name, component_name, endpoint_name = instance_id.triple()
        generate_endpoint = self.runtime.endpoint(
            f"{namespace_name}.{component_name}.{endpoint_name}"
        )
@@ -455,6 +705,45 @@ class EngineFactory:
                router_mode=self.router_config.router_mode
            )
+        preprocess_pool = None
+        preprocess_workers = self.config.preprocess_workers
+        if preprocess_workers > 0:
+            logger.info(
+                "Creating preprocess worker pool with %d workers for model %s",
+                preprocess_workers,
+                source_path,
+            )
+            preprocess_pool = ProcessPoolExecutor(
+                max_workers=preprocess_workers,
+                initializer=_init_worker,
+                initargs=(
+                    source_path,
+                    tokenizer_mode,
+                    config_format,
+                    load_format,
+                    tool_parser_name,
+                ),
+            )
+            # Warm up all workers to ensure initialization completes
+            futures = [
+                preprocess_pool.submit(worker_warmup) for _ in range(preprocess_workers)
+            ]
+            done, not_done = _futures_wait(futures, timeout=120)
+            if not_done:
+                for f in not_done:
+                    f.cancel()
+                preprocess_pool.shutdown(wait=False, cancel_futures=True)
+                raise RuntimeError(
+                    "Timed out waiting for preprocess worker pool warmup"
+                )
+            try:
+                for f in done:
+                    f.result()  # Raises if initializer failed
+            except Exception:
+                preprocess_pool.shutdown(wait=False, cancel_futures=True)
+                raise
+            logger.info("Preprocess worker pool ready (%d workers)", preprocess_workers)
        gen = VllmProcessor(
            tokenizer,
            input_processor,

--- a/components/src/dynamo/sglang/args.py
+++ b/components/src/dynamo/sglang/args.py
@@ -8,6 +8,7 @@ import os
 import socket
 import sys
 import tempfile
+import warnings
 from argparse import Namespace
 from pathlib import Path
 from typing import Any, Dict, Generator, Optional
@@ -374,6 +375,14 @@ async def parse_args(args: list[str]) -> Config:
    server_args.stream_output = True
    if dynamo_config.use_sglang_tokenizer:
+        warnings.warn(
+            "--use-sglang-tokenizer is deprecated and will be removed in a future "
+            "release. Use '--dyn-chat-processor sglang' on the frontend instead, "
+            "which provides the same SGLang-native pre/post processing with KV "
+            "router support.",
+            FutureWarning,
+            stacklevel=2,
+        )
        logging.info(
            "Using SGLang's built in tokenizer. Setting skip_tokenizer_init to False"
        )

--- a/components/src/dynamo/sglang/backend_args.py
+++ b/components/src/dynamo/sglang/backend_args.py
@@ -34,7 +34,10 @@ class DynamoSGLangArgGroup(ArgGroup):
            flag_name="--use-sglang-tokenizer",
            env_var="DYN_SGL_USE_TOKENIZER",
            default=False,
-            help="Use SGLang's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend. Cannot be used with --custom-jinja-template.",
+            help="[Deprecated] Use SGLang's tokenizer for pre and post processing. "
+            "This option will be removed in a future release. Use "
+            "'--dyn-chat-processor sglang' on the frontend instead, which provides "
+            "the same SGLang-native pre/post processing with KV router support.",
        )
        add_negatable_bool_argument(

--- a/docs/backends/sglang/sglang-chat-processor.md
+++ b/docs/backends/sglang/sglang-chat-processor.md
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: SGLang Chat Processor
+subtitle: SGLang-native preprocessing and postprocessing for chat completions
+---
+The SGLang chat processor enables SGLang-native preprocessing and postprocessing in the Dynamo frontend. It uses SGLang's tokenizer, chat templates, tool call parser, and reasoning parser directly -- bypassing the default Rust preprocessor for `v1/chat/completions` requests.
+## When to Use
+Use `--dyn-chat-processor sglang` when Dynamo's built-in Rust preprocessor does not yet support a tool call parser or reasoning parser you need. The SGLang processor delegates to SGLang's Python implementations, so any parser SGLang supports works immediately.
+Common cases:
+- A **tool call format** not yet in the Rust `tool_calling` library
+- A **reasoning parser** not yet supported natively
+- A **chat template** that the Rust preprocessor doesn't handle correctly
+If the parser you need is missing from the Rust preprocessor, consider [opening an issue or PR](https://github.com/ai-dynamo/dynamo/issues) to add native support -- native parsers avoid the Python GIL overhead entirely.
+## Quick Start
+```bash
+# Frontend with SGLang processor, tool calling, and reasoning
+python -m dynamo.frontend \
+  --router-mode kv \
+  --dyn-chat-processor sglang \
+  --tool-call-parser hermes \
+  --reasoning-parser qwen3
+# Workers (unchanged)
+CUDA_VISIBLE_DEVICES=0 python -m dynamo.sglang \
+  --model-path Qwen/Qwen3-14B-FP8 \
+  --served-model-name Qwen/Qwen3-14B-FP8 \
+  --tp 1 --trust-remote-code \
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
+```
+## Frontend Arguments
+These arguments are passed to the **frontend** (not the worker) when using `--dyn-chat-processor sglang`:
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `--dyn-chat-processor` | `dynamo` | Chat pre/post processor backend; set to `sglang` to enable the SGLang chat processor |
+| `--tool-call-parser` | `None` | Tool call parser name (any SGLang-supported parser) |
+| `--reasoning-parser` | `None` | Reasoning parser name (any SGLang-supported parser) |
+### Environment Variables
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `DYN_SGLANG_STREAM_INTERVAL` | `20` | Number of tokens to accumulate before detokenizing. Higher values improve throughput. The first chunk always emits immediately (interval=1) to minimize time-to-first-token. |
+## Tool Calling
+The processor supports all SGLang tool call formats. Pass `--tool-call-parser` on the frontend:
+```bash
+python -m dynamo.frontend \
+  --dyn-chat-processor sglang \
+  --tool-call-parser hermes
+```
+Any parser supported by SGLang can be used. See the [SGLang documentation](https://docs.sglang.ai/) for the full list of available tool call parsers.
+### Example: Tool Call Request
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-14B-FP8",
+    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
+    "tools": [{
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get weather for a city",
+        "parameters": {
+          "type": "object",
+          "properties": {"city": {"type": "string"}},
+          "required": ["city"]
+        }
+      }
+    }],
+    "tool_choice": "auto"
+  }'
+```
+Response:
+```json
+{
+  "choices": [{
+    "message": {
+      "role": "assistant",
+      "tool_calls": [{
+        "id": "call_8cd24396f3671048",
+        "type": "function",
+        "function": {
+          "name": "get_weather",
+          "arguments": "{\"city\": \"Paris\"}"
+        }
+      }],
+      "reasoning_content": "The user wants weather info for Paris..."
+    },
+    "finish_reason": "tool_calls"
+  }]
+}
+```
+## Reasoning Parsing
+For models that produce chain-of-thought reasoning (e.g., Qwen3, DeepSeek-R1), pass `--reasoning-parser`:
+```bash
+python -m dynamo.frontend \
+  --dyn-chat-processor sglang \
+  --reasoning-parser qwen3
+```
+The parser routes the text inside the model's `<think>...</think>` tags to the `reasoning_content` field and the remaining output to the `content` field.
+## Migration from `--use-sglang-tokenizer`
+`--use-sglang-tokenizer` on the **worker** is deprecated. Replace with `--dyn-chat-processor sglang` on the **frontend**:
+```diff
+  # Before (deprecated)
+- python -m dynamo.sglang --model-path <model> --use-sglang-tokenizer
+- python -m dynamo.frontend
+  # After
+  python -m dynamo.sglang --model-path <model>
++ python -m dynamo.frontend --dyn-chat-processor sglang
+```
+Key differences:
+| | `--use-sglang-tokenizer` | `--dyn-chat-processor sglang` |
+|---|---|---|
+| Location | Worker flag | Frontend flag |
+| KV router | Not supported | Supported |
+| Tool calling | Not supported | Supported |
+| Reasoning | Not supported | Supported |
+| Endpoints | `v1/chat/completions` only | `v1/chat/completions` only |
+## See Also
+- **[Tool Calling](../../agents/tool-calling.md)**: General tool calling guide
+- **[Reference Guide](sglang-reference-guide.md)**: Full SGLang backend reference
+- **[Agentic Workloads](agents.md)**: Priority scheduling and cache pinning for agents
--- a/docs/backends/sglang/sglang-reference-guide.md
+++ b/docs/backends/sglang/sglang-reference-guide.md
@@ -35,7 +35,7 @@ These arguments are added by Dynamo on top of SGLang's native arguments.
 | Argument | Env Var | Default | Description |
 |----------|---------|---------|-------------|
 | `--endpoint` | `DYN_ENDPOINT` | Auto-generated | Dynamo endpoint in `dyn://namespace.component.endpoint` format |
-| `--use-sglang-tokenizer` | `DYN_SGL_USE_TOKENIZER` | `false` | Use SGLang's tokenizer instead of Dynamo's |
+| `--use-sglang-tokenizer` | `DYN_SGL_USE_TOKENIZER` | `false` | **[Deprecated]** Use `--dyn-chat-processor sglang` on the frontend instead. See [SGLang Chat Processor](sglang-chat-processor.md). |
 | `--dyn-tool-call-parser` | `DYN_TOOL_CALL_PARSER` | `None` | [Tool call](../../agents/tool-calling.md) parser (overrides SGLang's `--tool-call-parser`) |
 | `--dyn-reasoning-parser` | `DYN_REASONING_PARSER` | `None` | Reasoning parser for chain-of-thought models |
 | `--custom-jinja-template` | `DYN_CUSTOM_JINJA_TEMPLATE` | `None` | Custom chat template path (incompatible with `--use-sglang-tokenizer`) |
@@ -56,10 +56,10 @@ These arguments are added by Dynamo on top of SGLang's native arguments.
 By default, Dynamo handles tokenization and detokenization through its Rust-based frontend, passing `input_ids` to SGLang. This enables all frontend endpoints (`v1/chat/completions`, `v1/completions`, `v1/embeddings`).
-With `--use-sglang-tokenizer`, SGLang handles tokenization internally and Dynamo passes raw prompts. This restricts the frontend to `v1/chat/completions` only.
+For SGLang-native preprocessing (tool calling, reasoning parsing, chat templates), use `--dyn-chat-processor sglang` on the frontend. See [SGLang Chat Processor](sglang-chat-processor.md) for architecture and usage.
 <Warning>
-`--custom-jinja-template` and `--use-sglang-tokenizer` are mutually exclusive. Custom templates require Dynamo's preprocessor.
+`--use-sglang-tokenizer` is deprecated. Use `--dyn-chat-processor sglang` on the frontend instead, which provides the same SGLang-native processing with KV router support and the completions endpoint.
 </Warning>
 ## Request Cancellation

--- a/docs/index.yml
+++ b/docs/index.yml
@@ -149,6 +149,8 @@ navigation:
        contents:
          - page: Reference Guide
            path: backends/sglang/sglang-reference-guide.md
+          - page: Chat Processor
+            path: backends/sglang/sglang-chat-processor.md
          - page: Examples
            path: backends/sglang/sglang-examples.md
          - page: Disaggregation