Unverified Commit 51dfd760 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat: add SGLang chat processor for frontend pre/post processing (#6834)


Co-authored-by: default avatarClaude Opus 4.6 <noreply@anthropic.com>
parent 63d7c01c
...@@ -71,6 +71,7 @@ class FrontendConfig(KvRouterConfigBase): ...@@ -71,6 +71,7 @@ class FrontendConfig(KvRouterConfigBase):
event_plane: str event_plane: str
chat_processor: str chat_processor: str
enable_anthropic_api: bool enable_anthropic_api: bool
debug_perf: bool
preprocess_workers: int preprocess_workers: int
def validate(self) -> None: def validate(self) -> None:
...@@ -350,10 +351,25 @@ class FrontendArgGroup(ArgGroup): ...@@ -350,10 +351,25 @@ class FrontendArgGroup(ArgGroup):
default="dynamo", default="dynamo",
dest="chat_processor", dest="chat_processor",
help=( help=(
"[EXPERIMENTAL] When set to 'vllm', use local vllm for the pre and post " "[EXPERIMENTAL] Chat pre/post processor backend. 'dynamo' uses the Rust "
"processor." "preprocessor. 'vllm' uses local vLLM for pre and post processing. "
"'sglang' uses SGLang APIs for chat template rendering, tool call "
"parsing, and reasoning parsing."
),
choices=["dynamo", "vllm", "sglang"],
)
add_negatable_bool_argument(
g,
flag_name="--dyn-debug-perf",
env_var="DYN_DEBUG_PERF",
default=False,
dest="debug_perf",
help=(
"[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
"Logs per-function timing, request concurrency, and hot-path section durations. "
"Supported with '--dyn-chat-processor vllm' and '--dyn-chat-processor sglang'."
), ),
choices=["dynamo", "vllm"],
) )
add_argument( add_argument(
...@@ -366,7 +382,8 @@ class FrontendArgGroup(ArgGroup): ...@@ -366,7 +382,8 @@ class FrontendArgGroup(ArgGroup):
"[EXPERIMENTAL] Number of worker processes for preprocessing and output processing. " "[EXPERIMENTAL] Number of worker processes for preprocessing and output processing. "
"When > 0, offloads CPU-bound work (tokenization, template rendering, " "When > 0, offloads CPU-bound work (tokenization, template rendering, "
"detokenization) to a ProcessPoolExecutor with N workers, each with its " "detokenization) to a ProcessPoolExecutor with N workers, each with its "
"own GIL. 0 (default) keeps all processing on the main event loop. '--dyn-chat-processor vllm' only." "own GIL. 0 (default) keeps all processing on the main event loop. "
"Supported with '--dyn-chat-processor vllm' and '--dyn-chat-processor sglang'."
), ),
arg_type=int, arg_type=int,
) )
...@@ -63,11 +63,36 @@ def setup_engine_factory( ...@@ -63,11 +63,36 @@ def setup_engine_factory(
return EngineFactory(runtime, router_config, config, vllm_flags) return EngineFactory(runtime, router_config, config, vllm_flags)
def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]: def setup_sglang_engine_factory(
runtime: DistributedRuntime,
router_config: RouterConfig,
config: FrontendConfig,
sglang_flags: Optional[Namespace] = None,
):
"""
When using sglang pre and post processor, create the SglangEngineFactory
that creates the engines that run requests.
"""
from .sglang_processor import SglangEngineFactory
tool_call_parser = getattr(sglang_flags, "tool_call_parser", None)
reasoning_parser = getattr(sglang_flags, "reasoning_parser", None)
return SglangEngineFactory(
runtime,
router_config,
config,
debug_perf=config.debug_perf,
tool_call_parser_name=tool_call_parser,
reasoning_parser_name=reasoning_parser,
)
def parse_args() -> tuple[FrontendConfig, Optional[Namespace], Optional[Namespace]]:
"""Parse command-line arguments for the Dynamo frontend. """Parse command-line arguments for the Dynamo frontend.
Returns: Returns:
FrontendConfig: Parsed configuration object. Tuple of (FrontendConfig, vllm_flags, sglang_flags).
""" """
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
...@@ -83,6 +108,7 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]: ...@@ -83,6 +108,7 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
config.validate() config.validate()
vllm_flags = None vllm_flags = None
sglang_flags = None
# parse extra vllm flags using vllm native parser. # parse extra vllm flags using vllm native parser.
if config.chat_processor == "vllm": if config.chat_processor == "vllm":
...@@ -108,11 +134,19 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]: ...@@ -108,11 +134,19 @@ def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
vllm_parser = AsyncEngineArgs.add_cli_args(vllm_parser) vllm_parser = AsyncEngineArgs.add_cli_args(vllm_parser)
# the result is returned as Namespace object rather than AsyncEngineArgs object to avoid import error for non-vllm users. # the result is returned as Namespace object rather than AsyncEngineArgs object to avoid import error for non-vllm users.
vllm_flags = vllm_parser.parse_args(unknown) vllm_flags = vllm_parser.parse_args(unknown)
elif config.chat_processor == "sglang":
sglang_parser = argparse.ArgumentParser(add_help=False)
sglang_parser.add_argument("--tool-call-parser", default=None)
sglang_parser.add_argument("--reasoning-parser", default=None)
sglang_flags, remaining = sglang_parser.parse_known_args(unknown)
if remaining:
logger.error(f"Unknown arguments specified: {remaining}")
sys.exit(1)
else: else:
if unknown: if unknown:
logger.error(f"Unknown arguments specified: {unknown}") logger.error(f"Unknown arguments specified: {unknown}")
sys.exit(1) sys.exit(1)
return config, vllm_flags return config, vllm_flags, sglang_flags
async def async_main(): async def async_main():
...@@ -128,7 +162,7 @@ async def async_main(): ...@@ -128,7 +162,7 @@ async def async_main():
# bind that port before the worker, causing port conflicts and/or scraping the # bind that port before the worker, causing port conflicts and/or scraping the
# wrong metrics endpoint. # wrong metrics endpoint.
os.environ.pop("DYN_SYSTEM_PORT", None) os.environ.pop("DYN_SYSTEM_PORT", None)
config, vllm_flags = parse_args() config, vllm_flags, sglang_flags = parse_args()
dump_config(config.dump_config_to, config) dump_config(config.dump_config_to, config)
os.environ["DYN_EVENT_PLANE"] = config.event_plane os.environ["DYN_EVENT_PLANE"] = config.event_plane
logger.info( logger.info(
...@@ -233,6 +267,11 @@ async def async_main(): ...@@ -233,6 +267,11 @@ async def async_main():
runtime, router_config, config, vllm_flags runtime, router_config, config, vllm_flags
).chat_engine_factory ).chat_engine_factory
kwargs["chat_engine_factory"] = chat_engine_factory kwargs["chat_engine_factory"] = chat_engine_factory
elif config.chat_processor == "sglang":
chat_engine_factory = setup_sglang_engine_factory(
runtime, router_config, config, sglang_flags
).chat_engine_factory
kwargs["chat_engine_factory"] = chat_engine_factory
e = EntrypointArgs(EngineType.Dynamic, **kwargs) e = EntrypointArgs(EngineType.Dynamic, **kwargs)
engine = await make_engine(runtime, e) engine = await make_engine(runtime, e)
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from sglang.srt.entrypoints.openai.protocol import Function as SglangFunction
from sglang.srt.entrypoints.openai.protocol import Tool as SglangTool
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.parser.reasoning_parser import ReasoningParser
from .utils import random_call_id
@dataclass
class SglangPreprocessResult:
    """Result of SGLang preprocessing."""

    # Token ids of the rendered chat prompt.
    prompt_token_ids: list[int]
    # Per-request tool call parser; None when tool call parsing is not active.
    tool_call_parser: FunctionCallParser | None
    # Per-request reasoning parser; None when no reasoning parser is configured.
    reasoning_parser: ReasoningParser | None
    # The request dict this result was produced from.
    request: dict[str, Any]
def convert_tools(tools: list[dict[str, Any]] | None) -> list[SglangTool] | None:
    """Translate OpenAI-style tool dicts into SGLang ``Tool`` models.

    Returns ``None`` when ``tools`` is ``None`` or empty; otherwise one
    ``SglangTool`` per input entry, in the same order.
    """
    if not tools:
        return None

    def _to_sglang(spec: dict[str, Any]) -> SglangTool:
        # Missing keys fall back to the same defaults for every tool.
        fn_spec = spec.get("function", {})
        function = SglangFunction(
            name=fn_spec.get("name", ""),
            description=fn_spec.get("description"),
            parameters=fn_spec.get("parameters"),
            strict=fn_spec.get("strict", False),
        )
        return SglangTool(type=spec.get("type", "function"), function=function)

    return [_to_sglang(spec) for spec in tools]
def _materialize_messages(messages: list[Any]) -> list[dict[str, Any]]:
"""Convert message objects to plain dicts for apply_chat_template."""
normalized = []
for msg in messages:
if hasattr(msg, "model_dump"):
normalized.append(msg.model_dump(exclude_none=False))
elif isinstance(msg, dict):
normalized.append(msg)
else:
normalized.append(dict(msg))
return normalized
def create_parsers(
    request: dict[str, Any],
    *,
    tool_call_parser_name: str | None,
    reasoning_parser_name: str | None,
    sglang_tools: list[SglangTool] | None = None,
) -> tuple[FunctionCallParser | None, ReasoningParser | None]:
    """Build the per-request tool call and reasoning parsers.

    Used by the single-process preprocessing path and by the pool path,
    which has to rebuild the non-picklable parsers in the main process.

    ``sglang_tools`` is reused when given; when it is ``None`` the tools are
    converted from the request's ``tools`` field.

    Returns:
        ``(tool_call_parser, reasoning_parser)``; either element is ``None``
        when the corresponding parser is not needed.
    """
    tools = sglang_tools
    if tools is None:
        tools = convert_tools(request.get("tools"))

    # A tool parser needs a configured parser name, at least one tool, and a
    # tool_choice other than "none".
    tool_choice = request.get("tool_choice", "auto")
    wants_tool_parser = bool(
        tool_call_parser_name and tools and tool_choice != "none"
    )

    tool_call_parser = (
        FunctionCallParser(tools=tools, tool_call_parser=tool_call_parser_name)
        if wants_tool_parser
        else None
    )
    reasoning_parser = (
        ReasoningParser(model_type=reasoning_parser_name, stream_reasoning=True)
        if reasoning_parser_name
        else None
    )
    return tool_call_parser, reasoning_parser
def preprocess_chat_request(
    request: dict[str, Any],
    *,
    tokenizer,
    tool_call_parser_name: str | None,
    reasoning_parser_name: str | None,
) -> SglangPreprocessResult:
    """Render and tokenize a chat request, and build its parsers.

    Synchronous, so it can run in the main process or in a worker process.

    Returns:
        A ``SglangPreprocessResult`` holding the prompt token ids, the
        per-request parsers, and the original ``request``.
    """
    chat_messages = _materialize_messages(request.get("messages", []))

    # Tools are converted once and shared with parser creation below.
    sglang_tools = convert_tools(request.get("tools"))

    # One apply_chat_template call performs both rendering and tokenization.
    render_options: dict[str, Any] = {"add_generation_prompt": True, "tokenize": True}
    if sglang_tools:
        render_options["tools"] = [tool.model_dump() for tool in sglang_tools]

    token_ids = tokenizer.apply_chat_template(chat_messages, **render_options)
    if not isinstance(token_ids, list):
        token_ids = list(token_ids)

    tool_parser, reasoning = create_parsers(
        request,
        tool_call_parser_name=tool_call_parser_name,
        reasoning_parser_name=reasoning_parser_name,
        sglang_tools=sglang_tools,
    )
    return SglangPreprocessResult(
        prompt_token_ids=token_ids,
        tool_call_parser=tool_parser,
        reasoning_parser=reasoning,
        request=request,
    )
def _random_call_id() -> str:
    """Return a fresh tool call id from the shared ``random_call_id`` helper."""
    call_id = random_call_id()
    return call_id
class SglangStreamingPostProcessor:
    """Streaming post-processor using SGLang parsers and HF tokenizer detokenization.

    Handles:
    - Incremental detokenization via sliding-window decode (6-token lookback)
    - Reasoning content extraction via SGLang ReasoningParser
    - Tool call parsing via SGLang FunctionCallParser (parameter deltas)
    """

    # Lookback window size for incremental detokenization. UTF-8 characters
    # can span up to 4 bytes, each potentially its own token. A lookback of
    # 6 covers the worst case (4-token char) plus margin for BPE merges that
    # cross the old/new boundary.
    LOOKBACK = 6

    # Character the tokenizer's decode substitutes for an incomplete or
    # invalid byte sequence.
    _REPLACEMENT_CHAR = "\ufffd"

    def __init__(
        self,
        *,
        tokenizer,
        tool_call_parser: FunctionCallParser | None,
        reasoning_parser: ReasoningParser | None,
    ) -> None:
        """Store the tokenizer and the optional per-request parsers.

        Args:
            tokenizer: Object providing ``decode(ids, skip_special_tokens=...)``.
            tool_call_parser: Tool call parser for this request, or ``None``.
            reasoning_parser: Reasoning parser for this request, or ``None``.
        """
        self.tokenizer = tokenizer
        self.tool_call_parser = tool_call_parser
        self.reasoning_parser = reasoning_parser
        # With no parsers the decoded text is forwarded as-is.
        self._fast_plain_text = tool_call_parser is None and reasoning_parser is None
        self._all_token_ids: list[int] = []
        # Number of trailing replacement characters withheld from the text
        # emitted so far (a character whose tokens have not all arrived yet).
        self._held_chars = 0

        # Tool call accumulation. SGLang's streaming parser returns
        # deltas (name in one chunk, argument fragments across subsequent
        # chunks). However, when the complete tool-call JSON arrives in a
        # single chunk the parser emits the name but never streams
        # arguments (a chunking-sensitivity issue in the base detector).
        # We accumulate names + arg fragments from streaming deltas and,
        # on finish, fall back to parse_non_stream on the detector buffer
        # for any tool call whose arguments are still missing.
        self._tool_call_ids: dict[int, str] = {}  # tool_index -> call_id
        self._tool_call_names: dict[int, str] = {}  # tool_index -> name
        self._tool_call_args: dict[int, list[str]] = {}  # tool_index -> arg chunks

    def _incremental_decode(
        self, new_token_ids: list[int], *, flush: bool = False
    ) -> str:
        """Decode new tokens with lookback window for multi-byte char boundaries.

        Re-decodes a small window of previous tokens alongside new tokens so that
        multi-byte characters spanning token boundaries are correctly resolved.
        Only retains the last LOOKBACK tokens to bound memory usage.

        Trailing replacement characters are withheld rather than emitted,
        because they usually mean the rest of a multi-byte character is still
        to come. The next call resumes from the withheld position, so the
        completed character is emitted exactly once.

        Args:
            new_token_ids: Token ids received since the previous call.
            flush: When true, emit everything including any trailing
                replacement characters (used on the final chunk).

        Returns:
            The newly decoded text that is safe to emit.
        """
        prev_count = len(self._all_token_ids)
        self._all_token_ids.extend(new_token_ids)

        # Trim to avoid unbounded growth -- only the tail matters for decoding
        if len(self._all_token_ids) > self.LOOKBACK * 16:
            self._all_token_ids = self._all_token_ids[
                -(self.LOOKBACK + len(new_token_ids)) :
            ]
            prev_count = len(self._all_token_ids) - len(new_token_ids)
        start = max(0, prev_count - self.LOOKBACK)

        # Decode lookback-only prefix (before new tokens)
        prefix_tokens = self._all_token_ids[start:prev_count]
        prefix_text = (
            self.tokenizer.decode(prefix_tokens, skip_special_tokens=True)
            if prefix_tokens
            else ""
        )

        # Decode lookback + new tokens together
        window_tokens = self._all_token_ids[start:]
        window_text = self.tokenizer.decode(window_tokens, skip_special_tokens=True)

        # Characters withheld last time sit at the end of prefix_text; resume
        # from before them so a now-complete character is not skipped.
        resume_at = max(0, len(prefix_text) - self._held_chars)
        delta = window_text[resume_at:]

        if flush:
            self._held_chars = 0
            return delta

        emit = delta.rstrip(self._REPLACEMENT_CHAR)
        held = len(delta) - len(emit)
        if held > self.LOOKBACK:
            # Too long a run to be one split character; treat it as real
            # output so nothing is withheld beyond what the lookback covers.
            self._held_chars = 0
            return delta
        self._held_chars = held
        return emit

    def _record_tool_calls(self, calls, *, replace_args: bool) -> None:
        """Fold parsed tool call items into the per-index accumulators.

        Args:
            calls: Items exposing ``tool_index``, ``name`` and ``parameters``.
            replace_args: When true, ``parameters`` overwrites the arguments
                accumulated for that index; otherwise it is appended as one
                more fragment.
        """
        for tc in calls:
            idx = tc.tool_index
            if idx not in self._tool_call_ids:
                self._tool_call_ids[idx] = _random_call_id()
            if tc.name:
                self._tool_call_names[idx] = tc.name
            if tc.parameters:
                if replace_args:
                    self._tool_call_args[idx] = [tc.parameters]
                else:
                    self._tool_call_args.setdefault(idx, []).append(tc.parameters)

    def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | None:
        """Process a single engine response chunk into an OpenAI SSE choice dict.

        Args:
            engine_response: Dict with ``token_ids`` and optional ``finish_reason``.

        Returns:
            OpenAI choice dict or ``None`` if nothing to emit yet.
        """
        raw_ids = engine_response.get("token_ids")
        token_ids = raw_ids if isinstance(raw_ids, list) else list(raw_ids or [])
        finish_reason = engine_response.get("finish_reason")
        is_final = bool(finish_reason)

        # On the final chunk, flush any withheld text even if no new tokens
        # arrived with it.
        if token_ids or (is_final and self._held_chars):
            delta_text = self._incremental_decode(token_ids, flush=is_final)
        else:
            delta_text = ""

        if self._fast_plain_text:
            if delta_text:
                return {
                    "index": 0,
                    "delta": {"role": "assistant", "content": delta_text},
                    "finish_reason": finish_reason,
                    "logprobs": None,
                }
            elif finish_reason:
                return {
                    "index": 0,
                    "delta": {},
                    "finish_reason": finish_reason,
                    "logprobs": None,
                }
            return None

        # -- Reasoning parsing --
        reasoning_text = None
        normal_text = delta_text
        if self.reasoning_parser and delta_text:
            r_text, n_text = self.reasoning_parser.parse_stream_chunk(delta_text)
            reasoning_text = r_text or None
            normal_text = n_text or ""

        # -- Tool call parsing (accumulate deltas) --
        content_text = normal_text
        if self.tool_call_parser and normal_text:
            parsed_text, tool_calls = self.tool_call_parser.parse_stream_chunk(
                normal_text
            )
            content_text = parsed_text
            self._record_tool_calls(tool_calls, replace_args=False)

        # -- Assemble delta --
        delta: dict[str, Any] = {"role": "assistant"}
        has_content = False

        if content_text:
            delta["content"] = content_text
            has_content = True
        if reasoning_text:
            delta["reasoning_content"] = reasoning_text
            has_content = True

        # Emit complete tool calls on finish. For any tool call whose
        # arguments are still empty (chunking-sensitivity issue), fall
        # back to parse_non_stream on the detector's buffer.
        if finish_reason and self._tool_call_names:
            missing_args = any(
                idx not in self._tool_call_args for idx in self._tool_call_names
            )
            if missing_args:
                buffer = getattr(self.tool_call_parser.detector, "_buffer", "")
                if buffer:
                    _, final_calls = self.tool_call_parser.parse_non_stream(buffer)
                    # NOTE(review): assumes tool_index from parse_non_stream
                    # lines up with the streaming tool_index -- confirm
                    # against the SGLang version in use.
                    self._record_tool_calls(final_calls, replace_args=True)

            tool_calls_out: list[dict[str, Any]] = [
                {
                    "index": idx,
                    "id": self._tool_call_ids[idx],
                    "type": "function",
                    "function": {
                        "name": self._tool_call_names[idx],
                        "arguments": "".join(self._tool_call_args.get(idx, [])),
                    },
                }
                for idx in sorted(self._tool_call_names)
            ]
            delta["tool_calls"] = tool_calls_out
            has_content = True

        if has_content or finish_reason:
            return {
                "index": 0,
                "delta": delta if has_content else {},
                "finish_reason": finish_reason,
                "logprobs": None,
            }
        return None
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Conformance tests for the SGLang API surface used by the sglang processor.
These tests lock down the SGLang interfaces we depend on so that SGLang
upgrades that break our integration surface are caught immediately.
"""
import inspect
import pickle
# ---------------------------------------------------------------------------
# Import tests -- verify all required modules and symbols exist
# ---------------------------------------------------------------------------
def test_get_tokenizer_importable():
    """``get_tokenizer`` must be importable from SGLang's HF utils and callable."""
    from sglang.srt.utils.hf_transformers_utils import get_tokenizer as loader

    assert callable(loader)
def test_function_call_parser_importable():
    """``FunctionCallParser`` must be importable and callable."""
    from sglang.srt.function_call.function_call_parser import (
        FunctionCallParser as parser_cls,
    )

    assert callable(parser_cls)
def test_tool_call_item_importable():
    """``ToolCallItem`` must be importable and callable."""
    from sglang.srt.function_call.core_types import ToolCallItem as item_cls

    assert callable(item_cls)
def test_reasoning_parser_importable():
    """``ReasoningParser`` must be importable and callable."""
    from sglang.srt.parser.reasoning_parser import ReasoningParser as parser_cls

    assert callable(parser_cls)
def test_sglang_tool_importable():
    """The ``Tool`` and ``Function`` protocol models must be importable and callable."""
    from sglang.srt.entrypoints.openai.protocol import Function, Tool

    for model in (Tool, Function):
        assert callable(model)
# ---------------------------------------------------------------------------
# get_tokenizer signature
# ---------------------------------------------------------------------------
def test_get_tokenizer_accepts_tokenizer_mode():
    """``get_tokenizer`` takes a tokenizer name first and a ``tokenizer_mode`` option."""
    from sglang.srt.utils.hf_transformers_utils import get_tokenizer

    params = inspect.signature(get_tokenizer).parameters

    # Parameter names are never empty strings, so the first parameter is
    # checked by kind instead: either it is called ``tokenizer_name`` or it
    # can at least be passed positionally.
    assert params, "get_tokenizer must take the tokenizer name as a parameter"
    first = next(iter(params.values()))
    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    assert "tokenizer_name" in params or first.kind in positional_kinds
    assert "tokenizer_mode" in params
# ---------------------------------------------------------------------------
# FunctionCallParser
# ---------------------------------------------------------------------------
def test_function_call_parser_init():
    """Verify FunctionCallParser constructor accepts tools and tool_call_parser."""
    from sglang.srt.entrypoints.openai.protocol import Function, Tool
    from sglang.srt.function_call.function_call_parser import FunctionCallParser

    weather_fn = Function(
        name="get_weather",
        description="Get weather for a city",
        parameters={
            "type": "object",
            "properties": {"city": {"type": "string"}},
        },
    )
    weather_tool = Tool(type="function", function=weather_fn)

    parser = FunctionCallParser(tools=[weather_tool], tool_call_parser="hermes")
    assert parser is not None
def test_function_call_parser_enum_keys():
    """Verify commonly-used parser names are accepted."""
    from sglang.srt.entrypoints.openai.protocol import Function, Tool
    from sglang.srt.function_call.function_call_parser import FunctionCallParser

    stub_fn = Function(
        name="f",
        description="d",
        parameters={"type": "object", "properties": {}},
    )
    tools = [Tool(type="function", function=stub_fn)]

    # These parser names must remain available
    required_names = ("hermes", "llama3", "qwen25")
    for parser_name in required_names:
        assert FunctionCallParser(tools=tools, tool_call_parser=parser_name) is not None
def test_parse_stream_chunk_signature():
    """Verify parse_stream_chunk returns (str, list[ToolCallItem])."""
    from sglang.srt.entrypoints.openai.protocol import Function, Tool
    from sglang.srt.function_call.function_call_parser import FunctionCallParser

    stub_fn = Function(
        name="f",
        description="d",
        parameters={"type": "object", "properties": {}},
    )
    parser = FunctionCallParser(
        tools=[Tool(type="function", function=stub_fn)],
        tool_call_parser="hermes",
    )

    outcome = parser.parse_stream_chunk("Hello world")

    # Shape first, then the type of each element.
    assert isinstance(outcome, tuple)
    assert len(outcome) == 2
    text_part, call_part = outcome
    assert isinstance(text_part, str)
    assert isinstance(call_part, list)
def test_tool_call_item_fields():
    """Verify ToolCallItem has expected fields."""
    from sglang.srt.function_call.core_types import ToolCallItem

    payload = '{"x": 1}'
    item = ToolCallItem(tool_index=0, name="test", parameters=payload)

    assert item.tool_index == 0
    assert item.name == "test"
    assert item.parameters == payload
# ---------------------------------------------------------------------------
# ReasoningParser
# ---------------------------------------------------------------------------
def test_reasoning_parser_init():
    """Verify ReasoningParser constructor accepts model_type."""
    from sglang.srt.parser.reasoning_parser import ReasoningParser

    assert ReasoningParser(model_type="deepseek-r1", stream_reasoning=True) is not None
def test_reasoning_parser_detector_map():
    """Verify commonly-used detector names are accepted."""
    from sglang.srt.parser.reasoning_parser import ReasoningParser

    required_names = ("deepseek-r1", "qwen3")
    for model_type in required_names:
        assert ReasoningParser(model_type=model_type, stream_reasoning=True) is not None
def test_reasoning_parser_parse_stream_chunk():
    """Verify parse_stream_chunk returns (reasoning_text, normal_text)."""
    from sglang.srt.parser.reasoning_parser import ReasoningParser

    outcome = ReasoningParser(
        model_type="deepseek-r1", stream_reasoning=True
    ).parse_stream_chunk("Hello")

    assert isinstance(outcome, tuple)
    assert len(outcome) == 2
# ---------------------------------------------------------------------------
# StreamingParseResult (function call variant)
# ---------------------------------------------------------------------------
def test_streaming_parse_result_fields():
    """Verify function-call StreamingParseResult has expected fields."""
    from sglang.srt.function_call.core_types import StreamingParseResult

    parsed = StreamingParseResult(normal_text="hello", calls=[])

    assert parsed.normal_text == "hello"
    assert parsed.calls == []
# ---------------------------------------------------------------------------
# Tool / Function protocol models
# ---------------------------------------------------------------------------
def test_sglang_tool_model_dump():
    """Verify Tool.model_dump() produces a dict suitable for chat templates."""
    from sglang.srt.entrypoints.openai.protocol import Function, Tool

    search_fn = Function(
        name="search",
        description="Search the web",
        parameters={"type": "object", "properties": {"q": {"type": "string"}}},
    )
    dumped = Tool(type="function", function=search_fn).model_dump()

    assert dumped["type"] == "function"
    dumped_fn = dumped["function"]
    assert dumped_fn["name"] == "search"
    assert "properties" in dumped_fn["parameters"]
# ---------------------------------------------------------------------------
# Picklability (required for ProcessPoolExecutor worker results)
# ---------------------------------------------------------------------------
def test_preprocess_result_picklability():
    """Verify SglangPreprocessWorkerResult survives pickle round-trip."""
    from dynamo.frontend.sglang_processor import SglangPreprocessWorkerResult

    token_ids = [1, 2, 3]
    preproc_payload = {
        "model": "test",
        "token_ids": [1, 2, 3],
        "stop_conditions": {},
        "sampling_options": {},
        "output_options": {},
        "eos_token_ids": [],
        "annotations": [],
    }
    original = SglangPreprocessWorkerResult(
        prompt_token_ids=token_ids,
        dynamo_preproc=preproc_payload,
        request={"model": "test", "messages": []},
    )

    clone = pickle.loads(pickle.dumps(original))

    # Every field must come back equal after the round-trip.
    assert clone.prompt_token_ids == original.prompt_token_ids
    assert clone.dynamo_preproc == original.dynamo_preproc
    assert clone.request == original.request
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Tests for tool call parsing in SglangStreamingPostProcessor.
Covers the interaction between SGLang's FunctionCallParser, ReasoningParser,
and our post-processor's accumulate-and-emit-on-finish logic, including the
parse_non_stream fallback for the chunking-sensitivity issue in
BaseFormatDetector.parse_streaming_increment.
"""
import json
import pytest
from sglang.srt.entrypoints.openai.protocol import Function as SglangFunction
from sglang.srt.entrypoints.openai.protocol import Tool as SglangTool
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.parser.reasoning_parser import ReasoningParser
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from dynamo.frontend.sglang_prepost import SglangStreamingPostProcessor
# Model identifier passed to get_tokenizer for every test in this module.
MODEL = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def tokenizer():
    """Return the tokenizer for ``MODEL``, created once per test module."""
    return get_tokenizer(MODEL)
# Tool definitions handed to FunctionCallParser in _run_postprocessor; the
# test texts below call these tools by name.
TOOLS = [
    SglangTool(
        type="function",
        function=SglangFunction(
            name="search_gutenberg_books",
            description="Search for books in the Project Gutenberg library",
            parameters={
                "type": "object",
                "properties": {
                    "search_terms": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of search terms to find books",
                    }
                },
                "required": ["search_terms"],
            },
        ),
    ),
    SglangTool(
        type="function",
        function=SglangFunction(
            name="get_weather",
            description="Get weather for a city",
            parameters={
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        ),
    ),
]
def _run_postprocessor(tokenizer, full_text, batch_size, *, use_reasoning=True):
    """Stream ``full_text`` through a fresh post-processor and collect its output.

    The text is tokenized and fed in slices of ``batch_size`` tokens; the
    slice that reaches the end of the token list carries
    ``finish_reason="stop"``. Returns every non-empty choice, in order.
    """
    tool_parser = FunctionCallParser(tools=TOOLS, tool_call_parser="hermes")
    reasoning = None
    if use_reasoning:
        reasoning = ReasoningParser(model_type="qwen3", stream_reasoning=True)
    post = SglangStreamingPostProcessor(
        tokenizer=tokenizer,
        tool_call_parser=tool_parser,
        reasoning_parser=reasoning,
    )

    token_ids = tokenizer.encode(full_text)
    total = len(token_ids)

    choices = []
    for begin in range(0, total, batch_size):
        end = begin + batch_size
        payload = {
            "token_ids": token_ids[begin:end],
            "finish_reason": "stop" if end >= total else None,
        }
        emitted = post.process_output(payload)
        if emitted:
            choices.append(emitted)
    return choices
def _extract_tool_calls(results):
"""Extract tool_calls from the list of choices."""
for r in results:
tc = r.get("delta", {}).get("tool_calls")
if tc:
return tc
return []
# ---------------------------------------------------------------------------
# Single tool call
# ---------------------------------------------------------------------------
class TestSingleToolCall:
    """Single tool call with reasoning, various batch sizes."""

    TEXT = (
        "<think>\nLet me search for books.\n</think>\n\n"
        '<tool_call>\n{"name": "search_gutenberg_books", '
        '"arguments": {"search_terms": ["James Joyce"]}}\n</tool_call>'
    )

    def _assert_search_call(self, tokenizer, batch_size):
        """Run TEXT at ``batch_size`` and check the single expected tool call."""
        calls = _extract_tool_calls(
            _run_postprocessor(tokenizer, self.TEXT, batch_size)
        )
        assert len(calls) == 1
        function = calls[0]["function"]
        assert function["name"] == "search_gutenberg_books"
        assert json.loads(function["arguments"]) == {"search_terms": ["James Joyce"]}

    def test_large_batches(self, tokenizer):
        """stream_interval=20 scenario -- complete JSON in one chunk."""
        self._assert_search_call(tokenizer, 20)

    def test_small_batches(self, tokenizer):
        """Token-by-token-ish scenario -- streaming deltas work directly."""
        self._assert_search_call(tokenizer, 3)

    def test_medium_batches(self, tokenizer):
        """Intermediate batch size."""
        self._assert_search_call(tokenizer, 10)

    def test_tool_call_has_id_and_type(self, tokenizer):
        """Each tool call must have id and type fields."""
        calls = _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 20))
        head = calls[0]
        assert head["id"].startswith("call_")
        assert head["type"] == "function"
        assert head["index"] == 0
# ---------------------------------------------------------------------------
# No reasoning parser
# ---------------------------------------------------------------------------
class TestNoReasoningParser:
    """Tool calls without reasoning parser active."""

    TEXT = (
        '<tool_call>\n{"name": "get_weather", '
        '"arguments": {"city": "Paris"}}\n</tool_call>'
    )

    def _assert_weather_call(self, tokenizer, batch_size):
        """Run TEXT without a reasoning parser and check the single tool call."""
        calls = _extract_tool_calls(
            _run_postprocessor(tokenizer, self.TEXT, batch_size, use_reasoning=False)
        )
        assert len(calls) == 1
        function = calls[0]["function"]
        assert function["name"] == "get_weather"
        assert json.loads(function["arguments"]) == {"city": "Paris"}

    def test_large_batches(self, tokenizer):
        self._assert_weather_call(tokenizer, 15)

    def test_small_batches(self, tokenizer):
        self._assert_weather_call(tokenizer, 3)
# ---------------------------------------------------------------------------
# Multiple tool calls
# ---------------------------------------------------------------------------
class TestMultipleToolCalls:
    """Two tool calls in a single response."""

    TEXT = (
        "<think>\nI'll search and check weather.\n</think>\n\n"
        '<tool_call>\n{"name": "search_gutenberg_books", '
        '"arguments": {"search_terms": ["Joyce"]}}\n</tool_call>\n'
        '<tool_call>\n{"name": "get_weather", '
        '"arguments": {"city": "London"}}\n</tool_call>'
    )

    def _calls(self, tokenizer):
        """Run TEXT in batches of 10 tokens and return the emitted tool calls."""
        return _extract_tool_calls(_run_postprocessor(tokenizer, self.TEXT, 10))

    def test_both_tools_present(self, tokenizer):
        calls = self._calls(tokenizer)
        assert len(calls) == 2
        seen_names = {call["function"]["name"] for call in calls}
        assert seen_names == {"search_gutenberg_books", "get_weather"}

    def test_arguments_correct(self, tokenizer):
        functions = {
            call["function"]["name"]: call["function"]
            for call in self._calls(tokenizer)
        }
        search_args = json.loads(functions["search_gutenberg_books"]["arguments"])
        weather_args = json.loads(functions["get_weather"]["arguments"])
        assert search_args == {"search_terms": ["Joyce"]}
        assert weather_args == {"city": "London"}

    def test_distinct_ids(self, tokenizer):
        call_ids = [call["id"] for call in self._calls(tokenizer)]
        assert len(set(call_ids)) == len(call_ids), "Tool call IDs must be unique"
# ---------------------------------------------------------------------------
# Content alongside tool calls
# ---------------------------------------------------------------------------
class TestContentWithToolCalls:
    """Reasoning content and regular content are preserved alongside tool calls."""

    TEXT = (
        "<think>\nThinking about it.\n</think>\n\n"
        '<tool_call>\n{"name": "get_weather", '
        '"arguments": {"city": "NYC"}}\n</tool_call>'
    )

    def test_reasoning_content_present(self, tokenizer):
        choices = _run_postprocessor(tokenizer, self.TEXT, 20)
        reasoning = "".join(
            choice.get("delta", {}).get("reasoning_content", "") for choice in choices
        )
        assert "Thinking about it" in reasoning

    def test_content_is_whitespace_only(self, tokenizer):
        """Content between </think> and <tool_call> should be whitespace only."""
        choices = _run_postprocessor(tokenizer, self.TEXT, 20)
        content = "".join(
            choice.get("delta", {}).get("content", "") for choice in choices
        )
        assert content.strip() == ""
# ---------------------------------------------------------------------------
# No tool calls (plain text)
# ---------------------------------------------------------------------------
class TestNoToolCalls:
    """When no tool call markup is present, no tool_calls should appear."""

    TEXT = "<think>\nJust thinking.\n</think>\n\nHello, world!"

    def test_no_tool_calls_emitted(self, tokenizer):
        choices = _run_postprocessor(tokenizer, self.TEXT, 10)
        assert _extract_tool_calls(choices) == []

    def test_content_preserved(self, tokenizer):
        choices = _run_postprocessor(tokenizer, self.TEXT, 10)
        content = "".join(
            choice.get("delta", {}).get("content", "") for choice in choices
        )
        assert "Hello, world!" in content
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Shared utilities for frontend chat processors (vLLM, SGLang)."""
import uuid
from typing import Any
_MASK_64_BITS = (1 << 64) - 1


def random_uuid() -> str:
    """Return a random 16-character lowercase hex string.

    The value is the low 64 bits of a fresh ``uuid.uuid4()``, zero-padded to
    16 hex digits.
    """
    return f"{uuid.uuid4().int & _MASK_64_BITS:016x}"


def random_call_id() -> str:
    """Return a random tool call ID of the form ``call_<16 hex chars>``.

    Delegates to :func:`random_uuid` so both ID flavours share one
    generation scheme instead of duplicating the mask/format expression.
    """
    return f"call_{random_uuid()}"
def worker_warmup() -> bool:
    """No-op task submitted to a ProcessPoolExecutor to warm up its workers.

    Always returns ``True``; the value itself carries no information.
    """
    return True
class PreprocessError(Exception):
    """User-facing preprocessing failure raised by preprocess workers (e.g., n!=1).

    ``error_dict`` holds the structured error payload; its ``str()`` form is
    used as the exception message.
    """

    def __init__(self, error_dict: dict[str, Any]):
        message = str(error_dict)
        super().__init__(message)
        self.error_dict = error_dict
...@@ -9,9 +9,11 @@ import asyncio ...@@ -9,9 +9,11 @@ import asyncio
import logging import logging
import os import os
import time import time
import uuid
from argparse import Namespace from argparse import Namespace
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import wait as _futures_wait
from dataclasses import dataclass
from typing import Any from typing import Any
from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, LoadConfig, ModelConfig, VllmConfig
...@@ -36,12 +38,16 @@ from dynamo.llm import ( ...@@ -36,12 +38,16 @@ from dynamo.llm import (
) )
from dynamo.runtime import Client, DistributedRuntime from dynamo.runtime import Client, DistributedRuntime
from .prepost import StreamingPostProcessor, preprocess_chat_request from .prepost import (
StreamingPostProcessor,
preprocess_chat_request,
preprocess_chat_request_sync,
)
from .utils import PreprocessError, random_uuid, worker_warmup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_MASK_64_BITS = (1 << 64) - 1
_FINISH_REASON_MAP: dict[str, FinishReason] = { _FINISH_REASON_MAP: dict[str, FinishReason] = {
"eos": FinishReason.STOP, "eos": FinishReason.STOP,
"stop": FinishReason.STOP, "stop": FinishReason.STOP,
...@@ -52,10 +58,6 @@ _FINISH_REASON_MAP: dict[str, FinishReason] = { ...@@ -52,10 +58,6 @@ _FINISH_REASON_MAP: dict[str, FinishReason] = {
} }
def random_uuid() -> str:
return f"{uuid.uuid4().int & _MASK_64_BITS:016x}" # 16 hex chars
def map_finish_reason(raw_reason: str | None) -> FinishReason | None: def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
if raw_reason is None: if raw_reason is None:
return None return None
...@@ -72,6 +74,181 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None: ...@@ -72,6 +74,181 @@ def map_finish_reason(raw_reason: str | None) -> FinishReason | None:
return mapped return mapped
# --- Worker process globals (initialized once per process by _init_worker) ---
_w_input_processor: InputProcessor | None = None
_w_tokenizer: Any = None
_w_tool_parser_class: type[ToolParser] | None = None
@dataclass
class PreprocessWorkerResult:
    """Everything a preprocess worker hands back to the main process.

    Instances are returned from a ProcessPoolExecutor task, so every field
    must survive pickling.
    """

    # Request payload in Dynamo's wire format (model, token_ids, stop/sampling/output options).
    dynamo_preproc: dict[str, Any]
    # Prompt token IDs produced by chat-template rendering and tokenization.
    tokens: list[int]
    # Engine request returned by InputProcessor.process_inputs.
    vllm_preproc: EngineCoreRequest
    # Sampling parameters assembled in the worker before process_inputs was called.
    sampling_params: SamplingParams
    request_for_sampling: Any  # ChatCompletionRequest (Pydantic model, picklable)
    # chat_template_kwargs reported by the preprocessing step.
    chat_template_kwargs: dict[str, Any]
def _init_worker(
    model_path: str,
    tokenizer_mode: str,
    config_format: str,
    load_format: str,
    tool_parser_name: str | None,
) -> None:
    """Initialize a worker process with its own VllmConfig and InputProcessor.

    Intended as the ``initializer`` of a ProcessPoolExecutor: it populates the
    module-level ``_w_input_processor``, ``_w_tokenizer`` and
    ``_w_tool_parser_class`` globals that ``_preprocess_worker`` reads.

    Args:
        model_path: Model identifier/path passed to ``ModelConfig(model=...)``.
        tokenizer_mode: Forwarded to ``ModelConfig``.
        config_format: Forwarded to ``ModelConfig``.
        load_format: Forwarded to ``LoadConfig``.
        tool_parser_name: Name looked up via ``ToolParserManager``; a falsy
            value leaves the worker without a tool parser class.
    """
    # Only globals that are defined at module level and assigned below are
    # declared here; no reasoning-parser state is set up in the worker.
    global _w_input_processor, _w_tokenizer, _w_tool_parser_class

    model_config = ModelConfig(
        model=model_path,
        tokenizer_mode=tokenizer_mode,
        config_format=config_format,
    )
    vllm_config = VllmConfig(
        model_config=model_config,
        load_config=LoadConfig(load_format=load_format),
        cache_config=CacheConfig(),
    )

    _w_input_processor = InputProcessor(vllm_config)
    _w_tokenizer = _w_input_processor.get_tokenizer()

    if tool_parser_name:
        _w_tool_parser_class = ToolParserManager.get_tool_parser(tool_parser_name)
    else:
        _w_tool_parser_class = None
def _preprocess_worker(
    request: dict[str, Any],
    request_id: str,
    model_name: str,
) -> PreprocessWorkerResult:
    """Run chat preprocessing inside a pool worker and package the result.

    Uses the per-process globals set by ``_init_worker``. Steps, in order:
    render/tokenize the chat request, build ``SamplingParams`` (generation
    config defaults first, then request overrides, then logprobs), run the
    vLLM input processor, and assemble the Dynamo request dict.

    Args:
        request: Raw chat completion request.
        request_id: ID passed to ``InputProcessor.process_inputs``.
        model_name: Value stored under ``"model"`` in the Dynamo request.

    Returns:
        A picklable ``PreprocessWorkerResult``.

    Raises:
        PreprocessError: If the processed sampling params have ``n != 1``.
    """
    assert _w_input_processor is not None
    input_processor = _w_input_processor

    pre = preprocess_chat_request_sync(
        request,
        tokenizer=_w_tokenizer,
        renderer=input_processor.renderer,
        tool_parser_class=_w_tool_parser_class,
    )
    chat_request = pre.request_for_sampling
    engine_prompt = pre.engine_prompt
    tokens = pre.prompt_token_ids

    # max_completion_tokens wins over max_tokens; None when neither is set.
    max_tokens = chat_request.max_completion_tokens
    if max_tokens is None:
        max_tokens = chat_request.max_tokens

    sampling_params = SamplingParams(
        output_kind=RequestOutputKind.DELTA,
        max_tokens=max_tokens,
    )

    # Defaults from the model's generation config come first ...
    for field_name, default in input_processor.generation_config_fields.items():
        if hasattr(sampling_params, field_name):
            setattr(sampling_params, field_name, default)

    # ... then any non-None request field that SamplingParams also declares
    # overrides them. Fields handled explicitly elsewhere are excluded.
    handled_separately = {"max_tokens", "logprobs", "output_kind"}
    declared_by_both = set(getattr(SamplingParams, "__annotations__", ())) & set(
        type(chat_request).model_fields
    )
    for field_name in sorted(declared_by_both - handled_separately):
        value = getattr(chat_request, field_name, None)
        if value is not None:
            setattr(sampling_params, field_name, value)

    # logprobs may be a bool flag or an int count; top_logprobs is the fallback.
    logprobs = chat_request.logprobs
    top_logprobs = chat_request.top_logprobs
    if logprobs is True:
        sampling_params.logprobs = top_logprobs or 1
    elif isinstance(logprobs, int) and not isinstance(logprobs, bool):
        sampling_params.logprobs = logprobs
    elif top_logprobs not in (None, 0):
        sampling_params.logprobs = top_logprobs

    prompt_inputs = TokensPrompt(prompt_token_ids=tokens)
    if "multi_modal_data" in engine_prompt:
        prompt_inputs["multi_modal_data"] = engine_prompt["multi_modal_data"]
    if "multi_modal_uuids" in engine_prompt:
        prompt_inputs["multi_modal_uuids"] = engine_prompt["multi_modal_uuids"]
    if chat_request.cache_salt is not None:
        prompt_inputs["cache_salt"] = chat_request.cache_salt
    if chat_request.mm_processor_kwargs is not None:
        prompt_inputs["mm_processor_kwargs"] = chat_request.mm_processor_kwargs

    vllm_preproc: EngineCoreRequest = input_processor.process_inputs(
        request_id,
        prompt_inputs,
        sampling_params,
    )
    InputProcessor.assign_request_id(vllm_preproc)

    sp = vllm_preproc.sampling_params
    if sp.n != 1:
        unsupported_n = {
            "error": {
                "message": (
                    f"Unsupported value: 'n={sp.n}'. "
                    "This endpoint currently supports only n=1."
                ),
                "type": "invalid_request_error",
                "param": "n",
                "code": "unsupported_value",
            }
        }
        raise PreprocessError(unsupported_n)

    stop_conditions = {
        "max_tokens": sp.max_tokens,
        "stop": sp.stop,
        "stop_token_ids": sp.stop_token_ids,
        "min_tokens": sp.min_tokens,
        "ignore_eos": sp.ignore_eos,
    }
    sampling_options = {
        "n": sp.n,
        "presence_penalty": sp.presence_penalty,
        "frequency_penalty": sp.frequency_penalty,
        "repetition_penalty": sp.repetition_penalty,
        "temperature": sp.temperature,
        "top_p": sp.top_p,
        "top_k": sp.top_k,
        "min_p": sp.min_p,
        "seed": sp.seed,
    }
    output_options = {
        "logprobs": sp.logprobs,
        "prompt_logprobs": sp.prompt_logprobs,
        "skip_special_tokens": sp.skip_special_tokens,
    }
    eos_token_id = vllm_preproc.eos_token_id
    eos_token_ids = [] if eos_token_id is None else [eos_token_id]

    dynamo_preproc = {
        "model": model_name,
        "token_ids": tokens,
        "stop_conditions": stop_conditions,
        "sampling_options": sampling_options,
        "output_options": output_options,
        "eos_token_ids": eos_token_ids,
        "annotations": [],
    }

    return PreprocessWorkerResult(
        dynamo_preproc=dynamo_preproc,
        tokens=tokens,
        vllm_preproc=vllm_preproc,
        sampling_params=sampling_params,
        request_for_sampling=chat_request,
        chat_template_kwargs=pre.chat_template_kwargs,
    )
class VllmProcessor: class VllmProcessor:
def __init__( def __init__(
self, self,
...@@ -234,9 +411,11 @@ class VllmProcessor: ...@@ -234,9 +411,11 @@ class VllmProcessor:
"prompt_logprobs": sp.prompt_logprobs, "prompt_logprobs": sp.prompt_logprobs,
"skip_special_tokens": sp.skip_special_tokens, "skip_special_tokens": sp.skip_special_tokens,
}, },
"eos_token_ids": [vllm_preproc.eos_token_id] "eos_token_ids": (
if vllm_preproc.eos_token_id is not None [vllm_preproc.eos_token_id]
else [], if vllm_preproc.eos_token_id is not None
else []
),
"annotations": [], "annotations": [],
} }
...@@ -347,6 +526,77 @@ class VllmProcessor: ...@@ -347,6 +526,77 @@ class VllmProcessor:
[vllm_preproc.request_id], internal=True [vllm_preproc.request_id], internal=True
) )
async def _generator_inner_pool(
    self, request: dict[str, Any]
) -> AsyncGenerator[dict[str, Any], None]:
    """Serve one request via the preprocess worker pool.

    Phase 1: Preprocess in a worker process while holding the worker semaphore.
    Phase 2: Remote inference via the router (no worker held).
    Phase 3: Post-process tokens in the main process.

    On a preprocessing failure a single error dict is yielded and the
    generator ends: ``PreprocessError`` yields its own payload, any other
    exception is logged and reported as an ``internal_error``.
    """
    request_id = random_uuid()

    # --- Phase 1: Preprocess (semaphore held) ---
    try:
        assert self._worker_semaphore is not None
        async with self._worker_semaphore:
            assert self.preprocess_pool is not None
            pending = self.preprocess_pool.submit(
                _preprocess_worker, request, request_id, request["model"]
            )
            preproc_result: PreprocessWorkerResult = await asyncio.wrap_future(
                pending
            )
        # Semaphore + worker released here
    except PreprocessError as exc:
        yield exc.error_dict
        return
    except Exception as exc:
        logger.exception("Worker preprocessing failed for request %s", request_id)
        worker_failure = {
            "error": {
                "message": f"Worker error: {exc}",
                "type": "internal_error",
            }
        }
        yield worker_failure
        return

    # --- Between phases: reconstruct main-process objects ---
    chat_request = preproc_result.request_for_sampling
    prompt_tokens = preproc_result.tokens
    engine_request = preproc_result.vllm_preproc

    # A tool parser is only instantiated when one is configured, the request
    # declares tools, and tool use is not disabled via tool_choice="none".
    wants_tool_parser = (
        self.tool_parser_class
        and chat_request.tools
        and chat_request.tool_choice != "none"
    )
    tool_parser = self.tool_parser_class(self.tokenizer) if wants_tool_parser else None

    post = StreamingPostProcessor(
        tokenizer=self.tokenizer,
        request_for_sampling=chat_request,
        sampling_params=preproc_result.sampling_params,
        prompt_token_ids=prompt_tokens,
        tool_parser=tool_parser,
        reasoning_parser_class=self.reasoning_parser_class,
        chat_template_kwargs=preproc_result.chat_template_kwargs,
    )

    # --- Phases 2 and 3: stream from the router through the post-processor ---
    async for item in self._generate_and_stream(
        request_id,
        request,
        preproc_result.dynamo_preproc,
        prompt_tokens,
        engine_request,
        post,
    ):
        yield item
class EngineFactory: class EngineFactory:
def __init__( def __init__(
...@@ -439,7 +689,7 @@ class EngineFactory: ...@@ -439,7 +689,7 @@ class EngineFactory:
else: else:
reasoning_parser_class = None reasoning_parser_class = None
(namespace_name, component_name, endpoint_name) = instance_id.triple() namespace_name, component_name, endpoint_name = instance_id.triple()
generate_endpoint = self.runtime.endpoint( generate_endpoint = self.runtime.endpoint(
f"{namespace_name}.{component_name}.{endpoint_name}" f"{namespace_name}.{component_name}.{endpoint_name}"
) )
...@@ -455,6 +705,45 @@ class EngineFactory: ...@@ -455,6 +705,45 @@ class EngineFactory:
router_mode=self.router_config.router_mode router_mode=self.router_config.router_mode
) )
preprocess_pool = None
preprocess_workers = self.config.preprocess_workers
if preprocess_workers > 0:
logger.info(
"Creating preprocess worker pool with %d workers for model %s",
preprocess_workers,
source_path,
)
preprocess_pool = ProcessPoolExecutor(
max_workers=preprocess_workers,
initializer=_init_worker,
initargs=(
source_path,
tokenizer_mode,
config_format,
load_format,
tool_parser_name,
),
)
# Warm up all workers to ensure initialization completes
futures = [
preprocess_pool.submit(worker_warmup) for _ in range(preprocess_workers)
]
done, not_done = _futures_wait(futures, timeout=120)
if not_done:
for f in not_done:
f.cancel()
preprocess_pool.shutdown(wait=False, cancel_futures=True)
raise RuntimeError(
"Timed out waiting for preprocess worker pool warmup"
)
try:
for f in done:
f.result() # Raises if initializer failed
except Exception:
preprocess_pool.shutdown(wait=False, cancel_futures=True)
raise
logger.info("Preprocess worker pool ready (%d workers)", preprocess_workers)
gen = VllmProcessor( gen = VllmProcessor(
tokenizer, tokenizer,
input_processor, input_processor,
......
...@@ -8,6 +8,7 @@ import os ...@@ -8,6 +8,7 @@ import os
import socket import socket
import sys import sys
import tempfile import tempfile
import warnings
from argparse import Namespace from argparse import Namespace
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Generator, Optional from typing import Any, Dict, Generator, Optional
...@@ -374,6 +375,14 @@ async def parse_args(args: list[str]) -> Config: ...@@ -374,6 +375,14 @@ async def parse_args(args: list[str]) -> Config:
server_args.stream_output = True server_args.stream_output = True
if dynamo_config.use_sglang_tokenizer: if dynamo_config.use_sglang_tokenizer:
warnings.warn(
"--use-sglang-tokenizer is deprecated and will be removed in a future "
"release. Use '--dyn-chat-processor sglang' on the frontend instead, "
"which provides the same SGLang-native pre/post processing with KV "
"router support.",
FutureWarning,
stacklevel=2,
)
logging.info( logging.info(
"Using SGLang's built in tokenizer. Setting skip_tokenizer_init to False" "Using SGLang's built in tokenizer. Setting skip_tokenizer_init to False"
) )
......
...@@ -34,7 +34,10 @@ class DynamoSGLangArgGroup(ArgGroup): ...@@ -34,7 +34,10 @@ class DynamoSGLangArgGroup(ArgGroup):
flag_name="--use-sglang-tokenizer", flag_name="--use-sglang-tokenizer",
env_var="DYN_SGL_USE_TOKENIZER", env_var="DYN_SGL_USE_TOKENIZER",
default=False, default=False,
help="Use SGLang's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend. Cannot be used with --custom-jinja-template.", help="[Deprecated] Use SGLang's tokenizer for pre and post processing. "
"This option will be removed in a future release. Use "
"'--dyn-chat-processor sglang' on the frontend instead, which provides "
"the same SGLang-native pre/post processing with KV router support.",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
......
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: SGLang Chat Processor
subtitle: SGLang-native preprocessing and postprocessing for chat completions
---
The SGLang chat processor enables SGLang-native preprocessing and postprocessing in the Dynamo frontend. It uses SGLang's tokenizer, chat templates, tool call parser, and reasoning parser directly -- bypassing the default Rust preprocessor for `v1/chat/completions` requests.
## When to Use
Use `--dyn-chat-processor sglang` when Dynamo's built-in Rust preprocessor does not yet support a tool call parser or reasoning parser you need. The SGLang processor delegates to SGLang's Python implementations, so any parser SGLang supports works immediately.
Common cases:
- A **tool call format** not yet in the Rust `tool_calling` library
- A **reasoning parser** not yet supported natively
- A **chat template** that the Rust preprocessor doesn't handle correctly
If the parser you need is missing from the Rust preprocessor, consider [opening an issue or PR](https://github.com/ai-dynamo/dynamo/issues) to add native support -- native parsers avoid the Python GIL overhead entirely.
## Quick Start
```bash
# Frontend with SGLang processor, tool calling, and reasoning
python -m dynamo.frontend \
--router-mode kv \
--dyn-chat-processor sglang \
--tool-call-parser hermes \
--reasoning-parser qwen3
# Workers (unchanged)
CUDA_VISIBLE_DEVICES=0 python -m dynamo.sglang \
--model-path Qwen/Qwen3-14B-FP8 \
--served-model-name Qwen/Qwen3-14B-FP8 \
--tp 1 --trust-remote-code \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
```
## Frontend Arguments
These arguments are passed to the **frontend** (not the worker) when using `--dyn-chat-processor sglang`:
| Argument | Default | Description |
|----------|---------|-------------|
| `--dyn-chat-processor sglang` | `dynamo` | Enable the SGLang chat processor (when the flag is omitted, the default `dynamo` Rust preprocessor is used) |
| `--tool-call-parser` | `None` | Tool call parser name (any SGLang-supported parser) |
| `--reasoning-parser` | `None` | Reasoning parser name (any SGLang-supported parser) |
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `DYN_SGLANG_STREAM_INTERVAL` | `20` | Number of tokens to accumulate before detokenizing. Higher values improve throughput. The first chunk always emits immediately (interval=1) to minimize time-to-first-token. |
## Tool Calling
The processor supports all SGLang tool call formats. Pass `--tool-call-parser` on the frontend:
```bash
python -m dynamo.frontend \
--dyn-chat-processor sglang \
--tool-call-parser hermes
```
Any parser supported by SGLang can be used. See the [SGLang documentation](https://docs.sglang.ai/) for the full list of available tool call parsers.
### Example: Tool Call Request
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-14B-FP8",
"messages": [{"role": "user", "content": "What is the weather in Paris?"}],
"tools": [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get weather for a city",
"parameters": {
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"]
}
}
}],
"tool_choice": "auto"
}'
```
Response:
```json
{
"choices": [{
"message": {
"role": "assistant",
"tool_calls": [{
"id": "call_8cd24396f3671048",
"type": "function",
"function": {
"name": "get_weather",
"arguments": "{\"city\": \"Paris\"}"
}
}],
"reasoning_content": "The user wants weather info for Paris..."
},
"finish_reason": "tool_calls"
}]
}
```
## Reasoning Parsing
For models that produce chain-of-thought reasoning (e.g., Qwen3, DeepSeek-R1), pass `--reasoning-parser`:
```bash
python -m dynamo.frontend \
--dyn-chat-processor sglang \
--reasoning-parser qwen3
```
The parser separates the text inside the model's think tags (for example `<think>...</think>`) into the `reasoning_content` field and leaves the regular output in the `content` field.
## Migration from `--use-sglang-tokenizer`
`--use-sglang-tokenizer` on the **worker** is deprecated. Replace with `--dyn-chat-processor sglang` on the **frontend**:
```diff
# Before (deprecated)
- python -m dynamo.sglang --model-path <model> --use-sglang-tokenizer
- python -m dynamo.frontend
# After
python -m dynamo.sglang --model-path <model>
+ python -m dynamo.frontend --dyn-chat-processor sglang
```
Key differences:
| | `--use-sglang-tokenizer` | `--dyn-chat-processor sglang` |
|---|---|---|
| Location | Worker flag | Frontend flag |
| KV router | Not supported | Supported |
| Tool calling | Not supported | Supported |
| Reasoning | Not supported | Supported |
| Endpoints | `v1/chat/completions` only | `v1/chat/completions` only |
## See Also
- **[Tool Calling](../../agents/tool-calling.md)**: General tool calling guide
- **[Reference Guide](sglang-reference-guide.md)**: Full SGLang backend reference
- **[Agentic Workloads](agents.md)**: Priority scheduling and cache pinning for agents
...@@ -35,7 +35,7 @@ These arguments are added by Dynamo on top of SGLang's native arguments. ...@@ -35,7 +35,7 @@ These arguments are added by Dynamo on top of SGLang's native arguments.
| Argument | Env Var | Default | Description | | Argument | Env Var | Default | Description |
|----------|---------|---------|-------------| |----------|---------|---------|-------------|
| `--endpoint` | `DYN_ENDPOINT` | Auto-generated | Dynamo endpoint in `dyn://namespace.component.endpoint` format | | `--endpoint` | `DYN_ENDPOINT` | Auto-generated | Dynamo endpoint in `dyn://namespace.component.endpoint` format |
| `--use-sglang-tokenizer` | `DYN_SGL_USE_TOKENIZER` | `false` | Use SGLang's tokenizer instead of Dynamo's | | `--use-sglang-tokenizer` | `DYN_SGL_USE_TOKENIZER` | `false` | **[Deprecated]** Use `--dyn-chat-processor sglang` on the frontend instead. See [SGLang Chat Processor](sglang-chat-processor.md). |
| `--dyn-tool-call-parser` | `DYN_TOOL_CALL_PARSER` | `None` | [Tool call](../../agents/tool-calling.md) parser (overrides SGLang's `--tool-call-parser`) | | `--dyn-tool-call-parser` | `DYN_TOOL_CALL_PARSER` | `None` | [Tool call](../../agents/tool-calling.md) parser (overrides SGLang's `--tool-call-parser`) |
| `--dyn-reasoning-parser` | `DYN_REASONING_PARSER` | `None` | Reasoning parser for chain-of-thought models | | `--dyn-reasoning-parser` | `DYN_REASONING_PARSER` | `None` | Reasoning parser for chain-of-thought models |
| `--custom-jinja-template` | `DYN_CUSTOM_JINJA_TEMPLATE` | `None` | Custom chat template path (incompatible with `--use-sglang-tokenizer`) | | `--custom-jinja-template` | `DYN_CUSTOM_JINJA_TEMPLATE` | `None` | Custom chat template path (incompatible with `--use-sglang-tokenizer`) |
...@@ -56,10 +56,10 @@ These arguments are added by Dynamo on top of SGLang's native arguments. ...@@ -56,10 +56,10 @@ These arguments are added by Dynamo on top of SGLang's native arguments.
By default, Dynamo handles tokenization and detokenization through its Rust-based frontend, passing `input_ids` to SGLang. This enables all frontend endpoints (`v1/chat/completions`, `v1/completions`, `v1/embeddings`). By default, Dynamo handles tokenization and detokenization through its Rust-based frontend, passing `input_ids` to SGLang. This enables all frontend endpoints (`v1/chat/completions`, `v1/completions`, `v1/embeddings`).
With `--use-sglang-tokenizer`, SGLang handles tokenization internally and Dynamo passes raw prompts. This restricts the frontend to `v1/chat/completions` only. For SGLang-native preprocessing (tool calling, reasoning parsing, chat templates), use `--dyn-chat-processor sglang` on the frontend. See [SGLang Chat Processor](sglang-chat-processor.md) for architecture and usage.
<Warning> <Warning>
`--custom-jinja-template` and `--use-sglang-tokenizer` are mutually exclusive. Custom templates require Dynamo's preprocessor. `--use-sglang-tokenizer` is deprecated. Use `--dyn-chat-processor sglang` on the frontend instead, which provides the same SGLang-native processing with KV router support and the completions endpoint.
</Warning> </Warning>
## Request Cancellation ## Request Cancellation
......
...@@ -149,6 +149,8 @@ navigation: ...@@ -149,6 +149,8 @@ navigation:
contents: contents:
- page: Reference Guide - page: Reference Guide
path: backends/sglang/sglang-reference-guide.md path: backends/sglang/sglang-reference-guide.md
- page: Chat Processor
path: backends/sglang/sglang-chat-processor.md
- page: Examples - page: Examples
path: backends/sglang/sglang-examples.md path: backends/sglang/sglang-examples.md
- page: Disaggregation - page: Disaggregation
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment