"docs/backends/vscode:/vscode.git/clone" did not exist on "fd8410da6afcc2ad2b40819867f6780cfdcf0628"
Unverified Commit f95ba894 authored by KrishnanPrash's avatar KrishnanPrash Committed by GitHub
Browse files

feat: Add --custom-jinja-template argument to pass a custom chat template for SGLang (#3165)


Signed-off-by: default avatarKrishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: default avatarKrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 65401d75
...@@ -68,6 +68,7 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu ...@@ -68,6 +68,7 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu
| `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` | | `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
| `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` | | `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
| `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A | | `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |
| `--custom-jinja-template` | Path to a custom Jinja chat template file for the model (takes precedence over the default chat template in the model repo; cannot be combined with `--use-sglang-tokenizer`) | `None` | `--chat-template` |
#### Tokenizer Behavior #### Tokenizer Behavior
......
...@@ -49,11 +49,17 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = { ...@@ -49,11 +49,17 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"choices": get_reasoning_parser_names(), "choices": get_reasoning_parser_names(),
"help": "Reasoning parser name for the model. If not specified, no reasoning parsing is performed.", "help": "Reasoning parser name for the model. If not specified, no reasoning parsing is performed.",
}, },
"custom-jinja-template": {
"flags": ["--custom-jinja-template"],
"type": str,
"default": None,
"help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.",
},
"use-sglang-tokenizer": { "use-sglang-tokenizer": {
"flags": ["--use-sglang-tokenizer"], "flags": ["--use-sglang-tokenizer"],
"action": "store_true", "action": "store_true",
"default": False, "default": False,
"help": "Use SGLang's tokenizer. This will skip tokenization of the input and output and only v1/chat/completions will be available when using the dynamo frontend", "help": "Use SGLang's tokenizer. This will skip tokenization of the input and output and only v1/chat/completions will be available when using the dynamo frontend. Cannot be used with --custom-jinja-template.",
}, },
} }
...@@ -68,6 +74,7 @@ class DynamoArgs: ...@@ -68,6 +74,7 @@ class DynamoArgs:
# tool and reasoning parser options # tool and reasoning parser options
tool_call_parser: Optional[str] = None tool_call_parser: Optional[str] = None
reasoning_parser: Optional[str] = None reasoning_parser: Optional[str] = None
custom_jinja_template: Optional[str] = None
# preprocessing options # preprocessing options
use_sglang_tokenizer: bool = False use_sglang_tokenizer: bool = False
...@@ -198,6 +205,23 @@ def parse_args(args: list[str]) -> Config: ...@@ -198,6 +205,23 @@ def parse_args(args: list[str]) -> Config:
"reasoning-parser", "reasoning-parser",
) )
# --custom-jinja-template and --use-sglang-tokenizer are mutually exclusive:
# log the reason and exit with status 1 when both are given.
if parsed_args.custom_jinja_template and parsed_args.use_sglang_tokenizer:
    logging.error(
        "Cannot use --custom-jinja-template and --use-sglang-tokenizer together. "
        "--custom-jinja-template requires Dynamo's preprocessor to apply the template, "
        # Trailing space added: the adjacent literals are concatenated, and
        # without it the message read "entirely.If you want ...".
        "while --use-sglang-tokenizer bypasses Dynamo's preprocessor entirely. "
        "If you want to use the SGLang tokenizer with a custom chat template, "
        "please use the --chat-template argument from SGLang."
    )
    sys.exit(1)
# Replaces any environment variables or home dir (~) to get absolute path
expanded_template_path = None
if parsed_args.custom_jinja_template:
expanded_template_path = os.path.expandvars(
os.path.expanduser(parsed_args.custom_jinja_template)
)
dynamo_args = DynamoArgs( dynamo_args = DynamoArgs(
namespace=parsed_namespace, namespace=parsed_namespace,
component=parsed_component_name, component=parsed_component_name,
...@@ -205,6 +229,7 @@ def parse_args(args: list[str]) -> Config: ...@@ -205,6 +229,7 @@ def parse_args(args: list[str]) -> Config:
migration_limit=parsed_args.migration_limit, migration_limit=parsed_args.migration_limit,
tool_call_parser=tool_call_parser, tool_call_parser=tool_call_parser,
reasoning_parser=reasoning_parser, reasoning_parser=reasoning_parser,
custom_jinja_template=expanded_template_path,
use_sglang_tokenizer=parsed_args.use_sglang_tokenizer, use_sglang_tokenizer=parsed_args.use_sglang_tokenizer,
) )
logging.debug(f"Dynamo args: {dynamo_args}") logging.debug(f"Dynamo args: {dynamo_args}")
......
...@@ -42,6 +42,7 @@ async def register_llm_with_runtime_config( ...@@ -42,6 +42,7 @@ async def register_llm_with_runtime_config(
kv_cache_block_size=server_args.page_size, kv_cache_block_size=server_args.page_size,
migration_limit=dynamo_args.migration_limit, migration_limit=dynamo_args.migration_limit,
runtime_config=runtime_config, runtime_config=runtime_config,
custom_template_path=dynamo_args.custom_jinja_template,
) )
logging.info("Successfully registered LLM with runtime config") logging.info("Successfully registered LLM with runtime config")
return True return True
......
{#- Test fixture chat template: prefixes every user message with the CUSTOM_TEMPLATE_ACTIVE| marker that template_verifier.py looks for in the decoded prompt. -#}
{%- for message in messages %}
{%- if message['role'] == 'system' %}
<|system|>{{ message['content'] }}
{%- elif message['role'] == 'user' %}
CUSTOM_TEMPLATE_ACTIVE| {{ message['content'] }}
{%- elif message['role'] == 'assistant' %}
<|assistant|>{{ message['content'] }}
{%- endif %}
{%- endfor %}
<|assistant|>
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import sys
from pathlib import Path
import uvloop
from transformers import AutoTokenizer
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
SERVE_TEST_DIR = "/workspace/tests/serve"
class TemplateVerificationHandler:
    """Mock backend that reports whether the custom chat template was applied.

    The handler decodes the token ids of an incoming request and searches the
    resulting text for the marker string that the custom template places in
    front of user messages.
    """

    def __init__(self, model_name="Qwen/Qwen3-0.6B"):
        # Marker searched for in the decoded prompt.
        self.template_marker = "CUSTOM_TEMPLATE_ACTIVE|"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    async def generate(self, request, context):
        """Yield a single chunk whose token ids encode the verification verdict."""
        prompt_text = self.tokenizer.decode(request.get("token_ids", []))
        marker_found = self.template_marker in prompt_text
        verdict = (
            "Successfully Applied Chat Template"
            if marker_found
            else "Failed to Apply Chat Template"
        )
        # Reply with token ids rather than text: the frontend detokenizes them.
        yield {
            "token_ids": self.tokenizer.encode(verdict, add_special_tokens=False)
        }
@dynamo_worker(static=False)
async def main(runtime: DistributedRuntime):
    """Register the verifier model with the custom chat template and serve it."""
    # Expose a `generate` endpoint on the test/backend component.
    backend = runtime.namespace("test").component("backend")
    await backend.create_service()
    generate_endpoint = backend.endpoint("generate")

    # Reuse the template shipped with the serve-test fixtures; exit with
    # status 1 if it is missing.
    template_file = Path(SERVE_TEST_DIR, "fixtures", "custom_template.jinja")
    if not template_file.exists():
        print(f"Error: Template not found at {template_file}")
        sys.exit(1)

    # Register the model for token input together with the custom template.
    served_model = "Qwen/Qwen3-0.6B"
    await register_llm(
        ModelInput.Tokens,
        ModelType.Chat,
        generate_endpoint,
        served_model,
        model_name=served_model,
        custom_template_path=str(template_file),
    )

    # Serve requests through the verification handler.
    verifier = TemplateVerificationHandler(served_model)
    await generate_endpoint.serve_endpoint(verifier.generate)
# Script entry point: run the worker coroutine on the uvloop event loop.
# NOTE(review): main() is called without arguments; presumably the
# dynamo_worker decorator supplies the runtime - confirm.
if __name__ == "__main__":
    uvloop.run(main())
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# PIDs of the processes started below; empty until each one is launched so
# that cleanup is safe to run at any point.
FRONTEND_PID=""
WORKER_PID=""

# Stop the worker and the frontend (whichever were started) and reap them.
# Runs once, from the EXIT trap.
cleanup() {
    trap - EXIT INT TERM
    echo "Cleaning up background processes..."
    for pid in "$WORKER_PID" "$FRONTEND_PID"; do
        [ -n "$pid" ] || continue
        kill "$pid" 2>/dev/null || true
        wait "$pid" 2>/dev/null || true
    done
    echo "Cleanup complete."
}

# Funnel every way of leaving the script through the EXIT trap.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# run ingress
python3 -m dynamo.frontend --http-port=8000 &
FRONTEND_PID=$!

# run the mock worker + template validation generate()
# The worker is NOT started with `exec`: exec would replace this shell, the
# traps above would never fire, and the frontend would be left running.
# Running it in the background and blocking in `wait` keeps the shell alive
# and lets it react to INT/TERM immediately; with `set -e` the worker's
# non-zero exit status is propagated as the script's exit status.
cd "$SCRIPT_DIR"
python template_verifier.py &
WORKER_PID=$!
wait "$WORKER_PID"
...@@ -68,6 +68,24 @@ sglang_configs = { ...@@ -68,6 +68,24 @@ sglang_configs = {
) )
], ],
), ),
"template_verification": SGLangConfig(
# Tests custom jinja template preprocessing by verifying the template
# marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
# The backend (launch/template_verifier.*) checks for this marker
# and returns "Successfully Applied Chat Template" if found.
name="template_verification",
directory=SERVE_TEST_DIR,
script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
request_payloads=[
chat_payload_default(
expected_response=["Successfully Applied Chat Template"]
)
],
),
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment