feat: Add --custom-jinja-template argument to pass a custom chat template for SGLang (#3165)

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com> Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

feat: Add --custom-jinja-template argument to pass a custom chat template for SGLang (#3165)
Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com> Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
f95ba894 · KrishnanPrash · GitHub · 65401d75 · f95ba894 · f95ba894
Unverified Commit f95ba894 authored Sep 30, 2025 by KrishnanPrash Committed by GitHub Sep 30, 2025
7 changed files
--- a/components/backends/sglang/README.md
+++ b/components/backends/sglang/README.md
@@ -68,6 +68,7 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu
 | `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
 | `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
 | `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |
+| `--custom-jinja-template` | Path to a custom Jinja chat template file for the model (takes precedence over the default chat template in the model repo; cannot be combined with `--use-sglang-tokenizer`) | `None` | `--chat-template` |
 #### Tokenizer Behavior

--- a/components/backends/sglang/src/dynamo/sglang/args.py
+++ b/components/backends/sglang/src/dynamo/sglang/args.py
@@ -49,11 +49,17 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
        "choices": get_reasoning_parser_names(),
        "help": "Reasoning parser name for the model. If not specified, no reasoning parsing is performed.",
    },
+    "custom-jinja-template": {
+        "flags": ["--custom-jinja-template"],
+        "type": str,
+        "default": None,
+        "help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.",
+    },
    "use-sglang-tokenizer": {
        "flags": ["--use-sglang-tokenizer"],
        "action": "store_true",
        "default": False,
-        "help": "Use SGLang's tokenizer. This will skip tokenization of the input and output and only v1/chat/completions will be available when using the dynamo frontend",
+        "help": "Use SGLang's tokenizer. This will skip tokenization of the input and output and only v1/chat/completions will be available when using the dynamo frontend. Cannot be used with --custom-jinja-template.",
    },
 }
@@ -68,6 +74,7 @@ class DynamoArgs:
    # tool and reasoning parser options
    tool_call_parser: Optional[str] = None
    reasoning_parser: Optional[str] = None
+    custom_jinja_template: Optional[str] = None
    # preprocessing options
    use_sglang_tokenizer: bool = False
@@ -198,6 +205,23 @@ def parse_args(args: list[str]) -> Config:
        "reasoning-parser",
    )
+    # --custom-jinja-template is applied by Dynamo's preprocessor, which
+    # --use-sglang-tokenizer bypasses, so the two flags are mutually exclusive.
+    if parsed_args.custom_jinja_template and parsed_args.use_sglang_tokenizer:
+        logging.error(
+            "Cannot use --custom-jinja-template and --use-sglang-tokenizer together. "
+            "--custom-jinja-template requires Dynamo's preprocessor to apply the template, "
+            "while --use-sglang-tokenizer bypasses Dynamo's preprocessor entirely. "
+            "If you want to use the SGLang tokenizer with a custom chat template, "
+            "please use the --chat-template argument from SGLang."
+        )
+        sys.exit(1)
+    # Expand a leading home dir (~) and any environment variables in the
+    # template path. A relative path stays relative; it is not made absolute.
+    expanded_template_path = None
+    if parsed_args.custom_jinja_template:
+        expanded_template_path = os.path.expandvars(
+            os.path.expanduser(parsed_args.custom_jinja_template)
+        )
    dynamo_args = DynamoArgs(
        namespace=parsed_namespace,
        component=parsed_component_name,
@@ -205,6 +229,7 @@ def parse_args(args: list[str]) -> Config:
        migration_limit=parsed_args.migration_limit,
        tool_call_parser=tool_call_parser,
        reasoning_parser=reasoning_parser,
+        custom_jinja_template=expanded_template_path,
        use_sglang_tokenizer=parsed_args.use_sglang_tokenizer,
    )
    logging.debug(f"Dynamo args: {dynamo_args}")

--- a/components/backends/sglang/src/dynamo/sglang/register.py
+++ b/components/backends/sglang/src/dynamo/sglang/register.py
@@ -42,6 +42,7 @@ async def register_llm_with_runtime_config(
            kv_cache_block_size=server_args.page_size,
            migration_limit=dynamo_args.migration_limit,
            runtime_config=runtime_config,
+            custom_template_path=dynamo_args.custom_jinja_template,
        )
        logging.info("Successfully registered LLM with runtime config")
        return True

--- a/tests/serve/fixtures/custom_template.jinja
+++ b/tests/serve/fixtures/custom_template.jinja
+{#- Test fixture: user messages are prefixed with the CUSTOM_TEMPLATE_ACTIVE| marker that tests/serve/launch/template_verifier.py looks for in the decoded prompt. -#}
+{%- for message in messages %}
+{%- if message['role'] == 'system' %}
+<|system|>{{ message['content'] }}
+{%- elif message['role'] == 'user' %}
+CUSTOM_TEMPLATE_ACTIVE| {{ message['content'] }}
+{%- elif message['role'] == 'assistant' %}
+<|assistant|>{{ message['content'] }}
+{%- endif %}
+{%- endfor %}
+<|assistant|>
--- a/tests/serve/launch/template_verifier.py
+++ b/tests/serve/launch/template_verifier.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import sys
+from pathlib import Path
+import uvloop
+from transformers import AutoTokenizer
+from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.runtime import DistributedRuntime, dynamo_worker
+SERVE_TEST_DIR = "/workspace/tests/serve"
+class TemplateVerificationHandler:
+    """Mock backend that reports whether the custom chat template was applied.
+
+    The handler decodes the token ids of each incoming request and looks for
+    the marker text that the custom template inserts into user messages.
+    """
+    def __init__(self, model_name="Qwen/Qwen3-0.6B"):
+        # Marker text searched for in the decoded prompt.
+        self.template_marker = "CUSTOM_TEMPLATE_ACTIVE|"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+    async def generate(self, request, context):
+        """Yield a single chunk of token ids stating whether the marker was found."""
+        prompt_text = self.tokenizer.decode(request.get("token_ids", []))
+        verdict = (
+            "Successfully Applied Chat Template"
+            if self.template_marker in prompt_text
+            else "Failed to Apply Chat Template"
+        )
+        # The frontend detokenizes the response, so reply with token ids, not text.
+        yield {
+            "token_ids": self.tokenizer.encode(verdict, add_special_tokens=False)
+        }
+@dynamo_worker(static=False)
+async def main(runtime: DistributedRuntime):
+    """Register the test model with the fixture chat template and serve the verifier."""
+    # Set up the test/backend service and its "generate" endpoint.
+    backend = runtime.namespace("test").component("backend")
+    await backend.create_service()
+    generate_endpoint = backend.endpoint("generate")
+    # The template lives with the test fixtures; bail out early if it is missing.
+    fixture_template = Path(SERVE_TEST_DIR, "fixtures", "custom_template.jinja")
+    if not fixture_template.exists():
+        print(f"Error: Template not found at {fixture_template}")
+        sys.exit(1)
+    # Register the model, passing the fixture as its custom template.
+    model_name = "Qwen/Qwen3-0.6B"
+    await register_llm(
+        ModelInput.Tokens,
+        ModelType.Chat,
+        generate_endpoint,
+        model_name,
+        model_name=model_name,
+        custom_template_path=str(fixture_template),
+    )
+    # Serve requests through the template verification handler.
+    await generate_endpoint.serve_endpoint(
+        TemplateVerificationHandler(model_name).generate
+    )
+if __name__ == "__main__":
+    uvloop.run(main())
--- a/tests/serve/launch/template_verifier.sh
+++ b/tests/serve/launch/template_verifier.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_DIR="$(dirname "$SCRIPT_DIR")"
+# Stop every background process this script started. Runs on exit and on
+# INT/TERM, so it may be invoked more than once; each step tolerates that.
+cleanup() {
+    echo "Cleaning up background processes..."
+    # A PID is empty if the trap fires before that process was started.
+    for pid in "${WORKER_PID:-}" "${FRONTEND_PID:-}"; do
+        [ -n "$pid" ] || continue
+        kill "$pid" 2>/dev/null || true
+        wait "$pid" 2>/dev/null || true
+    done
+    echo "Cleanup complete."
+}
+trap cleanup EXIT INT TERM
+# run ingress
+python3 -m dynamo.frontend --http-port=8000 &
+FRONTEND_PID=$!
+# run the mock worker + template validation generate()
+cd "$SCRIPT_DIR"
+# Do NOT exec the worker: exec replaces this shell and discards the trap
+# above, which would leave the frontend running after the worker exits.
+# Running it in the background and waiting also lets INT/TERM interrupt the
+# wait so cleanup runs promptly.
+python template_verifier.py &
+WORKER_PID=$!
+wait "$WORKER_PID"
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -68,6 +68,24 @@ sglang_configs = {
            )
        ],
    ),
+    "template_verification": SGLangConfig(
+        # Tests custom jinja template preprocessing by verifying the template
+        # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
+        # The backend (launch/template_verifier.*) checks for this marker
+        # and returns "Successfully Applied Chat Template" if found.
+        name="template_verification",
+        directory=SERVE_TEST_DIR,
+        script_name="template_verifier.sh",
+        marks=[pytest.mark.gpu_1],
+        model="Qwen/Qwen3-0.6B",
+        env={},
+        models_port=8000,
+        request_payloads=[
+            chat_payload_default(
+                expected_response=["Successfully Applied Chat Template"]
+            )
+        ],
+    ),
 }