feat: Adding --dyn-endpoint-types flag (#4619)

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>

feat: Adding --dyn-endpoint-types flag (#4619)
Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
03070fd5 · KrishnanPrash · GitHub · 44e8600a · 03070fd5 · 03070fd5
Unverified Commit 03070fd5 authored Dec 02, 2025 by KrishnanPrash Committed by GitHub Dec 02, 2025
14 changed files
--- a/components/src/dynamo/common/utils/__init__.py
+++ b/components/src/dynamo/common/utils/__init__.py
@@ -8,10 +8,11 @@ This module contains shared utility functions used across multiple
 Dynamo backends and components.
 Submodules:
+    - endpoint_types: Endpoint type parsing utilities
    - paths: Workspace directory detection and path utilities
    - prometheus: Prometheus metrics collection and logging utilities
 """
-from dynamo.common.utils import paths, prometheus
+from dynamo.common.utils import endpoint_types, paths, prometheus
-__all__ = ["paths", "prometheus"]
+__all__ = ["endpoint_types", "paths", "prometheus"]
--- a/components/src/dynamo/common/utils/endpoint_types.py
+++ b/components/src/dynamo/common/utils/endpoint_types.py
+#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+"""Utilities for parsing and handling endpoint types."""
+from dynamo.llm import ModelType
+def parse_endpoint_types(endpoint_types_str: str) -> ModelType:
+    """Convert a comma-separated endpoint list into combined ModelType flags.
+
+    Matching is case-insensitive and whitespace around each entry is
+    ignored; empty entries (for example from a trailing comma) are dropped.
+
+    Args:
+        endpoint_types_str: Endpoint names separated by commas. Recognised
+            names are 'chat' and 'completions', e.g. 'chat,completions'.
+
+    Returns:
+        The flags of every listed endpoint, combined with bitwise OR.
+
+    Raises:
+        ValueError: If the string is empty or blank, consists only of
+            separators, or names an unrecognised endpoint type.
+    """
+    # A missing or whitespace-only argument gets its own error message.
+    if not (endpoint_types_str and endpoint_types_str.strip()):
+        raise ValueError("Endpoint types string cannot be empty")
+
+    names = []
+    for piece in endpoint_types_str.split(","):
+        cleaned = piece.strip()
+        if cleaned:
+            names.append(cleaned.lower())
+    # Reached for separator-only input such as ",".
+    if not names:
+        raise ValueError("No valid endpoint types provided")
+
+    combined = None
+    for name in names:
+        # Validate first so an unknown name fails before any flag lookup.
+        if name not in ("chat", "completions"):
+            raise ValueError(
+                f"Invalid endpoint type: '{name}'. Valid options: 'chat', 'completions'"
+            )
+        flag = ModelType.Chat if name == "chat" else ModelType.Completions
+        combined = flag if combined is None else combined | flag
+    return combined
--- a/components/src/dynamo/sglang/args.py
+++ b/components/src/dynamo/sglang/args.py
@@ -60,6 +60,12 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
        "default": None,
        "help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.",
    },
+    "endpoint-types": {
+        "flags": ["--dyn-endpoint-types"],
+        "type": str,
+        "default": "chat,completions",
+        "help": "Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
+    },
    "use-sglang-tokenizer": {
        "flags": ["--use-sglang-tokenizer"],
        "action": "store_true",
@@ -127,6 +133,9 @@ class DynamoArgs:
    reasoning_parser: Optional[str] = None
    custom_jinja_template: Optional[str] = None
+    # endpoint types to enable
+    dyn_endpoint_types: str = "chat,completions"
    # preprocessing options
    use_sglang_tokenizer: bool = False
@@ -461,6 +470,7 @@ async def parse_args(args: list[str]) -> Config:
        tool_call_parser=tool_call_parser,
        reasoning_parser=reasoning_parser,
        custom_jinja_template=expanded_template_path,
+        dyn_endpoint_types=parsed_args.dyn_endpoint_types,
        use_sglang_tokenizer=parsed_args.use_sglang_tokenizer,
        multimodal_processor=parsed_args.multimodal_processor,
        multimodal_encode_worker=parsed_args.multimodal_encode_worker,

--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -11,6 +11,7 @@ import sglang as sgl
 import uvloop
 from dynamo.common.config_dump import dump_config
+from dynamo.common.utils.endpoint_types import parse_endpoint_types
 from dynamo.llm import ModelInput, ModelType
 from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -156,6 +157,18 @@ async def init(runtime: DistributedRuntime, config: Config):
    health_check_payload = SglangHealthCheckPayload(engine).to_dict()
+    logging.info(
+        f"Registering model with endpoint types: {dynamo_args.dyn_endpoint_types}"
+    )
+    if (
+        dynamo_args.custom_jinja_template
+        and "chat" not in dynamo_args.dyn_endpoint_types
+    ):
+        logging.warning(
+            "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
+            "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
+        )
    try:
        # Start endpoint immediately and register model concurrently
        # Requests queue until ready_event is set (TODO: Part of new PR)
@@ -171,6 +184,7 @@ async def init(runtime: DistributedRuntime, config: Config):
                generate_endpoint,
                server_args,
                dynamo_args,
+                output_type=parse_endpoint_types(dynamo_args.dyn_endpoint_types),
                readiness_gate=ready_event,
            ),
        )

--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -37,6 +37,7 @@ from transformers import AutoConfig
 import dynamo.nixl_connect as nixl_connect
 from dynamo.common.config_dump import dump_config
+from dynamo.common.utils.endpoint_types import parse_endpoint_types
 from dynamo.common.utils.prometheus import register_engine_metrics_callback
 from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime
@@ -250,7 +251,17 @@ async def init(runtime: DistributedRuntime, config: Config):
    if config.disaggregation_mode == DisaggregationMode.PREFILL:
        model_type = ModelType.Prefill
    else:
-        model_type = ModelType.Chat | ModelType.Completions
+        model_type = parse_endpoint_types(config.dyn_endpoint_types)
+        logging.info(
+            f"Registering model with endpoint types: {config.dyn_endpoint_types}"
+        )
+        # Warn if custom template provided but chat endpoint not enabled
+        if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
+            logging.warning(
+                "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
+                "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
+            )
    multimodal_processor = None

--- a/components/src/dynamo/trtllm/utils/trtllm_utils.py
+++ b/components/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -58,6 +58,7 @@ class Config:
        self.tool_call_parser: Optional[str] = None
        self.dump_config_to: Optional[str] = None
        self.custom_jinja_template: Optional[str] = None
+        self.dyn_endpoint_types: str = "chat,completions"
        self.store_kv: str = ""
        self.request_plane: str = ""
@@ -282,6 +283,12 @@ def cmd_line_args():
        default=None,
        help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
    )
+    parser.add_argument(
+        "--dyn-endpoint-types",
+        type=str,
+        default="chat,completions",
+        help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
+    )
    parser.add_argument(
        "--store-kv",
        type=str,
@@ -355,6 +362,7 @@ def cmd_line_args():
    config.reasoning_parser = args.dyn_reasoning_parser
    config.tool_call_parser = args.dyn_tool_call_parser
    config.dump_config_to = args.dump_config_to
+    config.dyn_endpoint_types = args.dyn_endpoint_types
    config.store_kv = args.store_kv
    config.request_plane = args.request_plane

--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -55,6 +55,9 @@ class Config:
    tool_call_parser: Optional[str] = None
    reasoning_parser: Optional[str] = None
+    # endpoint types to enable
+    dyn_endpoint_types: str = "chat,completions"
    # multimodal options
    multimodal_processor: bool = False
    multimodal_encode_worker: bool = False
@@ -135,6 +138,12 @@ def parse_args() -> Config:
        default=None,
        help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
    )
+    parser.add_argument(
+        "--dyn-endpoint-types",
+        type=str,
+        default="chat,completions",
+        help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
+    )
    parser.add_argument(
        "--multimodal-processor",
        action="store_true",
@@ -266,6 +275,7 @@ def parse_args() -> Config:
    config.tool_call_parser = args.dyn_tool_call_parser
    config.reasoning_parser = args.dyn_reasoning_parser
    config.custom_jinja_template = args.custom_jinja_template
+    config.dyn_endpoint_types = args.dyn_endpoint_types
    config.multimodal_processor = args.multimodal_processor
    config.multimodal_encode_worker = args.multimodal_encode_worker
    config.multimodal_worker = args.multimodal_worker

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -15,6 +15,7 @@ from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
 from dynamo.common.config_dump import dump_config
+from dynamo.common.utils.endpoint_types import parse_endpoint_types
 from dynamo.llm import (
    ModelInput,
    ModelRuntimeConfig,
@@ -519,9 +520,22 @@ async def init(runtime: DistributedRuntime, config: Config):
        )
    if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register
+        # Parse endpoint types from --dyn-endpoint-types flag
+        model_type = parse_endpoint_types(config.dyn_endpoint_types)
+        logger.info(
+            f"Registering model with endpoint types: {config.dyn_endpoint_types}"
+        )
+        # Warn if custom template provided but chat endpoint not enabled
+        if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
+            logger.warning(
+                "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
+                "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
+            )
        await register_vllm_model(
            ModelInput.Tokens,
-            ModelType.Chat | ModelType.Completions,
+            model_type,
            generate_endpoint,
            config,
            engine_client,

--- a/examples/backends/sglang/launch/agg.sh
+++ b/examples/backends/sglang/launch/agg.sh
@@ -11,10 +11,18 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM
-# Parse command line arguments
+# Default values
+MODEL="Qwen/Qwen3-0.6B"
 ENABLE_OTEL=false
+# Parse command line arguments
+EXTRA_ARGS=()
 while [[ $# -gt 0 ]]; do
    case $1 in
+        --model-path)
+            MODEL="$2"
+            shift 2
+            ;;
        --enable-otel)
            ENABLE_OTEL=true
            shift
@@ -22,16 +30,17 @@ while [[ $# -gt 0 ]]; do
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
+            echo "  --model-path <name>  Specify model (default: $MODEL)"
            echo "  --enable-otel        Enable OpenTelemetry tracing"
            echo "  -h, --help           Show this help message"
            echo ""
+            echo "Additional SGLang/Dynamo flags can be passed and will be forwarded"
            echo "Note: System metrics are enabled by default on port 8081 (worker)"
            exit 0
            ;;
        *)
-            echo "Unknown option: $1"
+            EXTRA_ARGS+=("$1")
-            echo "Use --help for usage information"
+            shift
-            exit 1
            ;;
    esac
 done
@@ -52,10 +61,11 @@ DYNAMO_PID=$!
 # run worker with metrics enabled
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
 python3 -m dynamo.sglang \
-  --model-path Qwen/Qwen3-0.6B \
+  --model-path "$MODEL" \
-  --served-model-name Qwen/Qwen3-0.6B \
+  --served-model-name "$MODEL" \
  --page-size 16 \
  --tp 1 \
  --trust-remote-code \
  --skip-tokenizer-init \
-  --enable-metrics
+  --enable-metrics \
+  "${EXTRA_ARGS[@]}"
--- a/examples/backends/trtllm/launch/agg.sh
+++ b/examples/backends/trtllm/launch/agg.sh
@@ -27,8 +27,10 @@ python3 -m dynamo.frontend &
 DYNAMO_PID=$!
 # run worker
+# Any additional command line args given to this script are forwarded to the worker
 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --modality "$MODALITY" \
-  --extra-engine-args "$AGG_ENGINE_ARGS"
+  --extra-engine-args "$AGG_ENGINE_ARGS" \
+  "$@"
--- a/examples/backends/vllm/launch/agg.sh
+++ b/examples/backends/vllm/launch/agg.sh
@@ -4,6 +4,24 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
+# Default model
+MODEL="Qwen/Qwen3-0.6B"
+# Parse command line arguments: --model is consumed here, every other
+# argument is collected and forwarded verbatim to the dynamo.vllm worker.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Fail loudly when the value is missing; a bare `shift 2` would
+            # otherwise abort under `set -e` without any explanation.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
@@ -11,4 +29,4 @@ python -m dynamo.frontend &
 # run worker
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
-    python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
+    python -m dynamo.vllm --model "$MODEL" --enforce-eager --connector none "${EXTRA_ARGS[@]}"
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -188,6 +188,22 @@ sglang_configs = {
            ),
        ],
    ),
+    "completions_only": SGLangConfig(
+        name="completions_only",
+        directory=sglang_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1],
+        model="deepseek-ai/deepseek-llm-7b-base",
+        script_args=[
+            "--model-path",
+            "deepseek-ai/deepseek-llm-7b-base",
+            "--dyn-endpoint-types",
+            "completions",
+        ],
+        request_payloads=[
+            completion_payload_default(),
+        ],
+    ),
 }

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -117,6 +117,21 @@ trtllm_configs = {
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
+    "completions_only": TRTLLMConfig(
+        name="completions_only",
+        directory=trtllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
+        model="deepseek-ai/deepseek-llm-7b-base",
+        script_args=["--dyn-endpoint-types", "completions"],
+        env={
+            "MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
+            "SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
+        },
+        request_payloads=[
+            completion_payload_default(),
+        ],
+    ),
 }

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -326,6 +326,22 @@ vllm_configs = {
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
+    "completions_only": VLLMConfig(
+        name="completions_only",
+        directory=vllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1],
+        model="deepseek-ai/deepseek-llm-7b-base",
+        script_args=[
+            "--model",
+            "deepseek-ai/deepseek-llm-7b-base",
+            "--dyn-endpoint-types",
+            "completions",
+        ],
+        request_payloads=[
+            completion_payload_default(),
+        ],
+    ),
 }