Unverified commit 03070fd5, authored by KrishnanPrash and committed by GitHub
Browse files

feat: Adding --dyn-endpoint-types flag (#4619)


Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
parent 44e8600a
...@@ -8,10 +8,11 @@ This module contains shared utility functions used across multiple ...@@ -8,10 +8,11 @@ This module contains shared utility functions used across multiple
Dynamo backends and components. Dynamo backends and components.
Submodules: Submodules:
- endpoint_types: Endpoint type parsing utilities
- paths: Workspace directory detection and path utilities - paths: Workspace directory detection and path utilities
- prometheus: Prometheus metrics collection and logging utilities - prometheus: Prometheus metrics collection and logging utilities
""" """
from dynamo.common.utils import paths, prometheus from dynamo.common.utils import endpoint_types, paths, prometheus
__all__ = ["paths", "prometheus"] __all__ = ["endpoint_types", "paths", "prometheus"]
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for parsing and handling endpoint types."""
from dynamo.llm import ModelType
def parse_endpoint_types(endpoint_types_str: str) -> ModelType:
    """Turn a comma-separated endpoint list into a combined ModelType value.

    Each comma-separated token is stripped of surrounding whitespace and
    lowercased before matching, and tokens that are empty after stripping
    (for example from a trailing comma) are ignored.

    Args:
        endpoint_types_str: Endpoint names separated by commas. The
            recognized names are 'chat' and 'completions', e.g. 'chat',
            'completions' or 'chat,completions'.

    Returns:
        ModelType.Chat for 'chat', ModelType.Completions for 'completions',
        or the two joined with ``|`` when both are listed.

    Raises:
        ValueError: If the input is empty or whitespace-only, if it holds
            no non-empty tokens, or if any token is not a recognized
            endpoint name.
    """
    if not (endpoint_types_str and endpoint_types_str.strip()):
        raise ValueError("Endpoint types string cannot be empty")

    # Normalize every piece; drop the blanks left behind by stray commas.
    requested = []
    for piece in endpoint_types_str.split(","):
        name = piece.strip().lower()
        if name:
            requested.append(name)

    if not requested:
        raise ValueError("No valid endpoint types provided")

    combined = None
    for name in requested:
        # Reject unknown names in input order, so the first bad token is
        # the one reported.
        if name not in ("chat", "completions"):
            raise ValueError(
                f"Invalid endpoint type: '{name}'. Valid options: 'chat', 'completions'"
            )
        flag = ModelType.Chat if name == "chat" else ModelType.Completions
        combined = flag if combined is None else combined | flag
    return combined
...@@ -60,6 +60,12 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = { ...@@ -60,6 +60,12 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"default": None, "default": None,
"help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.", "help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.",
}, },
"endpoint-types": {
"flags": ["--dyn-endpoint-types"],
"type": str,
"default": "chat,completions",
"help": "Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
},
"use-sglang-tokenizer": { "use-sglang-tokenizer": {
"flags": ["--use-sglang-tokenizer"], "flags": ["--use-sglang-tokenizer"],
"action": "store_true", "action": "store_true",
...@@ -127,6 +133,9 @@ class DynamoArgs: ...@@ -127,6 +133,9 @@ class DynamoArgs:
reasoning_parser: Optional[str] = None reasoning_parser: Optional[str] = None
custom_jinja_template: Optional[str] = None custom_jinja_template: Optional[str] = None
# endpoint types to enable
dyn_endpoint_types: str = "chat,completions"
# preprocessing options # preprocessing options
use_sglang_tokenizer: bool = False use_sglang_tokenizer: bool = False
...@@ -461,6 +470,7 @@ async def parse_args(args: list[str]) -> Config: ...@@ -461,6 +470,7 @@ async def parse_args(args: list[str]) -> Config:
tool_call_parser=tool_call_parser, tool_call_parser=tool_call_parser,
reasoning_parser=reasoning_parser, reasoning_parser=reasoning_parser,
custom_jinja_template=expanded_template_path, custom_jinja_template=expanded_template_path,
dyn_endpoint_types=parsed_args.dyn_endpoint_types,
use_sglang_tokenizer=parsed_args.use_sglang_tokenizer, use_sglang_tokenizer=parsed_args.use_sglang_tokenizer,
multimodal_processor=parsed_args.multimodal_processor, multimodal_processor=parsed_args.multimodal_processor,
multimodal_encode_worker=parsed_args.multimodal_encode_worker, multimodal_encode_worker=parsed_args.multimodal_encode_worker,
......
...@@ -11,6 +11,7 @@ import sglang as sgl ...@@ -11,6 +11,7 @@ import sglang as sgl
import uvloop import uvloop
from dynamo.common.config_dump import dump_config from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.llm import ModelInput, ModelType from dynamo.llm import ModelInput, ModelType
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -156,6 +157,18 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -156,6 +157,18 @@ async def init(runtime: DistributedRuntime, config: Config):
health_check_payload = SglangHealthCheckPayload(engine).to_dict() health_check_payload = SglangHealthCheckPayload(engine).to_dict()
logging.info(
f"Registering model with endpoint types: {dynamo_args.dyn_endpoint_types}"
)
if (
dynamo_args.custom_jinja_template
and "chat" not in dynamo_args.dyn_endpoint_types
):
logging.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
try: try:
# Start endpoint immediately and register model concurrently # Start endpoint immediately and register model concurrently
# Requests queue until ready_event is set (TODO: Part of new PR) # Requests queue until ready_event is set (TODO: Part of new PR)
...@@ -171,6 +184,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -171,6 +184,7 @@ async def init(runtime: DistributedRuntime, config: Config):
generate_endpoint, generate_endpoint,
server_args, server_args,
dynamo_args, dynamo_args,
output_type=parse_endpoint_types(dynamo_args.dyn_endpoint_types),
readiness_gate=ready_event, readiness_gate=ready_event,
), ),
) )
......
...@@ -37,6 +37,7 @@ from transformers import AutoConfig ...@@ -37,6 +37,7 @@ from transformers import AutoConfig
import dynamo.nixl_connect as nixl_connect import dynamo.nixl_connect as nixl_connect
from dynamo.common.config_dump import dump_config from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.prometheus import register_engine_metrics_callback from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
...@@ -250,7 +251,17 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -250,7 +251,17 @@ async def init(runtime: DistributedRuntime, config: Config):
if config.disaggregation_mode == DisaggregationMode.PREFILL: if config.disaggregation_mode == DisaggregationMode.PREFILL:
model_type = ModelType.Prefill model_type = ModelType.Prefill
else: else:
model_type = ModelType.Chat | ModelType.Completions model_type = parse_endpoint_types(config.dyn_endpoint_types)
logging.info(
f"Registering model with endpoint types: {config.dyn_endpoint_types}"
)
# Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
logging.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
multimodal_processor = None multimodal_processor = None
......
...@@ -58,6 +58,7 @@ class Config: ...@@ -58,6 +58,7 @@ class Config:
self.tool_call_parser: Optional[str] = None self.tool_call_parser: Optional[str] = None
self.dump_config_to: Optional[str] = None self.dump_config_to: Optional[str] = None
self.custom_jinja_template: Optional[str] = None self.custom_jinja_template: Optional[str] = None
self.dyn_endpoint_types: str = "chat,completions"
self.store_kv: str = "" self.store_kv: str = ""
self.request_plane: str = "" self.request_plane: str = ""
...@@ -282,6 +283,12 @@ def cmd_line_args(): ...@@ -282,6 +283,12 @@ def cmd_line_args():
default=None, default=None,
help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.", help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
) )
parser.add_argument(
"--dyn-endpoint-types",
type=str,
default="chat,completions",
help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
)
parser.add_argument( parser.add_argument(
"--store-kv", "--store-kv",
type=str, type=str,
...@@ -355,6 +362,7 @@ def cmd_line_args(): ...@@ -355,6 +362,7 @@ def cmd_line_args():
config.reasoning_parser = args.dyn_reasoning_parser config.reasoning_parser = args.dyn_reasoning_parser
config.tool_call_parser = args.dyn_tool_call_parser config.tool_call_parser = args.dyn_tool_call_parser
config.dump_config_to = args.dump_config_to config.dump_config_to = args.dump_config_to
config.dyn_endpoint_types = args.dyn_endpoint_types
config.store_kv = args.store_kv config.store_kv = args.store_kv
config.request_plane = args.request_plane config.request_plane = args.request_plane
......
...@@ -55,6 +55,9 @@ class Config: ...@@ -55,6 +55,9 @@ class Config:
tool_call_parser: Optional[str] = None tool_call_parser: Optional[str] = None
reasoning_parser: Optional[str] = None reasoning_parser: Optional[str] = None
# endpoint types to enable
dyn_endpoint_types: str = "chat,completions"
# multimodal options # multimodal options
multimodal_processor: bool = False multimodal_processor: bool = False
multimodal_encode_worker: bool = False multimodal_encode_worker: bool = False
...@@ -135,6 +138,12 @@ def parse_args() -> Config: ...@@ -135,6 +138,12 @@ def parse_args() -> Config:
default=None, default=None,
help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.", help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
) )
parser.add_argument(
"--dyn-endpoint-types",
type=str,
default="chat,completions",
help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
)
parser.add_argument( parser.add_argument(
"--multimodal-processor", "--multimodal-processor",
action="store_true", action="store_true",
...@@ -266,6 +275,7 @@ def parse_args() -> Config: ...@@ -266,6 +275,7 @@ def parse_args() -> Config:
config.tool_call_parser = args.dyn_tool_call_parser config.tool_call_parser = args.dyn_tool_call_parser
config.reasoning_parser = args.dyn_reasoning_parser config.reasoning_parser = args.dyn_reasoning_parser
config.custom_jinja_template = args.custom_jinja_template config.custom_jinja_template = args.custom_jinja_template
config.dyn_endpoint_types = args.dyn_endpoint_types
config.multimodal_processor = args.multimodal_processor config.multimodal_processor = args.multimodal_processor
config.multimodal_encode_worker = args.multimodal_encode_worker config.multimodal_encode_worker = args.multimodal_encode_worker
config.multimodal_worker = args.multimodal_worker config.multimodal_worker = args.multimodal_worker
......
...@@ -15,6 +15,7 @@ from vllm.v1.engine.async_llm import AsyncLLM ...@@ -15,6 +15,7 @@ from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from dynamo.common.config_dump import dump_config from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.llm import ( from dynamo.llm import (
ModelInput, ModelInput,
ModelRuntimeConfig, ModelRuntimeConfig,
...@@ -519,9 +520,22 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -519,9 +520,22 @@ async def init(runtime: DistributedRuntime, config: Config):
) )
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
# Parse endpoint types from --dyn-endpoint-types flag
model_type = parse_endpoint_types(config.dyn_endpoint_types)
logger.info(
f"Registering model with endpoint types: {config.dyn_endpoint_types}"
)
# Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
logger.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
await register_vllm_model( await register_vllm_model(
ModelInput.Tokens, ModelInput.Tokens,
ModelType.Chat | ModelType.Completions, model_type,
generate_endpoint, generate_endpoint,
config, config,
engine_client, engine_client,
......
...@@ -11,10 +11,18 @@ cleanup() { ...@@ -11,10 +11,18 @@ cleanup() {
} }
trap cleanup EXIT INT TERM trap cleanup EXIT INT TERM
# Parse command line arguments # Default values
MODEL="Qwen/Qwen3-0.6B"
ENABLE_OTEL=false ENABLE_OTEL=false
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--model-path)
MODEL="$2"
shift 2
;;
--enable-otel) --enable-otel)
ENABLE_OTEL=true ENABLE_OTEL=true
shift shift
...@@ -22,16 +30,17 @@ while [[ $# -gt 0 ]]; do ...@@ -22,16 +30,17 @@ while [[ $# -gt 0 ]]; do
-h|--help) -h|--help)
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo "Options:" echo "Options:"
echo " --model-path <name> Specify model (default: $MODEL)"
echo " --enable-otel Enable OpenTelemetry tracing" echo " --enable-otel Enable OpenTelemetry tracing"
echo " -h, --help Show this help message" echo " -h, --help Show this help message"
echo "" echo ""
echo "Additional SGLang/Dynamo flags can be passed and will be forwarded"
echo "Note: System metrics are enabled by default on port 8081 (worker)" echo "Note: System metrics are enabled by default on port 8081 (worker)"
exit 0 exit 0
;; ;;
*) *)
echo "Unknown option: $1" EXTRA_ARGS+=("$1")
echo "Use --help for usage information" shift
exit 1
;; ;;
esac esac
done done
...@@ -52,10 +61,11 @@ DYNAMO_PID=$! ...@@ -52,10 +61,11 @@ DYNAMO_PID=$!
# run worker with metrics enabled # run worker with metrics enabled
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \ --model-path "$MODEL" \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name "$MODEL" \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
--skip-tokenizer-init \ --skip-tokenizer-init \
--enable-metrics --enable-metrics \
"${EXTRA_ARGS[@]}"
...@@ -27,8 +27,10 @@ python3 -m dynamo.frontend & ...@@ -27,8 +27,10 @@ python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
# Additional command line args can be passed
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" --extra-engine-args "$AGG_ENGINE_ARGS" \
"$@"
...@@ -4,6 +4,24 @@ ...@@ -4,6 +4,24 @@
set -e set -e
trap 'echo Cleaning up...; kill 0' EXIT trap 'echo Cleaning up...; kill 0' EXIT
# Default model
MODEL="Qwen/Qwen3-0.6B"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
# run ingress # run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend & python -m dynamo.frontend &
...@@ -11,4 +29,4 @@ python -m dynamo.frontend & ...@@ -11,4 +29,4 @@ python -m dynamo.frontend &
# run worker # run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none python -m dynamo.vllm --model "$MODEL" --enforce-eager --connector none "${EXTRA_ARGS[@]}"
...@@ -188,6 +188,22 @@ sglang_configs = { ...@@ -188,6 +188,22 @@ sglang_configs = {
), ),
], ],
), ),
"completions_only": SGLangConfig(
name="completions_only",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model-path",
"deepseek-ai/deepseek-llm-7b-base",
"--dyn-endpoint-types",
"completions",
],
request_payloads=[
completion_payload_default(),
],
),
} }
......
...@@ -117,6 +117,21 @@ trtllm_configs = { ...@@ -117,6 +117,21 @@ trtllm_configs = {
delayed_start=60, delayed_start=60,
request_payloads=[multimodal_payload_default()], request_payloads=[multimodal_payload_default()],
), ),
"completions_only": TRTLLMConfig(
name="completions_only",
directory=trtllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=["--dyn-endpoint-types", "completions"],
env={
"MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
"SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
},
request_payloads=[
completion_payload_default(),
],
),
} }
......
...@@ -326,6 +326,22 @@ vllm_configs = { ...@@ -326,6 +326,22 @@ vllm_configs = {
# delayed_start=45, # delayed_start=45,
# script_args=["--model", "llava-hf/llava-1.5-7b-hf"], # script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
# ), # ),
"completions_only": VLLMConfig(
name="completions_only",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model",
"deepseek-ai/deepseek-llm-7b-base",
"--dyn-endpoint-types",
"completions",
],
request_payloads=[
completion_payload_default(),
],
),
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment