"lib/bindings/vscode:/vscode.git/clone" did not exist on "dadf0e22479025615a17d5b6f069745a9d6a2131"
Commit 03070fd5 (unverified), authored by KrishnanPrash, committed by GitHub
Browse files

feat: Adding --dyn-endpoint-types flag (#4619)


Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
parent 44e8600a
......@@ -8,10 +8,11 @@ This module contains shared utility functions used across multiple
Dynamo backends and components.
Submodules:
- endpoint_types: Endpoint type parsing utilities
- paths: Workspace directory detection and path utilities
- prometheus: Prometheus metrics collection and logging utilities
"""
from dynamo.common.utils import paths, prometheus
from dynamo.common.utils import endpoint_types, paths, prometheus
__all__ = ["paths", "prometheus"]
__all__ = ["endpoint_types", "paths", "prometheus"]
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for parsing and handling endpoint types."""
from dynamo.llm import ModelType
def parse_endpoint_types(endpoint_types_str: str) -> ModelType:
    """Convert a comma-separated endpoint list into combined ModelType flags.

    Each entry is stripped of surrounding whitespace and lower-cased before
    it is matched, and entries that are blank after stripping are ignored.

    Args:
        endpoint_types_str: Comma-separated endpoint type names. The
            recognised names are 'chat' and 'completions', e.g. 'chat',
            'completions' or 'chat,completions'.

    Returns:
        The ModelType flag for each listed name, merged with bitwise OR
        (ModelType.Chat and/or ModelType.Completions).

    Raises:
        ValueError: If the string is empty or whitespace-only, contains
            nothing but separators, or names an unknown endpoint type.
    """
    # Guard: reject a missing or whitespace-only argument up front.
    if not (endpoint_types_str and endpoint_types_str.strip()):
        raise ValueError("Endpoint types string cannot be empty")

    # Normalise every non-blank entry to its stripped, lower-case form.
    tokens = []
    for piece in endpoint_types_str.split(","):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned.lower())
    if len(tokens) == 0:
        raise ValueError("No valid endpoint types provided")

    # Lookup table from accepted name to the flag it enables.
    known_flags = {
        "chat": ModelType.Chat,
        "completions": ModelType.Completions,
    }

    combined = None
    for token in tokens:
        flag = known_flags.get(token)
        if flag is None:
            raise ValueError(
                f"Invalid endpoint type: '{token}'. Valid options: 'chat', 'completions'"
            )
        # The first flag seeds the result; later ones are OR-ed in.
        if combined is None:
            combined = flag
        else:
            combined = combined | flag
    return combined
......@@ -60,6 +60,12 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"default": None,
"help": "Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository. This template will be applied by Dynamo's preprocessor and cannot be used with --use-sglang-tokenizer.",
},
"endpoint-types": {
"flags": ["--dyn-endpoint-types"],
"type": str,
"default": "chat,completions",
"help": "Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
},
"use-sglang-tokenizer": {
"flags": ["--use-sglang-tokenizer"],
"action": "store_true",
......@@ -127,6 +133,9 @@ class DynamoArgs:
reasoning_parser: Optional[str] = None
custom_jinja_template: Optional[str] = None
# endpoint types to enable
dyn_endpoint_types: str = "chat,completions"
# preprocessing options
use_sglang_tokenizer: bool = False
......@@ -461,6 +470,7 @@ async def parse_args(args: list[str]) -> Config:
tool_call_parser=tool_call_parser,
reasoning_parser=reasoning_parser,
custom_jinja_template=expanded_template_path,
dyn_endpoint_types=parsed_args.dyn_endpoint_types,
use_sglang_tokenizer=parsed_args.use_sglang_tokenizer,
multimodal_processor=parsed_args.multimodal_processor,
multimodal_encode_worker=parsed_args.multimodal_encode_worker,
......
......@@ -11,6 +11,7 @@ import sglang as sgl
import uvloop
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.llm import ModelInput, ModelType
from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -156,6 +157,18 @@ async def init(runtime: DistributedRuntime, config: Config):
health_check_payload = SglangHealthCheckPayload(engine).to_dict()
logging.info(
f"Registering model with endpoint types: {dynamo_args.dyn_endpoint_types}"
)
if (
dynamo_args.custom_jinja_template
and "chat" not in dynamo_args.dyn_endpoint_types
):
logging.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
try:
# Start endpoint immediately and register model concurrently
# Requests queue until ready_event is set (TODO: Part of new PR)
......@@ -171,6 +184,7 @@ async def init(runtime: DistributedRuntime, config: Config):
generate_endpoint,
server_args,
dynamo_args,
output_type=parse_endpoint_types(dynamo_args.dyn_endpoint_types),
readiness_gate=ready_event,
),
)
......
......@@ -37,6 +37,7 @@ from transformers import AutoConfig
import dynamo.nixl_connect as nixl_connect
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime
......@@ -250,7 +251,17 @@ async def init(runtime: DistributedRuntime, config: Config):
if config.disaggregation_mode == DisaggregationMode.PREFILL:
model_type = ModelType.Prefill
else:
model_type = ModelType.Chat | ModelType.Completions
model_type = parse_endpoint_types(config.dyn_endpoint_types)
logging.info(
f"Registering model with endpoint types: {config.dyn_endpoint_types}"
)
# Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
logging.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
multimodal_processor = None
......
......@@ -58,6 +58,7 @@ class Config:
self.tool_call_parser: Optional[str] = None
self.dump_config_to: Optional[str] = None
self.custom_jinja_template: Optional[str] = None
self.dyn_endpoint_types: str = "chat,completions"
self.store_kv: str = ""
self.request_plane: str = ""
......@@ -282,6 +283,12 @@ def cmd_line_args():
default=None,
help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
)
parser.add_argument(
"--dyn-endpoint-types",
type=str,
default="chat,completions",
help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
)
parser.add_argument(
"--store-kv",
type=str,
......@@ -355,6 +362,7 @@ def cmd_line_args():
config.reasoning_parser = args.dyn_reasoning_parser
config.tool_call_parser = args.dyn_tool_call_parser
config.dump_config_to = args.dump_config_to
config.dyn_endpoint_types = args.dyn_endpoint_types
config.store_kv = args.store_kv
config.request_plane = args.request_plane
......
......@@ -55,6 +55,9 @@ class Config:
tool_call_parser: Optional[str] = None
reasoning_parser: Optional[str] = None
# endpoint types to enable
dyn_endpoint_types: str = "chat,completions"
# multimodal options
multimodal_processor: bool = False
multimodal_encode_worker: bool = False
......@@ -135,6 +138,12 @@ def parse_args() -> Config:
default=None,
help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
)
parser.add_argument(
"--dyn-endpoint-types",
type=str,
default="chat,completions",
help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Default: 'chat,completions'. Use 'completions' for models without chat templates.",
)
parser.add_argument(
"--multimodal-processor",
action="store_true",
......@@ -266,6 +275,7 @@ def parse_args() -> Config:
config.tool_call_parser = args.dyn_tool_call_parser
config.reasoning_parser = args.dyn_reasoning_parser
config.custom_jinja_template = args.custom_jinja_template
config.dyn_endpoint_types = args.dyn_endpoint_types
config.multimodal_processor = args.multimodal_processor
config.multimodal_encode_worker = args.multimodal_encode_worker
config.multimodal_worker = args.multimodal_worker
......
......@@ -15,6 +15,7 @@ from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.llm import (
ModelInput,
ModelRuntimeConfig,
......@@ -519,9 +520,22 @@ async def init(runtime: DistributedRuntime, config: Config):
)
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
# Parse endpoint types from --dyn-endpoint-types flag
model_type = parse_endpoint_types(config.dyn_endpoint_types)
logger.info(
f"Registering model with endpoint types: {config.dyn_endpoint_types}"
)
# Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
logger.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
)
await register_vllm_model(
ModelInput.Tokens,
ModelType.Chat | ModelType.Completions,
model_type,
generate_endpoint,
config,
engine_client,
......
......@@ -11,10 +11,18 @@ cleanup() {
}
trap cleanup EXIT INT TERM
# Parse command line arguments
# Default values
MODEL="Qwen/Qwen3-0.6B"
ENABLE_OTEL=false
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model-path)
MODEL="$2"
shift 2
;;
--enable-otel)
ENABLE_OTEL=true
shift
......@@ -22,16 +30,17 @@ while [[ $# -gt 0 ]]; do
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --model-path <name> Specify model (default: $MODEL)"
echo " --enable-otel Enable OpenTelemetry tracing"
echo " -h, --help Show this help message"
echo ""
echo "Additional SGLang/Dynamo flags can be passed and will be forwarded"
echo "Note: System metrics are enabled by default on port 8081 (worker)"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
EXTRA_ARGS+=("$1")
shift
;;
esac
done
......@@ -52,10 +61,11 @@ DYNAMO_PID=$!
# run worker with metrics enabled
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
--skip-tokenizer-init \
--enable-metrics
--enable-metrics \
"${EXTRA_ARGS[@]}"
......@@ -27,8 +27,10 @@ python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
# Additional command line args can be passed
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS"
--extra-engine-args "$AGG_ENGINE_ARGS" \
"$@"
......@@ -4,6 +4,24 @@
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Default model
MODEL="Qwen/Qwen3-0.6B"
# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
......@@ -11,4 +29,4 @@ python -m dynamo.frontend &
# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
python -m dynamo.vllm --model "$MODEL" --enforce-eager --connector none "${EXTRA_ARGS[@]}"
......@@ -188,6 +188,22 @@ sglang_configs = {
),
],
),
"completions_only": SGLangConfig(
name="completions_only",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model-path",
"deepseek-ai/deepseek-llm-7b-base",
"--dyn-endpoint-types",
"completions",
],
request_payloads=[
completion_payload_default(),
],
),
}
......
......@@ -117,6 +117,21 @@ trtllm_configs = {
delayed_start=60,
request_payloads=[multimodal_payload_default()],
),
"completions_only": TRTLLMConfig(
name="completions_only",
directory=trtllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=["--dyn-endpoint-types", "completions"],
env={
"MODEL_PATH": "deepseek-ai/deepseek-llm-7b-base",
"SERVED_MODEL_NAME": "deepseek-ai/deepseek-llm-7b-base",
},
request_payloads=[
completion_payload_default(),
],
),
}
......
......@@ -326,6 +326,22 @@ vllm_configs = {
# delayed_start=45,
# script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
# ),
"completions_only": VLLMConfig(
name="completions_only",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model",
"deepseek-ai/deepseek-llm-7b-base",
"--dyn-endpoint-types",
"completions",
],
request_payloads=[
completion_payload_default(),
],
),
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment