Unverified Commit a82acfa0 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

feat: Refactor frontend CLI configuration (#6201)

parent 948d6d85
......@@ -5,12 +5,16 @@
import argparse
import os
from typing import Any, Optional, TypeVar
from typing import Any, Callable, Optional, TypeVar, Union
T = TypeVar("T")
def env_or_default(env_var: str, default: T) -> T:
def env_or_default(
env_var: str,
default: T,
value_type: Optional[Union[type, Callable[..., Any]]] = None,
) -> T:
"""
Get value from environment variable or return default.
......@@ -19,32 +23,35 @@ def env_or_default(env_var: str, default: T) -> T:
Args:
env_var: Environment variable name (e.g., "DYN_NAMESPACE")
default: Default value if env var not set
value_type: If provided, use this type to convert the env value. If None, the type
is taken from type(default). Use value_type when default is None but you still
want the env value coerced (e.g. env_or_default("DYN_FOO", None, value_type=int)).
Returns:
Environment variable value (type-converted) or default
Examples:
>>> env_or_default("DYN_NAMESPACE", "test")
"test" # if DYN_NAMESPACE not set
>>> env_or_default("DYN_MIGRATION_LIMIT", 0)
5 # if DYN_MIGRATION_LIMIT="5"
"""
value = os.environ.get(env_var)
if value is None:
return default
# Type conversion based on default type
if isinstance(default, bool):
# No type info available: default=None and no explicit value_type.
if value_type is None and default is None:
return value # type: ignore[return-value]
# Prefer the explicit type if provided; otherwise derive from default
target_type = value_type if value_type is not None else type(default)
if target_type is bool:
return value.lower() in ("true", "1", "yes", "on") # type: ignore
elif isinstance(default, int):
if target_type is int:
return int(value) # type: ignore
elif isinstance(default, float):
if target_type is float:
return float(value) # type: ignore
elif isinstance(default, list):
# Env vars for list options (e.g. DYN_CONNECTOR) are space-separated; downstream expects a list.
if target_type is list:
return [x.strip() for x in value.split() if x.strip()] # type: ignore
else:
return value # type: ignore
# Fall back to calling the type/callable for custom validators (e.g., pathlib.Path)
return target_type(value) if callable(target_type) else value # type: ignore
def add_argument(
......@@ -55,7 +62,7 @@ def add_argument(
default: Any,
help: str,
obsolete_flag: Optional[str] = None,
arg_type: Optional[type] = str,
arg_type: Optional[Union[type, Callable[..., Any]]] = str,
**kwargs: Any,
) -> None:
"""
......@@ -74,7 +81,12 @@ def add_argument(
arg_type: Type for the argument (default: str)
"""
arg_dest = _get_dest_name(flag_name, kwargs.get("dest"))
default_with_env = env_or_default(env_var, default)
value_type_for_env: Optional[Union[type, Callable[..., Any]]] = None
if arg_type is not None and isinstance(arg_type, type):
value_type_for_env = arg_type
if isinstance(default, list) and (arg_type is None or arg_type is str):
value_type_for_env = None
default_with_env = env_or_default(env_var, default, value_type=value_type_for_env)
names = [flag_name]
......@@ -88,8 +100,9 @@ def add_argument(
"dest": arg_dest,
"default": default_with_env,
"help": env_help,
"type": arg_type,
}
if arg_type is not None:
add_arg_opts["type"] = arg_type
kwargs.update(add_arg_opts)
parser.add_argument(*names, **kwargs)
......@@ -114,15 +127,15 @@ def add_negatable_bool_argument(
default: Default value
help: Help text
"""
arg_dest = _get_dest_name(flag_name, dest)
default_with_env = env_or_default(env_var, default)
parser.add_argument(
flag_name,
dest=arg_dest,
add_argument(
parser,
flag_name=flag_name,
env_var=env_var,
default=default,
help=help,
dest=dest,
arg_type=None,
action=argparse.BooleanOptionalAction,
default=default_with_env,
help=_build_help_message(help, env_var, default),
)
......
......@@ -7,12 +7,14 @@ import argparse
import pytest
from dynamo.common.configuration.utils import (
add_argument,
add_negatable_bool_argument,
env_or_default,
)
pytestmark = [
pytest.mark.unit,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
]
......@@ -88,6 +90,86 @@ class TestEnvOrDefault:
# Bool
assert isinstance(env_or_default("TEST_VAR", True), bool)
def test_none_default_with_no_value_type_returns_raw_env_value(self, monkeypatch):
"""Test env value is returned as string when no type info is available."""
monkeypatch.setenv("TEST_VAR", "env_value")
result = env_or_default("TEST_VAR", None, value_type=None)
assert result == "env_value"
assert isinstance(result, str)
class TestAddArgument:
"""Test add_argument function."""
def test_callable_type_with_none_default_uses_env_and_validates(self, monkeypatch):
"""Test callable arg_type works when default is None and env var is set."""
monkeypatch.setenv("TEST_MODEL_NAME", " model-A ")
parser = argparse.ArgumentParser()
def validate_model_name(value: str) -> str:
if len(value.strip()) == 0:
raise argparse.ArgumentTypeError("model-name must be non-empty")
return value.strip()
add_argument(
parser,
flag_name="--model-name",
env_var="TEST_MODEL_NAME",
default=None,
help="Model name",
arg_type=validate_model_name,
)
args = parser.parse_args([])
assert args.model_name == "model-A"
def test_callable_type_with_none_default_uses_none_when_env_unset(
self, monkeypatch
):
"""Test callable arg_type keeps None default when env var is not set."""
monkeypatch.delenv("TEST_MODEL_NAME", raising=False)
parser = argparse.ArgumentParser()
def validate_model_name(value: str) -> str:
if len(value.strip()) == 0:
raise argparse.ArgumentTypeError("model-name must be non-empty")
return value.strip()
add_argument(
parser,
flag_name="--model-name",
env_var="TEST_MODEL_NAME",
default=None,
help="Model name",
arg_type=validate_model_name,
)
args = parser.parse_args([])
assert args.model_name is None
def test_callable_type_with_invalid_env_value_fails_parse(self, monkeypatch):
"""Test invalid env value still fails validation via argparse type callable."""
monkeypatch.setenv("TEST_MODEL_NAME", " ")
parser = argparse.ArgumentParser()
def validate_model_name(value: str) -> str:
if len(value.strip()) == 0:
raise argparse.ArgumentTypeError("model-name must be non-empty")
return value.strip()
add_argument(
parser,
flag_name="--model-name",
env_var="TEST_MODEL_NAME",
default=None,
help="Model name",
arg_type=validate_model_name,
)
with pytest.raises(SystemExit):
parser.parse_args([])
class TestAddNegatableBool:
"""Test add_negatable_bool function."""
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
import pathlib
from typing import Any, Dict, Optional
from dynamo.common.config_dump import register_encoder
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import (
add_argument,
add_negatable_bool_argument,
env_or_default,
)
from . import __version__
def validate_model_name(value: str) -> str:
"""Validate that model-name is a non-empty string."""
if not value or not isinstance(value, str) or len(value.strip()) == 0:
raise argparse.ArgumentTypeError(
f"model-name must be a non-empty string, got: {value}"
)
return value.strip()
def validate_model_path(value: str) -> str:
"""Validate that model-path is a valid directory on disk."""
if not os.path.isdir(value):
raise argparse.ArgumentTypeError(
f"model-path must be a valid directory on disk, got: {value}"
)
return value
class FrontendConfig(ConfigBase):
"""Configuration for the Dynamo frontend."""
interactive: bool
kv_cache_block_size: Optional[int]
http_host: str
http_port: int
tls_cert_path: Optional[pathlib.Path]
tls_key_path: Optional[pathlib.Path]
router_mode: str
kv_overlap_score_weight: float
router_temperature: float
use_kv_events: bool
router_ttl: float
router_max_tree_size: int
router_prune_target_ratio: float
namespace: Optional[str] = None
router_replica_sync: bool
router_snapshot_threshold: int
router_reset_states: bool
durable_kv_events: bool
router_track_active_blocks: bool
router_assume_kv_reuse: bool
router_track_output_blocks: bool
router_event_threads: int
enforce_disagg: bool
migration_limit: int
active_decode_blocks_threshold: Optional[float]
active_prefill_tokens_threshold: Optional[int]
active_prefill_tokens_threshold_frac: Optional[float]
model_name: Optional[str]
model_path: Optional[str]
metrics_prefix: Optional[str] = None
kserve_grpc_server: bool
grpc_metrics_port: int
dump_config_to: Optional[str]
store_kv: str
request_plane: str
event_plane: str
chat_processor: str
exp_python_factory: bool
def validate(self) -> None:
if bool(self.tls_cert_path) ^ bool(self.tls_key_path): # ^ is XOR
raise ValueError(
"--tls-cert-path and --tls-key-path must be provided together"
)
if self.migration_limit < 0 or self.migration_limit > 4294967295:
raise ValueError(
"--migration-limit must be between 0 and 4294967295 (0=disabled)"
)
@register_encoder(FrontendConfig)
def _preprocess_for_encode_config(config: FrontendConfig) -> Dict[str, Any]:
"""Convert FrontendConfig object to dictionary for encoding."""
return config.__dict__
class FrontendArgGroup(ArgGroup):
"""Frontend configuration parameters."""
def add_arguments(self, parser) -> None:
parser.add_argument(
"--version", action="version", version=f"Dynamo Frontend {__version__}"
)
g = parser.add_argument_group("Dynamo Frontend Options")
# Interactive needs -i short option; use raw add_argument with BooleanOptionalAction
g.add_argument(
"-i",
"--interactive",
dest="interactive",
action=argparse.BooleanOptionalAction,
default=env_or_default("DYN_INTERACTIVE", False),
help="Interactive text chat.\nenv var: DYN_INTERACTIVE",
)
add_argument(
g,
flag_name="--kv-cache-block-size",
env_var="DYN_KV_CACHE_BLOCK_SIZE",
default=None,
help="KV cache block size (u32).",
arg_type=int,
)
add_argument(
g,
flag_name="--http-host",
env_var="DYN_HTTP_HOST",
default="0.0.0.0",
help="HTTP host for the engine (str).",
)
add_argument(
g,
flag_name="--http-port",
env_var="DYN_HTTP_PORT",
default=8000,
help="HTTP port for the engine (u16).",
arg_type=int,
)
add_argument(
g,
flag_name="--tls-cert-path",
env_var="DYN_TLS_CERT_PATH",
default=None,
help="TLS certificate path, PEM format.",
arg_type=pathlib.Path,
)
add_argument(
g,
flag_name="--tls-key-path",
env_var="DYN_TLS_KEY_PATH",
default=None,
help="TLS certificate key path, PEM format.",
arg_type=pathlib.Path,
)
add_argument(
g,
flag_name="--router-mode",
env_var="DYN_ROUTER_MODE",
default="round-robin",
help="How to route the request.",
choices=["round-robin", "random", "kv"],
)
add_argument(
g,
flag_name="--kv-overlap-score-weight",
env_var="DYN_KV_OVERLAP_SCORE_WEIGHT",
default=1.0,
help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-temperature",
env_var="DYN_ROUTER_TEMPERATURE",
default=0.0,
help=(
"KV Router: Temperature for worker sampling via softmax. Higher values "
"promote more randomness, and 0 fallbacks to deterministic."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--kv-events",
env_var="DYN_KV_EVENTS",
default=True,
help=(
"KV Router: Enable/disable KV events. Use --kv-events to enable "
"(default, router receives cache state events from workers) or --no-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
)
add_argument(
g,
flag_name="--router-ttl",
env_var="DYN_ROUTER_TTL",
default=120.0,
help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-max-tree-size",
env_var="DYN_ROUTER_MAX_TREE_SIZE",
default=2**20,
help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-kv-events is set."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-prune-target-ratio",
env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
default=0.8,
help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_negatable_bool_argument(
g,
flag_name="--router-replica-sync",
env_var="DYN_ROUTER_REPLICA_SYNC",
default=False,
help=(
"KV Router: Enable replica synchronization across multiple router instances. "
"When true, routers will publish and subscribe to events to maintain "
"consistent state."
),
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help=(
"KV Router: Number of messages in stream before triggering a snapshot. "
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
default=False,
help=(
"KV Router: Reset router state on startup, purging stream and object store. "
"By default, states are persisted. WARNING: This can affect existing router "
"replicas."
),
)
add_negatable_bool_argument(
g,
flag_name="--durable-kv-events",
env_var="DYN_DURABLE_KV_EVENTS",
default=False,
help=(
"KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
"By default, the router uses the generic event plane (NATS Core or ZMQ) with "
"local_indexer mode. Use this flag when you need durability and multi-replica "
"consistency. Requires NATS with JetStream enabled."
),
)
add_negatable_bool_argument(
g,
flag_name="--track-active-blocks",
env_var="DYN_TRACK_ACTIVE_BLOCKS",
default=True,
dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. "
),
)
add_negatable_bool_argument(
g,
flag_name="--assume-kv-reuse",
env_var="DYN_ASSUME_KV_REUSE",
default=True,
dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
),
)
add_negatable_bool_argument(
g,
flag_name="--track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
help=(
"KV Router: Track output blocks during generation. When enabled, the router adds "
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens."
),
)
add_argument(
g,
flag_name="--router-event-threads",
env_var="DYN_ROUTER_EVENT_THREADS",
default=1,
help=(
"KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput."
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--enforce-disagg",
env_var="DYN_ENFORCE_DISAGG",
default=False,
help=(
"Enforce disaggregated prefill-decode. When set, unactivated prefill router will "
"return an error instead of falling back to decode-only mode."
),
)
add_argument(
g,
flag_name="--migration-limit",
env_var="DYN_MIGRATION_LIMIT",
default=0,
help=(
"Maximum number of times a request may be migrated to a different engine worker. "
"When > 0, enables request migration on worker disconnect."
),
arg_type=int,
)
add_argument(
g,
flag_name="--active-decode-blocks-threshold",
env_var="DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD",
default=None,
help=(
"Threshold percentage (0.0-1.0) for determining when a worker is considered busy "
"based on KV cache block utilization. If not set, blocks-based busy detection is disabled."
),
arg_type=float,
)
add_argument(
g,
flag_name="--active-prefill-tokens-threshold",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD",
default=None,
help=(
"Literal token count threshold for determining when a worker is considered busy "
"based on prefill token utilization. When active prefill tokens exceed this "
"threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled."
),
arg_type=int,
)
add_argument(
g,
flag_name="--active-prefill-tokens-threshold-frac",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC",
default=None,
help=(
"Fraction of max_num_batched_tokens for busy detection. Worker is busy when "
"active_prefill_tokens > frac * max_num_batched_tokens. Default 1.5 (disabled). "
"Uses OR logic with --active-prefill-tokens-threshold."
),
arg_type=float,
)
add_argument(
g,
flag_name="--model-name",
env_var="DYN_MODEL_NAME",
default=None,
help="Model name as a string (e.g., 'Llama-3.2-1B-Instruct')",
arg_type=validate_model_name,
)
add_argument(
g,
flag_name="--model-path",
env_var="DYN_MODEL_PATH",
default=None,
help="Path to model directory on disk (e.g., /tmp/model_cache/llama3.2_1B/)",
arg_type=validate_model_path,
)
add_argument(
g,
flag_name="--metrics-prefix",
env_var="DYN_METRICS_PREFIX",
default=None,
help=(
"Prefix for Dynamo frontend metrics. If unset, uses DYN_METRICS_PREFIX env var "
"or 'dynamo_frontend'."
),
)
add_negatable_bool_argument(
g,
flag_name="--kserve-grpc-server",
env_var="DYN_KSERVE_GRPC_SERVER",
default=False,
help="Start KServe gRPC server.",
)
add_argument(
g,
flag_name="--grpc-metrics-port",
env_var="DYN_GRPC_METRICS_PORT",
default=8788,
help=(
"HTTP metrics port for gRPC service (u16). Only used with --kserve-grpc-server. "
"Defaults to 8788."
),
arg_type=int,
)
add_argument(
g,
flag_name="--dump-config-to",
env_var="DYN_DUMP_CONFIG_TO",
default=None,
help="Dump config to the specified file path.",
)
add_argument(
g,
flag_name="--store-kv",
env_var="DYN_STORE_KV",
default="etcd",
help=(
"Which key-value backend to use: etcd, mem, file. Etcd uses the ETCD_* env vars "
"(e.g. ETCD_ENDPOINTS) for connection details. File uses root dir from env var "
"DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv."
),
choices=["etcd", "file", "mem"],
)
add_argument(
g,
flag_name="--request-plane",
env_var="DYN_REQUEST_PLANE",
default="tcp",
help=(
"Determines how requests are distributed from routers to workers. "
"'tcp' is fastest [nats|http|tcp]"
),
choices=["nats", "http", "tcp"],
)
add_argument(
g,
flag_name="--event-plane",
env_var="DYN_EVENT_PLANE",
default="nats",
help="Determines how events are published [nats|zmq]",
choices=["nats", "zmq"],
)
add_argument(
g,
flag_name="--chat-processor",
env_var="DYN_CHAT_PROCESSOR",
default="dynamo",
help=(
"[EXPERIMENTAL] When set to 'vllm', use local vllm for the pre and post "
"processor."
),
choices=["dynamo", "vllm"],
)
add_negatable_bool_argument(
g,
flag_name="--exp-python-factory",
env_var="DYN_EXP_PYTHON_FACTORY",
default=False,
help=(
"[EXPERIMENTAL] Enable Python-based engine factory. When set, engines will be "
"created via a Python callback instead of the default Rust pipeline."
),
)
This diff is collapsed.
......@@ -24,6 +24,7 @@ from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor, OutputProcessorOutput
from dynamo.frontend.frontend_args import FrontendConfig
from dynamo.llm import (
KvPushRouter,
ModelCardInstanceId,
......@@ -367,10 +368,12 @@ class EngineFactory:
self,
runtime: DistributedRuntime,
router_config: RouterConfig,
config: FrontendConfig,
flags: Namespace,
):
self.runtime = runtime
self.router_config = router_config
self.config = config
self.flags = flags
async def chat_engine_factory(
......@@ -444,7 +447,7 @@ class EngineFactory:
if self.router_config.router_mode == RouterMode.KV:
router = KvPushRouter(
endpoint=generate_endpoint,
block_size=self.flags.kv_cache_block_size or 16,
block_size=self.config.kv_cache_block_size or 16,
kv_router_config=self.router_config.kv_router_config,
)
else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment