"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "6caac57587c432e8c6865e79242c07bc7e0c0353"
Unverified Commit de27efe6 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

feat: Migrate vllm configuration system (#6075)

parent b94f9dcd
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""ArgGroup implementations for different configuration domains."""
from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig
__all__ = ["DynamoRuntimeArgGroup", "DynamoRuntimeConfig"]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Dynamo runtime configuration ArgGroup."""
from typing import Optional
from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
class DynamoRuntimeConfig(ConfigBase):
    """Runtime settings shared by every Dynamo backend."""

    # Connection / transport selection.
    namespace: str
    store_kv: str
    request_plane: str
    event_plane: str
    connector: list[str]

    # KV event handling; enable_local_indexer is overwritten in validate().
    enable_local_indexer: bool
    durable_kv_events: bool

    # Parser and template selection ("dyn_" prefix on the parser fields).
    dyn_tool_call_parser: Optional[str] = None
    dyn_reasoning_parser: Optional[str] = None
    custom_jinja_template: Optional[str] = None

    endpoint_types: str
    dump_config_to: Optional[str] = None

    def validate(self) -> None:
        """Set ``enable_local_indexer`` to the negation of ``durable_kv_events``."""
        # TODO: find a better mechanism for spot fixes like this one.
        durable = self.durable_kv_events
        self.enable_local_indexer = not durable
class DynamoRuntimeArgGroup(ArgGroup):
    """Command-line options for the Dynamo runtime, shared by all backends."""

    def add_arguments(self, parser) -> None:
        """Register the "Dynamo Runtime Options" group on ``parser``.

        Args:
            parser: Parser object exposing ``add_argument_group``.
        """
        runtime_group = parser.add_argument_group("Dynamo Runtime Options")

        add_argument(
            runtime_group,
            flag_name="--namespace",
            env_var="DYN_NAMESPACE",
            default="dynamo",
            help="Dynamo namespace",
        )

        add_argument(
            runtime_group,
            flag_name="--store-kv",
            env_var="DYN_STORE_KV",
            default="etcd",
            help=(
                "Which key-value backend to use: etcd, mem, file. "
                "Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details. "
                "File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv."
            ),
            choices=["etcd", "file", "mem"],
        )

        add_argument(
            runtime_group,
            flag_name="--request-plane",
            env_var="DYN_REQUEST_PLANE",
            default="tcp",
            help=(
                "Determines how requests are distributed from routers to workers. "
                "'tcp' is fastest."
            ),
            choices=["tcp", "nats", "http"],
        )

        add_argument(
            runtime_group,
            flag_name="--event-plane",
            env_var="DYN_EVENT_PLANE",
            default="nats",
            help="Determines how events are published.",
            choices=["nats", "zmq"],
        )

        # Zero or more connector names; the default is a single-element list.
        add_argument(
            runtime_group,
            flag_name="--connector",
            env_var="DYN_CONNECTOR",
            default=["nixl"],
            help=(
                "List of connectors to use in order (e.g., --connector nixl lmcache). "
                "Options: nixl, lmcache, kvbm, null, none. "
                "Order will be preserved in MultiConnector."
            ),
            nargs="*",
        )

        add_negatable_bool_argument(
            runtime_group,
            flag_name="--durable-kv-events",
            env_var="DYN_DURABLE_KV_EVENTS",
            default=False,
            help=(
                "Enable durable KV events using NATS JetStream instead of the local indexer. "
                "By default, local indexer is enabled for lower latency. "
                "Use this flag when you need durability and multi-replica router consistency. "
                "Requires NATS with JetStream enabled. "
                "Can also be set via DYN_DURABLE_KV_EVENTS=true env var."
            ),
        )

        # Tool-call and reasoning parsers: the accepted values are whatever
        # dynamo._core reports. These flags carry a "dyn-" prefix so they do
        # not collide with same-named flags of the individual backends.
        add_argument(
            runtime_group,
            flag_name="--dyn-tool-call-parser",
            env_var="DYN_TOOL_CALL_PARSER",
            default=None,
            help="Tool call parser name for the model.",
            choices=get_tool_parser_names(),
        )

        add_argument(
            runtime_group,
            flag_name="--dyn-reasoning-parser",
            env_var="DYN_REASONING_PARSER",
            default=None,
            help=(
                "Reasoning parser name for the model. "
                "If not specified, no reasoning parsing is performed."
            ),
            choices=get_reasoning_parser_names(),
        )

        add_argument(
            runtime_group,
            flag_name="--custom-jinja-template",
            env_var="DYN_CUSTOM_JINJA_TEMPLATE",
            default=None,
            help=(
                "Path to a custom Jinja template file to override the model's default chat template. "
                "This template will take precedence over any template found in the model repository."
            ),
        )

        # The previous spelling of this flag is handed over as obsolete_flag.
        add_argument(
            runtime_group,
            flag_name="--endpoint-types",
            env_var="DYN_ENDPOINT_TYPES",
            default="chat,completions",
            obsolete_flag="--dyn-endpoint-types",
            help=(
                "Comma-separated list of endpoint types to enable. "
                "Options: 'chat', 'completions'. "
                "Use 'completions' for models without chat templates."
            ),
        )

        add_argument(
            runtime_group,
            flag_name="--dump-config-to",
            env_var="DYN_DUMP_CONFIG_TO",
            default=None,
            help="Dump resolved configuration to the specified file path.",
        )
...@@ -40,6 +40,9 @@ def env_or_default(env_var: str, default: T) -> T: ...@@ -40,6 +40,9 @@ def env_or_default(env_var: str, default: T) -> T:
return int(value) # type: ignore return int(value) # type: ignore
elif isinstance(default, float): elif isinstance(default, float):
return float(value) # type: ignore return float(value) # type: ignore
elif isinstance(default, list):
# Env vars for list options (e.g. DYN_CONNECTOR) are space-separated; downstream expects a list.
return [x.strip() for x in value.split() if x.strip()] # type: ignore
else: else:
return value # type: ignore return value # type: ignore
...@@ -75,7 +78,11 @@ def add_argument( ...@@ -75,7 +78,11 @@ def add_argument(
names = [flag_name] names = [flag_name]
env_help = _build_help_message(help, env_var, default_with_env, obsolete_flag) if obsolete_flag:
# Accept obsolete flag as an alias (still show deprecation note in help)
names.append(obsolete_flag)
env_help = _build_help_message(help, env_var, default, obsolete_flag)
add_arg_opts = { add_arg_opts = {
"dest": arg_dest, "dest": arg_dest,
...@@ -126,7 +133,7 @@ def _build_help_message( ...@@ -126,7 +133,7 @@ def _build_help_message(
Build help message with env var and default value. Build help message with env var and default value.
""" """
if obsolete_flag: if obsolete_flag:
return f"{help_text}\nenv var: {env_var} | default: {default}\nobsolete flag: {obsolete_flag}" return f"{help_text}\nenv var: {env_var} | default: {default}\ndeprecating flag: {obsolete_flag}"
return f"{help_text}\nenv var: {env_var} | default: {default}" return f"{help_text}\nenv var: {env_var} | default: {default}"
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Dynamo vLLM wrapper configuration ArgGroup."""
from typing import Optional
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
from . import __version__
class DynamoVllmArgGroup(ArgGroup):
    """Options of the Dynamo wrapper around vLLM (native vLLM engine args are not defined here)."""

    name = "dynamo-vllm"

    def add_arguments(self, parser) -> None:
        """Register ``--version`` and the "Dynamo vLLM Options" group on ``parser``.

        Args:
            parser: Parser object exposing ``add_argument`` and
                ``add_argument_group``.
        """
        parser.add_argument(
            "--version",
            action="version",
            version=f"Dynamo Backend VLLM {__version__}",
        )
        vllm_group = parser.add_argument_group("Dynamo vLLM Options")

        # (flag, env var, help) for the leading boolean switches; each one
        # defaults to False and is registered in the order listed.
        leading_switches = (
            (
                "--is-prefill-worker",
                "DYN_VLLM_IS_PREFILL_WORKER",
                "Enable prefill functionality for this worker. "
                "Uses the provided namespace to construct dyn://namespace.prefill.generate",
            ),
            (
                "--is-decode-worker",
                "DYN_VLLM_IS_DECODE_WORKER",
                "Mark this as a decode worker which does not publish KV events",
            ),
            (
                "--use-vllm-tokenizer",
                "DYN_VLLM_USE_TOKENIZER",
                "Use vLLM's tokenizer for pre and post processing. "
                "This bypasses Dynamo's preprocessor and only v1/chat/completions "
                "will be available through the Dynamo frontend.",
            ),
        )
        for flag, env_name, description in leading_switches:
            add_negatable_bool_argument(
                vllm_group,
                flag_name=flag,
                env_var=env_name,
                default=False,
                help=description,
            )

        add_argument(
            vllm_group,
            flag_name="--sleep-mode-level",
            env_var="DYN_VLLM_SLEEP_MODE_LEVEL",
            default=1,
            help="Sleep mode level (1=offload to CPU, 2=discard weights, 3=discard all).",
            choices=[1, 2, 3],
            arg_type=int,
        )

        # Multimodal: boolean switches, all defaulting to False, registered in
        # the order listed.
        multimodal_switches = (
            (
                "--multimodal-processor",
                "DYN_VLLM_MULTIMODAL_PROCESSOR",
                "Run as multimodal processor component for handling multimodal requests.",
            ),
            (
                "--ec-processor",
                "DYN_VLLM_EC_PROCESSOR",
                "Run as ECConnector processor (routes multimodal requests to encoder then PD workers).",
            ),
            (
                "--multimodal-encode-worker",
                "DYN_VLLM_MULTIMODAL_ENCODE_WORKER",
                "Run as multimodal encode worker component for processing images/videos.",
            ),
            (
                "--multimodal-worker",
                "DYN_VLLM_MULTIMODAL_WORKER",
                "Run as multimodal worker component for LLM inference with multimodal data.",
            ),
            (
                "--multimodal-decode-worker",
                "DYN_VLLM_MULTIMODAL_DECODE_WORKER",
                "Run as multimodal decode worker in disaggregated mode.",
            ),
            (
                "--multimodal-encode-prefill-worker",
                "DYN_VLLM_MULTIMODAL_ENCODE_PREFILL_WORKER",
                "Run as unified encode+prefill+decode worker for models requiring "
                "integrated image encoding (e.g., Llama 4).",
            ),
            (
                "--enable-multimodal",
                "DYN_VLLM_ENABLE_MULTIMODAL",
                "Enable multimodal processing. "
                "If not set, none of the multimodal components can be used.",
            ),
        )
        for flag, env_name, description in multimodal_switches:
            add_negatable_bool_argument(
                vllm_group,
                flag_name=flag,
                env_var=env_name,
                default=False,
                help=description,
            )

        add_argument(
            vllm_group,
            flag_name="--mm-prompt-template",
            env_var="DYN_VLLM_MM_PROMPT_TEMPLATE",
            default="USER: <image>\n<prompt> ASSISTANT:",
            help=(
                "Different multi-modal models expect the prompt to contain "
                "different special media prompts. "
                "The processor will use this argument to construct the final prompt. "
                "User prompt will replace '<prompt>' in the provided template. "
                "For example, if the user prompt is 'please describe the image' "
                "and the prompt template is "
                "'USER: <image> <prompt> ASSISTANT:', the resulting prompt is "
                "'USER: <image> please describe the image ASSISTANT:'."
            ),
        )

        add_negatable_bool_argument(
            vllm_group,
            flag_name="--frontend-decoding",
            env_var="DYN_VLLM_FRONTEND_DECODING",
            default=False,
            help=(
                "Enable frontend decoding of multimodal images. "
                "When enabled, images are decoded in the Rust frontend and "
                "transferred to the backend via NIXL RDMA. "
                "Without this flag, images are decoded in the Python backend "
                "(default behavior)."
            ),
        )

        # vLLM-native encoder (ECConnector).
        add_negatable_bool_argument(
            vllm_group,
            flag_name="--vllm-native-encoder-worker",
            env_var="DYN_VLLM_NATIVE_ENCODER_WORKER",
            default=False,
            help=(
                "Run as vLLM-native encoder worker using ECConnector for encoder "
                "disaggregation (requires shared storage). "
                "The following flags only work when this flag is enabled: "
                "--ec-connector-backend, --ec-storage-path, --ec-extra-config, "
                "--ec-consumer-mode."
            ),
        )

        add_argument(
            vllm_group,
            flag_name="--ec-connector-backend",
            env_var="DYN_VLLM_EC_CONNECTOR_BACKEND",
            default="ECExampleConnector",
            help="ECConnector implementation class for encoder disaggregation.",
        )

        add_argument(
            vllm_group,
            flag_name="--ec-storage-path",
            env_var="DYN_VLLM_EC_STORAGE_PATH",
            default=None,
            help=(
                "Storage path for ECConnector "
                "(required for ECExampleConnector, optional for other backends)."
            ),
        )

        add_argument(
            vllm_group,
            flag_name="--ec-extra-config",
            env_var="DYN_VLLM_EC_EXTRA_CONFIG",
            default=None,
            help="Additional ECConnector configuration as JSON string.",
        )

        add_negatable_bool_argument(
            vllm_group,
            flag_name="--ec-consumer-mode",
            env_var="DYN_VLLM_EC_CONSUMER_MODE",
            default=False,
            help=(
                "Configure as ECConnector consumer for receiving encoder embeddings "
                "(for PD workers)."
            ),
        )

        # vLLM-Omni.
        add_negatable_bool_argument(
            vllm_group,
            flag_name="--omni",
            env_var="DYN_VLLM_OMNI",
            default=False,
            help=(
                "Run as vLLM-Omni worker for multi-stage pipelines "
                "(supports text-to-text, text-to-image, etc.)."
            ),
        )

        add_argument(
            vllm_group,
            flag_name="--stage-configs-path",
            env_var="DYN_VLLM_STAGE_CONFIGS_PATH",
            default=None,
            help=(
                "Path to vLLM-Omni stage configuration YAML file "
                "for --omni mode (optional)."
            ),
        )
# @dataclass()
class DynamoVllmConfig(ConfigBase):
    """Configuration for the Dynamo vLLM wrapper (vLLM-specific settings only).

    ``validate()`` enforces the cross-field rules between these settings and
    raises ``ValueError`` on the first rule that is violated.
    """

    is_prefill_worker: bool
    is_decode_worker: bool
    use_vllm_tokenizer: bool
    sleep_mode_level: int

    # Multimodal
    multimodal_processor: bool
    ec_processor: bool
    multimodal_encode_worker: bool
    multimodal_worker: bool
    multimodal_decode_worker: bool
    multimodal_encode_prefill_worker: bool
    enable_multimodal: bool
    mm_prompt_template: str
    frontend_decoding: bool

    # vLLM-native encoder (ECConnector)
    vllm_native_encoder_worker: bool
    ec_connector_backend: str
    ec_storage_path: Optional[str] = None
    ec_extra_config: Optional[str] = None
    ec_consumer_mode: bool

    # vLLM-Omni
    omni: bool
    stage_configs_path: Optional[str] = None

    def validate(self) -> None:
        """Run every vLLM wrapper consistency check in a fixed order.

        Raises:
            ValueError: If any of the individual checks fails.
        """
        self._validate_prefill_decode_exclusive()
        self._validate_multimodal_role_exclusivity()
        self._validate_multimodal_requires_flag()
        self._validate_ec_connector_storage()
        self._validate_omni_stage_config()

    def _validate_prefill_decode_exclusive(self) -> None:
        """Reject configurations that set both prefill and decode worker flags.

        Raises:
            ValueError: If ``is_prefill_worker`` and ``is_decode_worker`` are
                both truthy.
        """
        if self.is_prefill_worker and self.is_decode_worker:
            raise ValueError(
                "Cannot set both --is-prefill-worker and --is-decode-worker"
            )

    def _count_multimodal_roles(self) -> int:
        """Return how many of the seven multimodal role flags are truthy."""
        return sum(
            [
                bool(self.multimodal_processor),
                bool(self.ec_processor),
                bool(self.multimodal_encode_worker),
                bool(self.multimodal_worker),
                bool(self.multimodal_decode_worker),
                bool(self.multimodal_encode_prefill_worker),
                bool(self.vllm_native_encoder_worker),
            ]
        )

    def _validate_multimodal_role_exclusivity(self) -> None:
        """Allow at most one multimodal role per process.

        Raises:
            ValueError: If more than one multimodal role flag is set.
        """
        if self._count_multimodal_roles() > 1:
            raise ValueError(
                "Only one multimodal role can be set at a time: "
                "multimodal-processor, ec-processor, multimodal-encode-worker, "
                "multimodal-worker, multimodal-decode-worker, "
                "multimodal-encode-prefill-worker, vllm-native-encoder-worker"
            )

    def _validate_multimodal_requires_flag(self) -> None:
        """Require ``--enable-multimodal`` whenever a multimodal role is set.

        Raises:
            ValueError: If at least one multimodal role flag is set while
                ``enable_multimodal`` is falsy.
        """
        # "> 0" rather than "== 1": the check must also fire when several
        # roles are set, so it stays correct even if it is invoked without
        # the exclusivity check having run first.
        if self._count_multimodal_roles() > 0 and not self.enable_multimodal:
            raise ValueError(
                "Use --enable-multimodal when enabling any multimodal component"
            )

    def _validate_ec_connector_storage(self) -> None:
        """Require a storage path for the ECExampleConnector backend.

        The check only applies when ``vllm_native_encoder_worker`` is set.

        Raises:
            ValueError: If the backend is ``"ECExampleConnector"`` and
                ``ec_storage_path`` is empty or ``None``.
        """
        if not self.vllm_native_encoder_worker:
            return
        if (
            self.ec_connector_backend == "ECExampleConnector"
            and not self.ec_storage_path
        ):
            raise ValueError(
                "--ec-storage-path is required when using ECExampleConnector backend. "
                "Specify a shared storage path for encoder cache."
            )

    def _validate_omni_stage_config(self) -> None:
        """Reject ``--stage-configs-path`` unless ``--omni`` is enabled.

        The path itself is not required in ``--omni`` mode; this check only
        forbids supplying it when ``omni`` is falsy.

        Raises:
            ValueError: If ``stage_configs_path`` is set while ``omni`` is
                falsy.
        """
        if self.stage_configs_path and not self.omni:
            raise ValueError(
                "--stage-configs-path is only allowed when using --omni. "
                "Specify a YAML file containing stage configurations for the multi-stage pipeline."
            )
...@@ -12,6 +12,8 @@ import os ...@@ -12,6 +12,8 @@ import os
from collections.abc import Callable from collections.abc import Callable
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
# TODO: move this to configuration system.
# Port range constants # Port range constants
REGISTERED_PORT_MIN = 1024 REGISTERED_PORT_MIN = 1024
REGISTERED_PORT_MAX = 49151 REGISTERED_PORT_MAX = 49151
......
...@@ -55,7 +55,7 @@ from dynamo.vllm.multimodal_handlers import ( ...@@ -55,7 +55,7 @@ from dynamo.vllm.multimodal_handlers import (
) )
from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config
from .args import Config, overwrite_args, parse_args from .args import Config, parse_args
from .chrek import get_checkpoint_config from .chrek import get_checkpoint_config
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import ( from .health_check import (
...@@ -99,7 +99,6 @@ async def graceful_shutdown(runtime, shutdown_event): ...@@ -99,7 +99,6 @@ async def graceful_shutdown(runtime, shutdown_event):
async def worker(): async def worker():
config = parse_args() config = parse_args()
overwrite_args(config)
dump_config(config.dump_config_to, config) dump_config(config.dump_config_to, config)
# Name the model. Use either the full path (vllm and sglang do the same), # Name the model. Use either the full path (vllm and sglang do the same),
...@@ -494,8 +493,8 @@ async def register_vllm_model( ...@@ -494,8 +493,8 @@ async def register_vllm_model(
# Add tool/reasoning parsers for decode models # Add tool/reasoning parsers for decode models
if model_type != ModelType.Prefill: if model_type != ModelType.Prefill:
runtime_config.tool_call_parser = config.tool_call_parser runtime_config.tool_call_parser = config.dyn_tool_call_parser
runtime_config.reasoning_parser = config.reasoning_parser runtime_config.reasoning_parser = config.dyn_reasoning_parser
# Get data_parallel_size from vllm_config (defaults to 1) # Get data_parallel_size from vllm_config (defaults to 1)
data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1) data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1)
...@@ -785,14 +784,14 @@ async def init( ...@@ -785,14 +784,14 @@ async def init(
await _handle_non_leader_node(config.engine_args.data_parallel_rank) await _handle_non_leader_node(config.engine_args.data_parallel_rank)
return return
# Parse endpoint types from --dyn-endpoint-types flag # Parse endpoint types from --endpoint-types flag
model_type = parse_endpoint_types(config.dyn_endpoint_types) model_type = parse_endpoint_types(config.endpoint_types)
logger.info(f"Registering model with endpoint types: {config.dyn_endpoint_types}") logger.info(f"Registering model with endpoint types: {config.endpoint_types}")
model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
# Warn if custom template provided but chat endpoint not enabled # Warn if custom template provided but chat endpoint not enabled
if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types: if config.custom_jinja_template and "chat" not in config.endpoint_types:
logger.warning( logger.warning(
"Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. " "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
"The chat template will be loaded but the /v1/chat/completions endpoint will not be available." "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment