feat: Migrate vllm configuration system (#6075)

de27efe6 · jh-nv · GitHub · b94f9dcd · de27efe6 · de27efe6
Unverified Commit de27efe6 authored Feb 12, 2026 by jh-nv Committed by GitHub Feb 12, 2026
7 changed files
--- a/components/src/dynamo/common/configuration/groups/__init__.py
+++ b/components/src/dynamo/common/configuration/groups/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""ArgGroup implementations for different configuration domains."""
+from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig
+__all__ = ["DynamoRuntimeArgGroup", "DynamoRuntimeConfig"]
--- a/components/src/dynamo/common/configuration/groups/runtime_args.py
+++ b/components/src/dynamo/common/configuration/groups/runtime_args.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Dynamo runtime configuration ArgGroup."""
+from typing import Optional
+from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
+from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.config_base import ConfigBase
+from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
+class DynamoRuntimeConfig(ConfigBase):
+    """Configuration for Dynamo runtime (common across all backends)."""
+    # Field names match the dest names of the flags registered by
+    # DynamoRuntimeArgGroup (e.g. --store-kv -> store_kv).
+    namespace: str
+    store_kv: str
+    request_plane: str
+    event_plane: str
+    connector: list[str]
+    # Not set from a dedicated flag in this group; overwritten in validate()
+    # as the inverse of durable_kv_events.
+    enable_local_indexer: bool
+    durable_kv_events: bool
+    dyn_tool_call_parser: Optional[str] = None
+    dyn_reasoning_parser: Optional[str] = None
+    custom_jinja_template: Optional[str] = None
+    # Comma-separated string (e.g. "chat,completions"), not a list.
+    endpoint_types: str
+    dump_config_to: Optional[str] = None
+    def validate(self) -> None:
+        """Derive enable_local_indexer from durable_kv_events (mutates self)."""
+        # TODO: find a better mechanism for derived-field spot fixes like this.
+        # The local indexer and durable KV events are treated as mutually
+        # exclusive: exactly one of the two is active after this call.
+        self.enable_local_indexer = not self.durable_kv_events
+class DynamoRuntimeArgGroup(ArgGroup):
+    """Dynamo runtime configuration parameters (common to all backends)."""
+    def add_arguments(self, parser) -> None:
+        """Add Dynamo runtime arguments to parser."""
+        # All flags below are registered under one help group; each call also
+        # names the environment variable associated with the flag.
+        g = parser.add_argument_group("Dynamo Runtime Options")
+        add_argument(
+            g,
+            flag_name="--namespace",
+            env_var="DYN_NAMESPACE",
+            default="dynamo",
+            help="Dynamo namespace",
+        )
+        add_argument(
+            g,
+            flag_name="--store-kv",
+            env_var="DYN_STORE_KV",
+            default="etcd",
+            help="Which key-value backend to use: etcd, mem, file. Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details. File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.",
+            choices=["etcd", "file", "mem"],
+        )
+        add_argument(
+            g,
+            flag_name="--request-plane",
+            env_var="DYN_REQUEST_PLANE",
+            default="tcp",
+            help="Determines how requests are distributed from routers to workers. 'tcp' is fastest.",
+            choices=["tcp", "nats", "http"],
+        )
+        add_argument(
+            g,
+            flag_name="--event-plane",
+            env_var="DYN_EVENT_PLANE",
+            default="nats",
+            help="Determines how events are published.",
+            choices=["nats", "zmq"],
+        )
+        # List-valued option: nargs="*" takes zero or more values on the CLI.
+        # No `choices` are enforced here; the options named in the help text
+        # are presumably validated downstream - TODO confirm.
+        add_argument(
+            g,
+            flag_name="--connector",
+            env_var="DYN_CONNECTOR",
+            default=["nixl"],
+            help="List of connectors to use in order (e.g., --connector nixl lmcache). Options: nixl, lmcache, kvbm, null, none. Order will be preserved in MultiConnector.",
+            nargs="*",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--durable-kv-events",
+            env_var="DYN_DURABLE_KV_EVENTS",
+            default=False,
+            help="Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
+        )
+        # Optional: tool/reasoning parsers. The allowed choices are obtained by
+        # calling get_tool_parser_names() / get_reasoning_parser_names() from
+        # dynamo._core each time add_arguments runs.
+        # To avoid name conflicts with different backends, prefix "dyn-" for dynamo specific args
+        add_argument(
+            g,
+            flag_name="--dyn-tool-call-parser",
+            env_var="DYN_TOOL_CALL_PARSER",
+            default=None,
+            help="Tool call parser name for the model.",
+            choices=get_tool_parser_names(),
+        )
+        add_argument(
+            g,
+            flag_name="--dyn-reasoning-parser",
+            env_var="DYN_REASONING_PARSER",
+            default=None,
+            help="Reasoning parser name for the model. If not specified, no reasoning parsing is performed.",
+            choices=get_reasoning_parser_names(),
+        )
+        add_argument(
+            g,
+            flag_name="--custom-jinja-template",
+            env_var="DYN_CUSTOM_JINJA_TEMPLATE",
+            default=None,
+            help="Path to a custom Jinja template file to override the model's default chat template. This template will take precedence over any template found in the model repository.",
+        )
+        # The value is kept as a single comma-separated string; the previous
+        # spelling of the flag is passed as obsolete_flag.
+        add_argument(
+            g,
+            flag_name="--endpoint-types",
+            env_var="DYN_ENDPOINT_TYPES",
+            default="chat,completions",
+            obsolete_flag="--dyn-endpoint-types",
+            help="Comma-separated list of endpoint types to enable. Options: 'chat', 'completions'. Use 'completions' for models without chat templates.",
+        )
+        add_argument(
+            g,
+            flag_name="--dump-config-to",
+            env_var="DYN_DUMP_CONFIG_TO",
+            default=None,
+            help="Dump resolved configuration to the specified file path.",
+        )
--- a/components/src/dynamo/common/configuration/utils.py
+++ b/components/src/dynamo/common/configuration/utils.py
@@ -40,6 +40,9 @@ def env_or_default(env_var: str, default: T) -> T:
        return int(value)  # type: ignore
    elif isinstance(default, float):
        return float(value)  # type: ignore
+    elif isinstance(default, list):
+        # Env vars for list options (e.g. DYN_CONNECTOR) are space-separated; downstream expects a list.
+        return [x.strip() for x in value.split() if x.strip()]  # type: ignore
    else:
        return value  # type: ignore
@@ -75,7 +78,11 @@ def add_argument(
    names = [flag_name]
-    env_help = _build_help_message(help, env_var, default_with_env, obsolete_flag)
+    if obsolete_flag:
+        # Accept obsolete flag as an alias (still show deprecation note in help)
+        names.append(obsolete_flag)
+    env_help = _build_help_message(help, env_var, default, obsolete_flag)
    add_arg_opts = {
        "dest": arg_dest,
@@ -126,7 +133,7 @@ def _build_help_message(
    Build help message with env var and default value.
    """
    if obsolete_flag:
-        return f"{help_text}\nenv var: {env_var} | default: {default}\nobsolete flag: {obsolete_flag}"
+        return f"{help_text}\nenv var: {env_var} | default: {default}\ndeprecating flag: {obsolete_flag}"
    return f"{help_text}\nenv var: {env_var} | default: {default}"

--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
--- a/components/src/dynamo/vllm/backend_args.py
+++ b/components/src/dynamo/vllm/backend_args.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Dynamo vLLM wrapper configuration ArgGroup."""
+from typing import Optional
+from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.config_base import ConfigBase
+from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
+from . import __version__
+class DynamoVllmArgGroup(ArgGroup):
+    """vLLM-specific Dynamo wrapper configuration (not native vLLM engine args)."""
+    # Identifier for this group; how ArgGroup uses it is not visible here.
+    name = "dynamo-vllm"
+    def add_arguments(self, parser) -> None:
+        """Add Dynamo vLLM arguments to parser."""
+        # --version is registered on the parser itself rather than on the
+        # "Dynamo vLLM Options" group created below.
+        parser.add_argument(
+            "--version", action="version", version=f"Dynamo Backend VLLM {__version__}"
+        )
+        g = parser.add_argument_group("Dynamo vLLM Options")
+        # Worker role flags. DynamoVllmConfig.validate() rejects setting both
+        # --is-prefill-worker and --is-decode-worker.
+        add_negatable_bool_argument(
+            g,
+            flag_name="--is-prefill-worker",
+            env_var="DYN_VLLM_IS_PREFILL_WORKER",
+            default=False,
+            help="Enable prefill functionality for this worker. Uses the provided namespace to construct dyn://namespace.prefill.generate",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--is-decode-worker",
+            env_var="DYN_VLLM_IS_DECODE_WORKER",
+            default=False,
+            help="Mark this as a decode worker which does not publish KV events",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--use-vllm-tokenizer",
+            env_var="DYN_VLLM_USE_TOKENIZER",
+            default=False,
+            help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.",
+        )
+        # Integer-valued option restricted to the three listed levels.
+        add_argument(
+            g,
+            flag_name="--sleep-mode-level",
+            env_var="DYN_VLLM_SLEEP_MODE_LEVEL",
+            default=1,
+            help="Sleep mode level (1=offload to CPU, 2=discard weights, 3=discard all).",
+            choices=[1, 2, 3],
+            arg_type=int,
+        )
+        # Multimodal
+        # The role flags in this section (and --vllm-native-encoder-worker
+        # further down) are counted by DynamoVllmConfig, which allows at most
+        # one of them and requires --enable-multimodal alongside it.
+        add_negatable_bool_argument(
+            g,
+            flag_name="--multimodal-processor",
+            env_var="DYN_VLLM_MULTIMODAL_PROCESSOR",
+            default=False,
+            help="Run as multimodal processor component for handling multimodal requests.",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--ec-processor",
+            env_var="DYN_VLLM_EC_PROCESSOR",
+            default=False,
+            help="Run as ECConnector processor (routes multimodal requests to encoder then PD workers).",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--multimodal-encode-worker",
+            env_var="DYN_VLLM_MULTIMODAL_ENCODE_WORKER",
+            default=False,
+            help="Run as multimodal encode worker component for processing images/videos.",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--multimodal-worker",
+            env_var="DYN_VLLM_MULTIMODAL_WORKER",
+            default=False,
+            help="Run as multimodal worker component for LLM inference with multimodal data.",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--multimodal-decode-worker",
+            env_var="DYN_VLLM_MULTIMODAL_DECODE_WORKER",
+            default=False,
+            help="Run as multimodal decode worker in disaggregated mode.",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--multimodal-encode-prefill-worker",
+            env_var="DYN_VLLM_MULTIMODAL_ENCODE_PREFILL_WORKER",
+            default=False,
+            help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4).",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--enable-multimodal",
+            env_var="DYN_VLLM_ENABLE_MULTIMODAL",
+            default=False,
+            help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
+        )
+        # Note: the default template contains a real newline after "<image>",
+        # whereas the example in the help text uses a space.
+        add_argument(
+            g,
+            flag_name="--mm-prompt-template",
+            env_var="DYN_VLLM_MM_PROMPT_TEMPLATE",
+            default="USER: <image>\n<prompt> ASSISTANT:",
+            help=(
+                "Different multi-modal models expect the prompt to contain different special media prompts. "
+                "The processor will use this argument to construct the final prompt. "
+                "User prompt will replace '<prompt>' in the provided template. "
+                "For example, if the user prompt is 'please describe the image' and the prompt template is "
+                "'USER: <image> <prompt> ASSISTANT:', the resulting prompt is "
+                "'USER: <image> please describe the image ASSISTANT:'."
+            ),
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--frontend-decoding",
+            env_var="DYN_VLLM_FRONTEND_DECODING",
+            default=False,
+            help=(
+                "Enable frontend decoding of multimodal images. "
+                "When enabled, images are decoded in the Rust frontend and transferred to the backend via NIXL RDMA. "
+                "Without this flag, images are decoded in the Python backend (default behavior)."
+            ),
+        )
+        # vLLM-native encoder (ECConnector)
+        add_negatable_bool_argument(
+            g,
+            flag_name="--vllm-native-encoder-worker",
+            env_var="DYN_VLLM_NATIVE_ENCODER_WORKER",
+            default=False,
+            help="Run as vLLM-native encoder worker using ECConnector for encoder disaggregation (requires shared storage). The following flags only work when this flag is enabled: --ec-connector-backend, --ec-storage-path, --ec-extra-config, --ec-consumer-mode.",
+        )
+        add_argument(
+            g,
+            flag_name="--ec-connector-backend",
+            env_var="DYN_VLLM_EC_CONNECTOR_BACKEND",
+            default="ECExampleConnector",
+            help="ECConnector implementation class for encoder disaggregation.",
+        )
+        # With the default backend above, DynamoVllmConfig requires this path
+        # whenever --vllm-native-encoder-worker is set.
+        add_argument(
+            g,
+            flag_name="--ec-storage-path",
+            env_var="DYN_VLLM_EC_STORAGE_PATH",
+            default=None,
+            help="Storage path for ECConnector (required for ECExampleConnector, optional for other backends).",
+        )
+        # Stored as a raw string; JSON parsing is not performed in this class.
+        add_argument(
+            g,
+            flag_name="--ec-extra-config",
+            env_var="DYN_VLLM_EC_EXTRA_CONFIG",
+            default=None,
+            help="Additional ECConnector configuration as JSON string.",
+        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--ec-consumer-mode",
+            env_var="DYN_VLLM_EC_CONSUMER_MODE",
+            default=False,
+            help="Configure as ECConnector consumer for receiving encoder embeddings (for PD workers).",
+        )
+        # vLLM-Omni
+        add_negatable_bool_argument(
+            g,
+            flag_name="--omni",
+            env_var="DYN_VLLM_OMNI",
+            default=False,
+            help="Run as vLLM-Omni worker for multi-stage pipelines (supports text-to-text, text-to-image, etc.).",
+        )
+        # Optional even with --omni; DynamoVllmConfig rejects it without --omni.
+        add_argument(
+            g,
+            flag_name="--stage-configs-path",
+            env_var="DYN_VLLM_STAGE_CONFIGS_PATH",
+            default=None,
+            help="Path to vLLM-Omni stage configuration YAML file for --omni mode (optional).",
+        )
+class DynamoVllmConfig(ConfigBase):
+    """Configuration for the Dynamo vLLM wrapper (vLLM-specific options only).
+
+    Only ``ec_storage_path``, ``ec_extra_config`` and ``stage_configs_path``
+    declare a class-level default (``None``); every other field is a bare
+    annotation with no default in this class.
+    """
+    is_prefill_worker: bool
+    is_decode_worker: bool
+    use_vllm_tokenizer: bool
+    sleep_mode_level: int
+    # Multimodal
+    multimodal_processor: bool
+    ec_processor: bool
+    multimodal_encode_worker: bool
+    multimodal_worker: bool
+    multimodal_decode_worker: bool
+    multimodal_encode_prefill_worker: bool
+    enable_multimodal: bool
+    mm_prompt_template: str
+    frontend_decoding: bool
+    # vLLM-native encoder (ECConnector)
+    vllm_native_encoder_worker: bool
+    ec_connector_backend: str
+    ec_storage_path: Optional[str] = None
+    ec_extra_config: Optional[str] = None
+    ec_consumer_mode: bool
+    # vLLM-Omni
+    omni: bool
+    stage_configs_path: Optional[str] = None
+    def validate(self) -> None:
+        """Run every vLLM wrapper consistency check.
+
+        Raises:
+            ValueError: on the first check that fails.
+        """
+        self._validate_prefill_decode_exclusive()
+        self._validate_multimodal_role_exclusivity()
+        self._validate_multimodal_requires_flag()
+        self._validate_ec_connector_storage()
+        self._validate_omni_stage_config()
+    def _validate_prefill_decode_exclusive(self) -> None:
+        """Ensure at most one of is_prefill_worker and is_decode_worker is set."""
+        if self.is_prefill_worker and self.is_decode_worker:
+            raise ValueError(
+                "Cannot set both --is-prefill-worker and --is-decode-worker"
+            )
+    def _count_multimodal_roles(self) -> int:
+        """Return how many multimodal role flags are enabled."""
+        roles = (
+            self.multimodal_processor,
+            self.ec_processor,
+            self.multimodal_encode_worker,
+            self.multimodal_worker,
+            self.multimodal_decode_worker,
+            self.multimodal_encode_prefill_worker,
+            self.vllm_native_encoder_worker,
+        )
+        # bool() normalises any truthy/falsy value so each role counts as 0 or 1.
+        return sum(bool(role) for role in roles)
+    def _validate_multimodal_role_exclusivity(self) -> None:
+        """Ensure only one multimodal role is set at a time."""
+        if self._count_multimodal_roles() > 1:
+            raise ValueError(
+                "Only one multimodal role can be set at a time: "
+                "multimodal-processor, ec-processor, multimodal-encode-worker, "
+                "multimodal-worker, multimodal-decode-worker, "
+                "multimodal-encode-prefill-worker, vllm-native-encoder-worker"
+            )
+    def _validate_multimodal_requires_flag(self) -> None:
+        """Require --enable-multimodal when any multimodal role is set."""
+        # ">= 1" rather than "== 1": the check must hold for any number of
+        # roles, not only when the exclusivity check has already run.
+        if self._count_multimodal_roles() >= 1 and not self.enable_multimodal:
+            raise ValueError(
+                "Use --enable-multimodal when enabling any multimodal component"
+            )
+    def _validate_ec_connector_storage(self) -> None:
+        """Require ec_storage_path for a native encoder worker on ECExampleConnector."""
+        # The storage path is only enforced for the vLLM-native encoder worker;
+        # other roles and other backends may leave it unset.
+        if not self.vllm_native_encoder_worker:
+            return
+        if self.ec_connector_backend == "ECExampleConnector" and not self.ec_storage_path:
+            raise ValueError(
+                "--ec-storage-path is required when using ECExampleConnector backend. "
+                "Specify a shared storage path for encoder cache."
+            )
+    def _validate_omni_stage_config(self) -> None:
+        """Reject stage_configs_path unless omni is set (the path itself is optional)."""
+        if self.stage_configs_path and not self.omni:
+            raise ValueError(
+                "--stage-configs-path is only allowed when using --omni. "
+                "Specify a YAML file containing stage configurations for the multi-stage pipeline."
+            )
--- a/components/src/dynamo/vllm/envs.py
+++ b/components/src/dynamo/vllm/envs.py
@@ -12,6 +12,8 @@ import os
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
+# TODO: move this to configuration system.
 # Port range constants
 REGISTERED_PORT_MIN = 1024
 REGISTERED_PORT_MAX = 49151

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -55,7 +55,7 @@ from dynamo.vllm.multimodal_handlers import (
 )
 from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config
-from .args import Config, overwrite_args, parse_args
+from .args import Config, parse_args
 from .chrek import get_checkpoint_config
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
 from .health_check import (
@@ -99,7 +99,6 @@ async def graceful_shutdown(runtime, shutdown_event):
 async def worker():
    config = parse_args()
-    overwrite_args(config)
    dump_config(config.dump_config_to, config)
    # Name the model. Use either the full path (vllm and sglang do the same),
@@ -494,8 +493,8 @@ async def register_vllm_model(
    # Add tool/reasoning parsers for decode models
    if model_type != ModelType.Prefill:
-        runtime_config.tool_call_parser = config.tool_call_parser
+        runtime_config.tool_call_parser = config.dyn_tool_call_parser
-        runtime_config.reasoning_parser = config.reasoning_parser
+        runtime_config.reasoning_parser = config.dyn_reasoning_parser
    # Get data_parallel_size from vllm_config (defaults to 1)
    data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1)
@@ -785,14 +784,14 @@ async def init(
        await _handle_non_leader_node(config.engine_args.data_parallel_rank)
        return
-    # Parse endpoint types from --dyn-endpoint-types flag
+    # Parse endpoint types from --endpoint-types flag
-    model_type = parse_endpoint_types(config.dyn_endpoint_types)
+    model_type = parse_endpoint_types(config.endpoint_types)
-    logger.info(f"Registering model with endpoint types: {config.dyn_endpoint_types}")
+    logger.info(f"Registering model with endpoint types: {config.endpoint_types}")
    model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
    # Warn if custom template provided but chat endpoint not enabled
-    if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
+    if config.custom_jinja_template and "chat" not in config.endpoint_types:
        logger.warning(
            "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
            "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."