chore: Remove static mode (#4235)

Signed-off-by: Graham King <grahamk@nvidia.com>

chore: Remove static mode (#4235)
Signed-off-by: Graham King <grahamk@nvidia.com>
e1af3af6 · Graham King · GitHub · d9b674b8 · e1af3af6 · e1af3af6
Unverified Commit e1af3af6 authored Nov 11, 2025 by Graham King Committed by GitHub Nov 11, 2025
20 changed files
--- a/README.md
+++ b/README.md
@@ -92,9 +92,9 @@ Backend engines require Python development headers for JIT compilation. Install
 sudo apt install python3-dev
 ```
-### Install etcd and NATS (required)
+### Install etcd (optional) and NATS (required)
-To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynamo locally, these need to be available.
+To coordinate across a data center, Dynamo relies on etcd and NATS. These will be used in production. To run Dynamo locally, etcd is optional.
 - [etcd](https://etcd.io/) can be run directly as `./etcd`.
 - [nats](https://nats.io/) needs jetstream enabled: `nats-server -js`.
@@ -106,6 +106,9 @@ To quickly setup etcd & NATS, you can also run:
 docker compose -f deploy/docker-compose.yml up -d
 ```
+To run locally without etcd, pass `--store-kv file` to both the frontend and workers. The directory used for key-value data can be configured via the `DYN_FILE_KV` environment variable (example: `export DYN_FILE_KV=/data/kv/dynamo`); it defaults to `$TMPDIR/dynamo_store_kv`.
 ## 2. Select an engine
 We publish Python wheels specialized for each of our supported engines: vllm, sglang, and trtllm. The examples that follow use SGLang; continue reading for other engines.
@@ -142,11 +145,13 @@ Dynamo provides a simple way to spin up a local set of inference components incl
 ```
 # Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router.
 # Pass the TLS certificate and key paths to use HTTPS instead of HTTP.
-python -m dynamo.frontend --http-port 8000 [--tls-cert-path cert.pem] [--tls-key-path key.pem]
+# Pass `--store-kv file` to use the filesystem instead of etcd. The workers and frontend must share a disk.
+python -m dynamo.frontend --http-port 8000 [--tls-cert-path cert.pem] [--tls-key-path key.pem] [--store-kv file]
 # Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these,
 # both for the same model and for multiple models. The frontend node will discover them.
-python -m dynamo.sglang --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+# Pass `--store-kv file` to use the filesystem instead of etcd. The workers and frontend must share a disk.
+python -m dynamo.sglang --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B [--store-kv file]
 ```
 #### Send a Request
@@ -336,7 +341,7 @@ uv pip install -e .
 You should now be able to run `python -m dynamo.frontend`.
-Remember that nats and etcd must be running (see earlier).
+Remember that nats and etcd must typically be running (see earlier).
 Set the environment variable `DYN_LOG` to adjust the logging level; for example, `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`.

--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -20,7 +20,6 @@ import asyncio
 import logging
 import os
 import pathlib
-import re
 import signal
 import uvloop
@@ -49,18 +48,6 @@ CUSTOM_BACKEND_ENDPOINT_ENV_VAR = "CUSTOM_BACKEND_ENDPOINT"
 logger = logging.getLogger(__name__)
-def validate_static_endpoint(value):
-    """Validate that static-endpoint is three words separated by dots."""
-    if not re.match(
-        r"^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$",
-        value,
-    ):
-        raise argparse.ArgumentTypeError(
-            f"static-endpoint must be three words separated by dots, got: {value}"
-        )
-    return value
 def validate_model_name(value):
    """Validate that model-name is a non-empty string."""
    if not value or not isinstance(value, str) or len(value.strip()) == 0:
@@ -181,11 +168,6 @@ def parse_args():
        default=None,
        help="Threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache usage. If not set, busy detection is disabled.",
    )
-    parser.add_argument(
-        "--static-endpoint",
-        type=validate_static_endpoint,
-        help="Static endpoint in format: word.word.word (e.g., dynamo.backend.generate)",
-    )
    parser.add_argument(
        "--model-name",
        type=validate_model_name,
@@ -234,8 +216,6 @@ def parse_args():
    flags = parser.parse_args()
-    if flags.static_endpoint and (not flags.model_name or not flags.model_path):
-        parser.error("--static-endpoint requires both --model-name and --model-path")
    if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path):  # ^ is XOR
        parser.error("--tls-cert-path and --tls-key-path must be provided together")
    if flags.custom_backend_metrics_polling_interval < 0:
@@ -249,7 +229,6 @@ def parse_args():
 async def async_main():
    flags = parse_args()
    dump_config(flags.dump_config_to, flags)
-    is_static = bool(flags.static_endpoint)  # true if the string has a value
    # Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server)
    if os.environ.get("DYN_SYSTEM_PORT"):
@@ -268,7 +247,7 @@ async def async_main():
            os.environ["DYN_METRICS_PREFIX"] = flags.metrics_prefix
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, flags.store_kv, is_static)
+    runtime = DistributedRuntime(loop, flags.store_kv)
    def signal_handler():
        asyncio.create_task(graceful_shutdown(runtime))
@@ -303,9 +282,6 @@ async def async_main():
        ),
    }
-    if flags.static_endpoint:
-        kwargs["endpoint_id"] = flags.static_endpoint
    if flags.model_name:
        kwargs["model_name"] = flags.model_name
    if flags.model_path:
@@ -325,13 +301,7 @@ async def async_main():
            "custom_backend_metrics_polling_interval"
        ] = flags.custom_backend_metrics_polling_interval
-    if is_static:
+    e = EntrypointArgs(EngineType.Dynamic, **kwargs)
-        # out=dyn://<static_endpoint>
-        engine_type = EngineType.Static
-    else:
-        # out=auto, most common
-        engine_type = EngineType.Dynamic
-    e = EntrypointArgs(engine_type, **kwargs)
    engine = await make_engine(runtime, e)
    try:

--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -72,7 +72,7 @@ async def launch_workers(args, extra_engine_args_path):
        logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
        # Create a separate DistributedRuntime for this worker (on same event loop)
-        runtime = DistributedRuntime(loop, args.store_kv, False)
+        runtime = DistributedRuntime(loop, args.store_kv)
        runtimes.append(runtime)
        # Create EntrypointArgs for this worker

--- a/components/src/dynamo/planner/planner_sla.py
+++ b/components/src/dynamo/planner/planner_sla.py
@@ -33,7 +33,7 @@ class RequestType(BaseModel):
    text: str
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def init_planner(runtime: DistributedRuntime, args):
    await asyncio.sleep(INIT_PLANNER_START_DELAY)

--- a/components/src/dynamo/router/__main__.py
+++ b/components/src/dynamo/router/__main__.py
@@ -220,7 +220,7 @@ def parse_args():
    return parser.parse_args()
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def worker(runtime: DistributedRuntime):
    """Main worker function for the standalone router service."""

--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -38,7 +38,7 @@ async def worker():
    dump_config(config.dynamo_args.dump_config_to, config)
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, config.dynamo_args.store_kv, False)
+    runtime = DistributedRuntime(loop, config.dynamo_args.store_kv)
    def signal_handler():
        asyncio.create_task(graceful_shutdown(runtime))

--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -106,7 +106,7 @@ async def worker():
    config = cmd_line_args()
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, config.store_kv, False)
+    runtime = DistributedRuntime(loop, config.store_kv)
    # Set up signal handler for graceful shutdown
    def signal_handler():

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -75,7 +75,7 @@ async def worker():
    config = parse_args()
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, config.store_kv, False)
+    runtime = DistributedRuntime(loop, config.store_kv)
    await configure_ports(config)
    overwrite_args(config)

--- a/docs/development/backend-guide.md
+++ b/docs/development/backend-guide.md
@@ -21,7 +21,7 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
   # 1. Decorate a function to get the runtime
   #
-   @dynamo_worker(static=False)
+   @dynamo_worker()
   async def worker(runtime: DistributedRuntime):
    # 2. Register ourselves on the network

--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -130,7 +130,7 @@ Example 4: Multiple component in a pipeline.
 In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.generate` (possibly multiple instances of this) and `deepseek-distill-llama8b.decode.generate`.
-For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). The exception is static workers, see that section.
+For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag).
 ### KV-aware routing
@@ -333,7 +333,7 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
   # 1. Decorate a function to get the runtime
   #
-   @dynamo_worker(static=False)
+   @dynamo_worker()
   async def worker(runtime: DistributedRuntime):
    # 2. Register ourselves on the network

--- a/examples/custom_backend/cancellation/client.py
+++ b/examples/custom_backend/cancellation/client.py
@@ -50,7 +50,7 @@ async def main():
            return
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, "mem", True)
+    runtime = DistributedRuntime(loop, "file")
    # Connect to middle server or direct server based on argument
    if use_middle_server:

--- a/examples/custom_backend/cancellation/middle_server.py
+++ b/examples/custom_backend/cancellation/middle_server.py
@@ -50,7 +50,7 @@ class MiddleServer:
 async def main():
    """Start the middle server"""
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, "mem", True)
+    runtime = DistributedRuntime(loop, "file")
    # Create middle server handler
    handler = MiddleServer(runtime)

--- a/examples/custom_backend/cancellation/server.py
+++ b/examples/custom_backend/cancellation/server.py
@@ -31,7 +31,7 @@ class DemoServer:
 async def main():
    """Start the demo server"""
    loop = asyncio.get_running_loop()
-    runtime = DistributedRuntime(loop, "mem", True)
+    runtime = DistributedRuntime(loop, "file")
    # Create server component
    component = runtime.namespace("demo").component("server")

--- a/examples/custom_backend/nim/mock_nim_backend.py
+++ b/examples/custom_backend/nim/mock_nim_backend.py
@@ -24,7 +24,7 @@ from typing import Any, AsyncGenerator
 import uvloop
-from dynamo.runtime import DistributedRuntime, dynamo_worker
+from dynamo.runtime import DistributedRuntime
 # Global counter for incrementing metrics
 request_count = 0
@@ -106,7 +106,7 @@ async def worker(runtime: DistributedRuntime):
    await stats_endpoint.serve_endpoint(handle_stats_request)  # type: ignore[arg-type]
-def main():
+async def main():
    import argparse
    # Parse args first to decide whether to use etcd or the file-based key-value store
@@ -119,12 +119,17 @@ def main():
    # Set static mode based on --use-etcd flag (default is static/no etcd)
    is_static = not args.use_etcd
-    # Create the worker with appropriate static mode
+    loop = asyncio.get_running_loop()
-    worker_func = dynamo_worker(static=is_static)(worker)
+    if is_static:
+        runtime = DistributedRuntime(loop, "file")
+    else:
+        runtime = DistributedRuntime(loop, "etcd")
-    uvloop.install()
+    try:
-    asyncio.run(worker_func())  # type: ignore[arg-type]
+        await worker(runtime)  # type: ignore[arg-type]
+    finally:
+        runtime.shutdown()
 if __name__ == "__main__":
-    main()
+    uvloop.run(main())
--- a/examples/custom_backend/nim/mock_nim_frontend.py
+++ b/examples/custom_backend/nim/mock_nim_frontend.py
@@ -122,8 +122,7 @@ async def async_main():
    loop = asyncio.get_running_loop()
    # Create DistributedRuntime - similar to frontend/main.py line 246
-    is_static = True  # Use static mode (no etcd)
+    runtime = DistributedRuntime(loop, "file")  # type: ignore[call-arg]
-    runtime = DistributedRuntime(loop, "mem", is_static)  # type: ignore[call-arg]
    # Setup signal handlers for graceful shutdown
    def signal_handler():

--- a/examples/multimodal/components/encode_worker.py
+++ b/examples/multimodal/components/encode_worker.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import argparse
 import asyncio
@@ -213,7 +201,7 @@ async def graceful_shutdown(runtime):
    logging.info("DistributedRuntime shutdown complete")
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def worker(runtime: DistributedRuntime):
    # Runtime setup
    # Set up signal handler for graceful shutdown

--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import argparse
 import asyncio
@@ -279,7 +267,7 @@ async def graceful_shutdown(runtime):
    logging.info("DistributedRuntime shutdown complete")
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def worker(runtime: DistributedRuntime):
    # Runtime setup
    # Set up signal handler for graceful shutdown

--- a/examples/multimodal/components/video_encode_worker.py
+++ b/examples/multimodal/components/video_encode_worker.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import argparse
 import asyncio
@@ -260,7 +248,7 @@ async def graceful_shutdown(runtime):
    logging.info("DistributedRuntime shutdown complete")
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def worker(runtime: DistributedRuntime):
    # Runtime setup
    # Set up signal handler for graceful shutdown

--- a/examples/multimodal/components/worker.py
+++ b/examples/multimodal/components/worker.py
@@ -400,7 +400,7 @@ async def graceful_shutdown(runtime):
    logging.info("DistributedRuntime shutdown complete")
-@dynamo_worker(static=False)
+@dynamo_worker()
 async def worker(runtime: DistributedRuntime):
    # Runtime setup
    # Set up signal handler for graceful shutdown

--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -6,7 +6,6 @@ use std::path::PathBuf;
 use clap::ValueEnum;
 use dynamo_llm::entrypoint::RouterConfig;
-use dynamo_llm::entrypoint::input::Input;
 use dynamo_llm::kv_router::KvRouterConfig;
 use dynamo_llm::mocker::protocols::MockEngineArgs;
 use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
@@ -121,12 +120,6 @@ pub struct Flags {
    #[arg(long, value_parser = clap::value_parser!(u32).range(0..1024))]
    pub migration_limit: Option<u32>,
-    /// Make this a static worker.
-    /// Do not connect to or advertise self on etcd.
-    /// in=dyn://x.y.z only
-    #[arg(long, default_value = "false")]
-    pub static_worker: bool,
    /// Which key-value backend to use: etcd, mem, file.
    /// Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details.
    /// File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.
@@ -142,16 +135,7 @@ pub struct Flags {
 impl Flags {
    /// For each Output variant, check if it would be able to run.
    /// This takes validation out of the main engine creation path.
-    pub fn validate(&self, in_opt: &Input, out_opt: &Output) -> anyhow::Result<()> {
+    pub fn validate(&self, out_opt: &Output) -> anyhow::Result<()> {
-        match in_opt {
-            Input::Endpoint(_) => {}
-            _ => {
-                if self.static_worker {
-                    anyhow::bail!("'--static-worker true' only applies to in=dyn://x.y.z");
-                }
-            }
-        }
        match out_opt {
            Output::Auto => {
                if self.context_length.is_some() {
@@ -170,19 +154,6 @@ impl Flags {
                    );
                }
            }
-            Output::Static(_) => {
-                if self.model_name.is_none()
-                    || self
-                        .model_path_pos
-                        .as_ref()
-                        .or(self.model_path_flag.as_ref())
-                        .is_none()
-                {
-                    anyhow::bail!(
-                        "out=dyn://<path> requires --model-name and --model-path, which are the name and path on disk of the model we expect to serve."
-                    );
-                }
-            }
            Output::Echo => {}
            #[cfg(feature = "mistralrs")]
            Output::MistralRs => {}