Unverified Commit e1af3af6 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove static mode (#4235)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent d9b674b8
......@@ -92,9 +92,9 @@ Backend engines require Python development headers for JIT compilation. Install
sudo apt install python3-dev
```
### Install etcd and NATS (required)
### Install etcd (optional) and NATS (required)
To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynamo locally, these need to be available.
To coordinate across a data center, Dynamo relies on etcd and NATS; both are used in production. To run Dynamo locally, etcd is optional.
- [etcd](https://etcd.io/) can be run directly as `./etcd`.
- [nats](https://nats.io/) needs jetstream enabled: `nats-server -js`.
......@@ -106,6 +106,9 @@ To quickly setup etcd & NATS, you can also run:
docker compose -f deploy/docker-compose.yml up -d
```
To run locally without etcd, pass `--store-kv file` to both the frontend and workers. The directory used for key-value data can be configured via the `DYN_FILE_KV` environment variable (example: `export DYN_FILE_KV=/data/kv/dynamo`). It defaults to `$TMPDIR/dynamo_store_kv`.
## 2. Select an engine
We publish Python wheels specialized for each of our supported engines: vllm, sglang, and trtllm. The examples that follow use SGLang; continue reading for other engines.
......@@ -142,11 +145,13 @@ Dynamo provides a simple way to spin up a local set of inference components incl
```
# Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router.
# Pass the TLS certificate and key paths to use HTTPS instead of HTTP.
python -m dynamo.frontend --http-port 8000 [--tls-cert-path cert.pem] [--tls-key-path key.pem]
# Pass --store-kv to use the filesystem instead of etcd. The workers and frontend must share a disk.
python -m dynamo.frontend --http-port 8000 [--tls-cert-path cert.pem] [--tls-key-path key.pem] [--store-kv file]
# Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these,
# both for the same model and for multiple models. The frontend node will discover them.
python -m dynamo.sglang --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Pass --store-kv to use the filesystem instead of etcd. The workers and frontend must share a disk.
python -m dynamo.sglang --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B [--store-kv file]
```
#### Send a Request
......@@ -336,7 +341,7 @@ uv pip install -e .
You should now be able to run `python -m dynamo.frontend`.
Remember that nats and etcd must be running (see earlier).
Remember that NATS must be running, and etcd as well unless you pass `--store-kv file` (see earlier).
Set the environment variable `DYN_LOG` to adjust the logging level; for example, `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`.
......
......@@ -20,7 +20,6 @@ import asyncio
import logging
import os
import pathlib
import re
import signal
import uvloop
......@@ -49,18 +48,6 @@ CUSTOM_BACKEND_ENDPOINT_ENV_VAR = "CUSTOM_BACKEND_ENDPOINT"
logger = logging.getLogger(__name__)
def validate_static_endpoint(value):
"""Validate that static-endpoint is three words separated by dots."""
if not re.match(
r"^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$",
value,
):
raise argparse.ArgumentTypeError(
f"static-endpoint must be three words separated by dots, got: {value}"
)
return value
def validate_model_name(value):
"""Validate that model-name is a non-empty string."""
if not value or not isinstance(value, str) or len(value.strip()) == 0:
......@@ -181,11 +168,6 @@ def parse_args():
default=None,
help="Threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache usage. If not set, busy detection is disabled.",
)
parser.add_argument(
"--static-endpoint",
type=validate_static_endpoint,
help="Static endpoint in format: word.word.word (e.g., dynamo.backend.generate)",
)
parser.add_argument(
"--model-name",
type=validate_model_name,
......@@ -234,8 +216,6 @@ def parse_args():
flags = parser.parse_args()
if flags.static_endpoint and (not flags.model_name or not flags.model_path):
parser.error("--static-endpoint requires both --model-name and --model-path")
if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path): # ^ is XOR
parser.error("--tls-cert-path and --tls-key-path must be provided together")
if flags.custom_backend_metrics_polling_interval < 0:
......@@ -249,7 +229,6 @@ def parse_args():
async def async_main():
flags = parse_args()
dump_config(flags.dump_config_to, flags)
is_static = bool(flags.static_endpoint) # true if the string has a value
# Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server)
if os.environ.get("DYN_SYSTEM_PORT"):
......@@ -268,7 +247,7 @@ async def async_main():
os.environ["DYN_METRICS_PREFIX"] = flags.metrics_prefix
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, flags.store_kv, is_static)
runtime = DistributedRuntime(loop, flags.store_kv)
def signal_handler():
asyncio.create_task(graceful_shutdown(runtime))
......@@ -303,9 +282,6 @@ async def async_main():
),
}
if flags.static_endpoint:
kwargs["endpoint_id"] = flags.static_endpoint
if flags.model_name:
kwargs["model_name"] = flags.model_name
if flags.model_path:
......@@ -325,13 +301,7 @@ async def async_main():
"custom_backend_metrics_polling_interval"
] = flags.custom_backend_metrics_polling_interval
if is_static:
# out=dyn://<static_endpoint>
engine_type = EngineType.Static
else:
# out=auto, most common
engine_type = EngineType.Dynamic
e = EntrypointArgs(engine_type, **kwargs)
e = EntrypointArgs(EngineType.Dynamic, **kwargs)
engine = await make_engine(runtime, e)
try:
......
......@@ -72,7 +72,7 @@ async def launch_workers(args, extra_engine_args_path):
logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
# Create a separate DistributedRuntime for this worker (on same event loop)
runtime = DistributedRuntime(loop, args.store_kv, False)
runtime = DistributedRuntime(loop, args.store_kv)
runtimes.append(runtime)
# Create EntrypointArgs for this worker
......
......@@ -33,7 +33,7 @@ class RequestType(BaseModel):
text: str
@dynamo_worker(static=False)
@dynamo_worker()
async def init_planner(runtime: DistributedRuntime, args):
await asyncio.sleep(INIT_PLANNER_START_DELAY)
......
......@@ -220,7 +220,7 @@ def parse_args():
return parser.parse_args()
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
"""Main worker function for the standalone router service."""
......
......@@ -38,7 +38,7 @@ async def worker():
dump_config(config.dynamo_args.dump_config_to, config)
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, config.dynamo_args.store_kv, False)
runtime = DistributedRuntime(loop, config.dynamo_args.store_kv)
def signal_handler():
asyncio.create_task(graceful_shutdown(runtime))
......
......@@ -106,7 +106,7 @@ async def worker():
config = cmd_line_args()
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, config.store_kv, False)
runtime = DistributedRuntime(loop, config.store_kv)
# Set up signal handler for graceful shutdown
def signal_handler():
......
......@@ -75,7 +75,7 @@ async def worker():
config = parse_args()
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, config.store_kv, False)
runtime = DistributedRuntime(loop, config.store_kv)
await configure_ports(config)
overwrite_args(config)
......
......@@ -21,7 +21,7 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 1. Decorate a function to get the runtime
#
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# 2. Register ourselves on the network
......
......@@ -130,7 +130,7 @@ Example 4: Multiple component in a pipeline.
In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.generate` (possibly multiple instances of this) and `deepseek-distill-llama8b.decode.generate`.
For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). The exception is static workers, see that section.
For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag).
### KV-aware routing
......@@ -333,7 +333,7 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 1. Decorate a function to get the runtime
#
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# 2. Register ourselves on the network
......
......@@ -50,7 +50,7 @@ async def main():
return
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, "mem", True)
runtime = DistributedRuntime(loop, "file")
# Connect to middle server or direct server based on argument
if use_middle_server:
......
......@@ -50,7 +50,7 @@ class MiddleServer:
async def main():
"""Start the middle server"""
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, "mem", True)
runtime = DistributedRuntime(loop, "file")
# Create middle server handler
handler = MiddleServer(runtime)
......
......@@ -31,7 +31,7 @@ class DemoServer:
async def main():
"""Start the demo server"""
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, "mem", True)
runtime = DistributedRuntime(loop, "file")
# Create server component
component = runtime.namespace("demo").component("server")
......
......@@ -24,7 +24,7 @@ from typing import Any, AsyncGenerator
import uvloop
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime import DistributedRuntime
# Global counter for incrementing metrics
request_count = 0
......@@ -106,7 +106,7 @@ async def worker(runtime: DistributedRuntime):
await stats_endpoint.serve_endpoint(handle_stats_request) # type: ignore[arg-type]
def main():
async def main():
import argparse
# Parse args before calling dynamo_worker to determine static mode
......@@ -119,12 +119,17 @@ def main():
# Set static mode based on --use-etcd flag (default is static/no etcd)
is_static = not args.use_etcd
# Create the worker with appropriate static mode
worker_func = dynamo_worker(static=is_static)(worker)
loop = asyncio.get_running_loop()
if is_static:
runtime = DistributedRuntime(loop, "file")
else:
runtime = DistributedRuntime(loop, "etcd")
uvloop.install()
asyncio.run(worker_func()) # type: ignore[arg-type]
try:
await worker(runtime) # type: ignore[arg-type]
finally:
runtime.shutdown()
if __name__ == "__main__":
main()
uvloop.run(main())
......@@ -122,8 +122,7 @@ async def async_main():
loop = asyncio.get_running_loop()
# Create DistributedRuntime - similar to frontend/main.py line 246
is_static = True # Use static mode (no etcd)
runtime = DistributedRuntime(loop, "mem", is_static) # type: ignore[call-arg]
runtime = DistributedRuntime(loop, "file") # type: ignore[call-arg]
# Setup signal handlers for graceful shutdown
def signal_handler():
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
......@@ -213,7 +201,7 @@ async def graceful_shutdown(runtime):
logging.info("DistributedRuntime shutdown complete")
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# Runtime setup
# Set up signal handler for graceful shutdown
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
......@@ -279,7 +267,7 @@ async def graceful_shutdown(runtime):
logging.info("DistributedRuntime shutdown complete")
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# Runtime setup
# Set up signal handler for graceful shutdown
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
......@@ -260,7 +248,7 @@ async def graceful_shutdown(runtime):
logging.info("DistributedRuntime shutdown complete")
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# Runtime setup
# Set up signal handler for graceful shutdown
......
......@@ -400,7 +400,7 @@ async def graceful_shutdown(runtime):
logging.info("DistributedRuntime shutdown complete")
@dynamo_worker(static=False)
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
# Runtime setup
# Set up signal handler for graceful shutdown
......
......@@ -6,7 +6,6 @@ use std::path::PathBuf;
use clap::ValueEnum;
use dynamo_llm::entrypoint::RouterConfig;
use dynamo_llm::entrypoint::input::Input;
use dynamo_llm::kv_router::KvRouterConfig;
use dynamo_llm::mocker::protocols::MockEngineArgs;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
......@@ -121,12 +120,6 @@ pub struct Flags {
#[arg(long, value_parser = clap::value_parser!(u32).range(0..1024))]
pub migration_limit: Option<u32>,
/// Make this a static worker.
/// Do not connect to or advertise self on etcd.
/// in=dyn://x.y.z only
#[arg(long, default_value = "false")]
pub static_worker: bool,
/// Which key-value backend to use: etcd, mem, file.
/// Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details.
/// File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.
......@@ -142,16 +135,7 @@ pub struct Flags {
impl Flags {
/// For each Output variant, check if it would be able to run.
/// This takes validation out of the main engine creation path.
pub fn validate(&self, in_opt: &Input, out_opt: &Output) -> anyhow::Result<()> {
match in_opt {
Input::Endpoint(_) => {}
_ => {
if self.static_worker {
anyhow::bail!("'--static-worker true' only applies to in=dyn://x.y.z");
}
}
}
pub fn validate(&self, out_opt: &Output) -> anyhow::Result<()> {
match out_opt {
Output::Auto => {
if self.context_length.is_some() {
......@@ -170,19 +154,6 @@ impl Flags {
);
}
}
Output::Static(_) => {
if self.model_name.is_none()
|| self
.model_path_pos
.as_ref()
.or(self.model_path_flag.as_ref())
.is_none()
{
anyhow::bail!(
"out=dyn://<path> requires --model-name and --model-path, which are the name and path on disk of the model we expect to serve."
);
}
}
Output::Echo => {}
#[cfg(feature = "mistralrs")]
Output::MistralRs => {}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment