Unverified commit 42969800 authored by Graham King, committed by GitHub

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated the docs on running vllm and sglang multi-GPU (tensor parallel) and multi-node (pipeline parallel).
parent 5d89a0c8
...@@ -1533,68 +1533,6 @@ dependencies = [ ...@@ -1533,68 +1533,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "dynamo-engine-sglang"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_7"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_8"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"regex",
"serde",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.2.0" version = "0.2.0"
...@@ -1671,18 +1609,13 @@ dependencies = [ ...@@ -1671,18 +1609,13 @@ dependencies = [
"dynamo-engine-llamacpp", "dynamo-engine-llamacpp",
"dynamo-engine-mistralrs", "dynamo-engine-mistralrs",
"dynamo-engine-python", "dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-vllm0_7",
"dynamo-engine-vllm0_8",
"dynamo-llm", "dynamo-llm",
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
"futures-util", "futures-util",
"humantime", "humantime",
"libc", "libc",
"netlink-packet-route",
"regex", "regex",
"rtnetlink",
"serde", "serde",
"serde_json", "serde_json",
"tempfile", "tempfile",
...@@ -1718,7 +1651,7 @@ dependencies = [ ...@@ -1718,7 +1651,7 @@ dependencies = [
"local-ip-address", "local-ip-address",
"log", "log",
"nid", "nid",
"nix 0.29.0", "nix",
"nuid", "nuid",
"once_cell", "once_cell",
"prometheus", "prometheus",
...@@ -3165,12 +3098,6 @@ version = "1.70.1" ...@@ -3165,12 +3098,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.5" version = "0.10.5"
...@@ -3936,70 +3863,6 @@ dependencies = [ ...@@ -3936,70 +3863,6 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "netlink-packet-core"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4"
dependencies = [
"anyhow",
"byteorder",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-route"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74c171cd77b4ee8c7708da746ce392440cb7bcf618d122ec9ecc607b12938bf4"
dependencies = [
"anyhow",
"byteorder",
"libc",
"log",
"netlink-packet-core",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-utils"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34"
dependencies = [
"anyhow",
"byteorder",
"paste",
"thiserror 1.0.69",
]
[[package]]
name = "netlink-proto"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60"
dependencies = [
"bytes",
"futures",
"log",
"netlink-packet-core",
"netlink-sys",
"thiserror 2.0.12",
]
[[package]]
name = "netlink-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23"
dependencies = [
"bytes",
"futures",
"libc",
"log",
"tokio",
]
[[package]] [[package]]
name = "nibble_vec" name = "nibble_vec"
version = "0.1.0" version = "0.1.0"
...@@ -4020,17 +3883,6 @@ dependencies = [ ...@@ -4020,17 +3883,6 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
] ]
[[package]]
name = "nix"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
"bitflags 2.9.0",
"cfg-if 1.0.0",
"libc",
]
[[package]] [[package]]
name = "nix" name = "nix"
version = "0.29.0" version = "0.29.0"
...@@ -5357,24 +5209,6 @@ dependencies = [ ...@@ -5357,24 +5209,6 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "rtnetlink"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b684475344d8df1859ddb2d395dd3dac4f8f3422a1aa0725993cb375fc5caba5"
dependencies = [
"futures",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-packet-utils",
"netlink-proto",
"netlink-sys",
"nix 0.27.1",
"thiserror 1.0.69",
"tokio",
]
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
version = "0.1.24" version = "0.1.24"
...@@ -5697,19 +5531,6 @@ dependencies = [ ...@@ -5697,19 +5531,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.219" version = "1.0.219"
......
...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS ...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS
ENV CARGO_TARGET_DIR=/workspace/target ENV CARGO_TARGET_DIR=/workspace/target
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python && \ RUN cargo build --release --locked --features mistralrs,python && \
cargo doc --no-deps && \ cargo doc --no-deps && \
cp target/release/dynamo-run /usr/local/bin && \ cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \ cp target/release/http /usr/local/bin && \
......
...@@ -173,7 +173,7 @@ COPY launch /workspace/launch ...@@ -173,7 +173,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -59,7 +59,9 @@ RUN apt-get update -y && \ ...@@ -59,7 +59,9 @@ RUN apt-get update -y && \
ninja-build \ ninja-build \
pybind11-dev \ pybind11-dev \
# Rust build dependencies # Rust build dependencies
clang \
libclang-dev \ libclang-dev \
git \
# Install utilities # Install utilities
nvtop \ nvtop \
tmux \ tmux \
...@@ -305,7 +307,7 @@ COPY launch /workspace/launch ...@@ -305,7 +307,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -3,25 +3,39 @@ ...@@ -3,25 +3,39 @@
* [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm) * [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
* [Automatically download a model from Hugging Face](#use-model-from-hugging-face) * [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
* [Run a model from local file](#run-a-model-from-local-file) * [Run a model from local file](#run-a-model-from-local-file)
* [Multi-node](#multi-node) * [Distributed system](#distributed-system)
* [Compiling from Source](#compiling-from-source) * [Full usage details](#full-usage-details)
* [Setup](#setup) * [Setup](#setup)
* [mistral.rs](#mistralrs)
* [llama.cpp](#llamacpp)
* [Sglang](#sglang) * [Sglang](#sglang)
* [lama.cpp](#llama_cpp)
* [Vllm](#vllm) * [Vllm](#vllm)
* [Python bring-your-own-engine](#python-bring-your-own-engine)
* [TensorRT-LLM](#tensorrt-llm-engine) * [TensorRT-LLM](#tensorrt-llm-engine)
* [Echo Engines](#echo-engines) * [Echo Engines](#echo-engines)
* [Write your own engine in Python](#write-your-own-engine-in-python)
* [Batch mode](#batch-mode) * [Batch mode](#batch-mode)
* [Defaults](#defaults) * [Defaults](#defaults)
* [Extra engine arguments](#extra-engine-arguments) * [Extra engine arguments](#extra-engine-arguments)
`dynamo-run` is a CLI tool for exploring the Dynamo components, and an example of how to use them from Rust. It is also available as `dynamo run` if using the Python wheel. `dynamo-run` is a CLI tool for exploring the Dynamo components, and an example of how to use them from Rust. It is also available as `dynamo run` if using the Python wheel.
It supports the following engines: mistralrs, llamacpp, sglang, vllm and tensorrt-llm. `mistralrs` is the default.
Usage:
```
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]
```
Example: `dynamo run Qwen/Qwen2.5-3B-Instruct`.
Set environment variable `DYN_LOG` to adjust the logging level, e.g. `export DYN_LOG=debug`. It uses the same syntax as `RUST_LOG`.
## Quickstart with pip and vllm ## Quickstart with pip and vllm
If you used `pip` to install `dynamo` you should have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual env with vllm installed to use this. To compile from source, see "Full documentation" below. If you used `pip` to install `dynamo` you should have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual env with vllm installed to use this. To compile from source, see "Full documentation" below.
The vllm and sglang engines require [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`). Mistralrs and llamacpp do not.
### Use model from Hugging Face ### Use model from Hugging Face
This will automatically download Qwen2.5 3B from Hugging Face (6 GiB download) and start it in interactive text mode: This will automatically download Qwen2.5 3B from Hugging Face (6 GiB download) and start it in interactive text mode:
...@@ -69,18 +83,26 @@ curl localhost:8080/v1/models ...@@ -69,18 +83,26 @@ curl localhost:8080/v1/models
curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions
``` ```
### Multi-node ### Distributed System
You can run the ingress side (HTTP server and pre-processing) on one machine, for example a CPU node, and the worker on a different machine (a GPU node).
You will need [etcd](https://etcd.io/) and [nats](https://nats.io) installed and accessible from both nodes. You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstream installed and accessible from both nodes.
**Node 1:** **Node 1:**
OpenAI compliant HTTP server, optional pre-processing, worker discovery.
``` ```
dynamo run in=http out=dyn://llama3B_pool dynamo run in=http out=dyn://llama3B_pool
``` ```
**Node 2:** **Node 2:**
Vllm engine. Receives and returns requests over the network.
``` ```
dynamo run in=dyn://llama3B_pool out=vllm ~/llm_models/Llama-3.2-3B-Instruct dynamo-run in=dyn://llama3B_pool out=vllm ~/llms/Llama-3.2-3B-Instruct
``` ```
This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time. This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time.
...@@ -89,7 +111,7 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches ...@@ -89,7 +111,7 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches
Run `dynamo run --help` for more options. Run `dynamo run --help` for more options.
## Compiling from Source ## Full usage details
`dynamo-run` is what `dynamo run` executes. It is an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime`. The following guide demonstrates how you can build from source with all the features. `dynamo-run` is what `dynamo run` executes. It is an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime`. The following guide demonstrates how you can build from source with all the features.
...@@ -125,15 +147,6 @@ source $HOME/.cargo/env ...@@ -125,15 +147,6 @@ source $HOME/.cargo/env
#### Step 3: Build #### Step 3: Build
Run `cargo build` to install the `dynamo-run` binary in `target/debug`.
> **Optionally**, you can run `cargo build` from any location with arguments:
> ```
> --target-dir /path/to/target_directory` specify target_directory with write privileges
> --manifest-path /path/to/project/Cargo.toml` if cargo build is run outside of `launch/` directory
> ```
- Linux with GPU and CUDA (tested on Ubuntu): - Linux with GPU and CUDA (tested on Ubuntu):
``` ```
cargo build --features cuda cargo build --features cuda
...@@ -149,17 +162,50 @@ cargo build --features metal ...@@ -149,17 +162,50 @@ cargo build --features metal
cargo build cargo build
``` ```
Optionally you can run `cargo build` from any location with arguments:
```
--target-dir /path/to/target_directory        # specify a target_directory with write privileges
--manifest-path /path/to/project/Cargo.toml   # if cargo build is run outside of the launch/ directory
```
The binary will be called `dynamo-run` in `target/debug` The binary will be called `dynamo-run` in `target/debug`
``` ```
cd target/debug cd target/debug
``` ```
> Note: Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`.
To build for other engines, see the following sections. Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`.
### mistralrs
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run, fast to load, supports GGUF as well as safetensors, and runs well on CPU as well as GPU. For those reasons it is the default engine.
```
dynamo-run Qwen/Qwen2.5-3B-Instruct
```
is equivalent to
```
dynamo-run in=text out=mistralrs Qwen/Qwen2.5-3B-Instruct
```
### llamacpp
Currently [llama.cpp](https://github.com/ggml-org/llama.cpp) is not included by default. Build it like this:
```
cargo build --features llamacpp[,cuda|metal|vulkan] -p dynamo-run
```
```
dynamo-run out=llamacpp ~/llms/Llama-3.2-3B-Instruct-Q6_K.gguf
```
### sglang ### sglang
The [SGLang](https://docs.sglang.ai/index.html) engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`) to be running.
1. Setup the python virtual env: 1. Setup the python virtual env:
``` ```
...@@ -170,42 +216,49 @@ uv pip install sgl-kernel --force-reinstall --no-deps ...@@ -170,42 +216,49 @@ uv pip install sgl-kernel --force-reinstall --no-deps
uv pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ uv pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
``` ```
2. Build 2. Run
Any example above using `out=sglang` will work, but our sglang backend is also multi-gpu.
``` ```
cargo build --features sglang cd target/debug
./dynamo-run in=http out=sglang --model-path ~/llms/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8
``` ```
3. Run To pass extra arguments to the sglang engine see *Extra engine arguments* below.
Any example above using `out=sglang` will work, but our sglang backend is also multi-gpu and multi-node. **Multi-GPU**
**Node 1:** Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`. To specify which GPU to start from pass `--base-gpu-id <num>`.
```
cd target/debug
./dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --leader-addr 10.217.98.122:9876
```
**Node 2:** For example on a shared eight GPU machine where GPUs 0-3 are already in use:
``` ```
cd target/debug dynamo-run out=sglang <model> --tensor-parallel-size 4 --base-gpu-id 4
./dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --leader-addr 10.217.98.122:9876
``` ```
To pass extra arguments to the sglang engine see *Extra engine arguments* below. **Multi-node:**
### llama_cpp Dynamo only manages the leader node (node rank 0). The follower nodes are started in the [normal sglang way](https://docs.sglang.ai/references/deepseek.html#running-examples-on-multi-node).
Leader node:
``` ```
cargo build --features llamacpp,cuda dynamo-run out=sglang /data/models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 16 --node-rank 0 --num-nodes 2 --leader-addr 10.217.98.122:5000
cd target/debug ```
dynamo-run out=llamacpp ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
All follower nodes. Increment `node-rank` each time:
```
python3 -m sglang.launch_server --model-path /data/models/DeepSeek-R1-Distill-Llama-70B --tp 16 --dist-init-addr 10.217.98.122:5000 --nnodes 2 --node-rank 1 --trust-remote-code
``` ```
If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
- Parameters `--leader-addr` and `--dist-init-addr` must match and be the IP address of the leader node. All followers must be able to connect. SGLang uses [PyTorch Distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) for networking.
- Parameters `--tensor-parallel-size` and `--tp` must match and be the total number of GPUs across the cluster.
- `--node-rank` values must be unique consecutive integers starting at 1. The leader, managed by Dynamo, is rank 0.
### vllm ### vllm
Using the [vllm](https://github.com/vllm-project/vllm) Python library. We only use the back half of vllm, talking to it over `zmq`. Slow startup, fast inference. Supports both safetensors from HF and GGUF files. Using the [vllm](https://github.com/vllm-project/vllm) Python library. Slow startup, fast inference. Supports both safetensors from HF and GGUF files, but is very slow for GGUF - prefer llamacpp.
The vllm engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`) to be running.
We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work. We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work.
...@@ -230,100 +283,32 @@ Inside that virtualenv: ...@@ -230,100 +283,32 @@ Inside that virtualenv:
**HF repo:** **HF repo:**
``` ```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ ./dynamo-run in=http out=vllm ~/llms/Llama-3.2-3B-Instruct/
```
**GGUF:**
``` ```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
```
Note that vllm GGUF handling is very slow. Prefer llamacpp.
**Multi-node:**
vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
Head node (the one running `dynamo-run`): `ray start --head --port=6379 --dashboard-host 0.0.0.0`
Each worker node: `ray start --address='<HEAD_NODE_IP>:6379`
Remember to pass dynamo-run `--tensor-parallel-size <total-gpus-across-cluster>`, which is often constrained by a model dimension such as being a divisor of the number of attention heads.
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below. To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below.
### Python bring-your-own-engine **Multi-GPU**
You can provide your own engine in a Python file. The file must provide a generator with this signature:
```
async def generate(request):
```
Build: `cargo build --features python`
#### Python does the pre-processing
If the Python engine wants to receive and returns strings - it will do the prompt templating and tokenization itself - run it like this:
```
dynamo-run out=pystr:/home/user/my_python_engine.py
```
- The `request` parameter is a map, an OpenAI compatible create chat completion request: https://platform.openai.com/docs/api-reference/chat/create
- The function must `yield` a series of maps conforming to create chat completion stream response (example below).
- If using an HTTP front-end add the `--model-name` flag. This is the name we serve the model under.
The file is loaded once at startup and kept in memory.
**Example engine:** Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`.
```
import asyncio
async def generate(request):
yield {"id":"1","choices":[{"index":0,"delta":{"content":"The","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" capital","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" of","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" France","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" is","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" Paris","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":".","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":"","role":"assistant"},"finish_reason":"stop"}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
```
Command line arguments are passed to the python engine like this: To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`.
```
dynamo-run out=pystr:my_python_engine.py -- -n 42 --custom-arg Orange --yes
```
The python engine receives the arguments in `sys.argv`. The argument list will include some standard ones as well as anything after the `--`. **Multi-node:**
This input: vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
```
dynamo-run out=pystr:my_engine.py /opt/models/Llama-3.2-3B-Instruct/ --model-name llama_3.2 --tensor-parallel-size 4 -- -n 1
```
is read like this: Here is an example on two 8-GPU nodes:
``` - Leader node: `ray start --head --port=6379`
async def generate(request): - Each follower node: `ray start --address='<HEAD_NODE_IP>:6379'`
.. as before .. - Leader node: `dynamo-run out=vllm ~/llms/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 16`
if __name__ == "__main__": The `--tensor-parallel-size` parameter is the total number of GPUs in the cluster. This is often constrained by a model dimension such as being a divisor of the number of attention heads.
print(f"MAIN: {sys.argv}")
```
and produces this output: Startup can be slow so you may want to `export DYN_LOG=debug` to see progress.
```
MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
```
This allows quick iteration on the engine setup. Note how the `-n` `1` is included. Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`. Shutdown: `ray stop`
#### TensorRT-LLM engine #### TensorRT-LLM engine
...@@ -345,47 +330,6 @@ Execute the following to load the TensorRT-LLM model specified in the configurat ...@@ -345,47 +330,6 @@ Execute the following to load the TensorRT-LLM model specified in the configurat
dynamo run out=pystr:/workspace/examples/tensorrt_llm/engines/trtllm_engine.py -- --engine_args /workspace/examples/tensorrt_llm/configs/llm_api_config.yaml dynamo run out=pystr:/workspace/examples/tensorrt_llm/engines/trtllm_engine.py -- --engine_args /workspace/examples/tensorrt_llm/configs/llm_api_config.yaml
``` ```
#### Dynamo does the pre-processing
If the Python engine wants to receive and return tokens - the prompt templating and tokenization is already done - run it like this:
```
dynamo-run out=pytok:/home/user/my_python_engine.py --model-path <hf-repo-checkout>
```
- The request parameter is a map that looks like this:
```
{'token_ids': [128000, 128006, 9125, 128007, ... lots more ... ], 'stop_conditions': {'max_tokens': 8192, 'stop': None, 'stop_token_ids_hidden': [128001, 128008, 128009], 'min_tokens': None, 'ignore_eos': None}, 'sampling_options': {'n': None, 'best_of': None, 'presence_penalty': None, 'frequency_penalty': None, 'repetition_penalty': None, 'temperature': None, 'top_p': None, 'top_k': None, 'min_p': None, 'use_beam_search': None, 'length_penalty': None, 'seed': None}, 'eos_token_ids': [128001, 128008, 128009], 'mdc_sum': 'f1cd44546fdcbd664189863b7daece0f139a962b89778469e4cffc9be58ccc88', 'annotations': []}
```
- The `generate` function must `yield` a series of maps that look like this:
```
{"token_ids":[791],"tokens":None,"text":None,"cum_log_probs":None,"log_probs":None,"finish_reason":None}
```
- Command like flag `--model-path` which must point to a Hugging Face repo checkout containing the `tokenizer.json`. The `--model-name` flag is optional. If not provided we use the HF repo name (directory name) as the model name.
**Example engine:**
```
import asyncio
async def generate(request):
yield {"token_ids":[791]}
await asyncio.sleep(0.1)
yield {"token_ids":[6864]}
await asyncio.sleep(0.1)
yield {"token_ids":[315]}
await asyncio.sleep(0.1)
yield {"token_ids":[9822]}
await asyncio.sleep(0.1)
yield {"token_ids":[374]}
await asyncio.sleep(0.1)
yield {"token_ids":[12366]}
await asyncio.sleep(0.1)
yield {"token_ids":[13]}
```
`pytok` supports the same ways of passing command line arguments as `pystr` - `initialize` or `main` with `sys.argv`.
### Echo Engines ### Echo Engines
Dynamo includes two echo engines for testing and debugging purposes: Dynamo includes two echo engines for testing and debugging purposes:
...@@ -445,9 +389,78 @@ The output looks like this: ...@@ -445,9 +389,78 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855} {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
``` ```
### Write your own engine in Python
Note: This section replaces "bring-your-own-engine".
The [dynamo](https://pypi.org/project/ai-dynamo/) Python library allows you to build your own engine and attach it to Dynamo.
The Python file must do three things:
1. Decorate a function to get the runtime
2. Register on the network
3. Attach a request handler
```
import asyncio

import uvloop

from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker

# 1. Decorate a function to get the runtime
#
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    # 2. Register ourselves on the network
    #
    component = runtime.namespace("namespace").component("component")
    await component.create_service()
    model_path = "Qwen/Qwen2.5-0.5B-Instruct"  # or "/data/models/Qwen2.5-0.5B-Instruct"
    model_type = ModelType.Backend
    endpoint = component.endpoint("endpoint")
    await register_llm(endpoint, model_path, model_type)

    # Initialize your engine here
    # engine = ...

    # 3. Attach request handler
    #
    await endpoint.serve_endpoint(RequestHandler(engine).generate, None)

class RequestHandler:
    def __init__(self, engine):
        ...

    async def generate(self, request):
        # Call the engine
        # yield result dict
        ...

if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
```
The `model_path` can be:
- A HuggingFace repo ID. It will be downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that.
The `model_type` can be:
- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of ints. It must yield dicts that also contain a `token_ids` array and an optional `finish_reason` string; see the sketch after this list.
- ModelType.Chat. Your `generate` method receives a `request` and must yield response dicts of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
- ModelType.Completion. Your `generate` method receives a `request` and must yield response dicts of the older [Completions](https://platform.openai.com/docs/api-reference/completions) type. Your engine handles pre-processing.
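The sketch below is a minimal, hypothetical `ModelType.Backend` handler that streams the prompt tokens back in small chunks instead of calling a real engine. The class name and chunk size are illustrative only; a real handler would forward `request["token_ids"]` to your engine and yield the generated tokens.

```
import asyncio

class EchoBackendHandler:
    """Hypothetical ModelType.Backend handler, usable in place of RequestHandler above."""

    async def generate(self, request):
        # Dynamo has already done pre-processing: the prompt arrives as token IDs.
        prompt_tokens = request["token_ids"]

        # Stream the prompt back in chunks of four tokens to mimic incremental generation.
        for i in range(0, len(prompt_tokens), 4):
            yield {"token_ids": prompt_tokens[i : i + 4]}
            await asyncio.sleep(0.01)  # simulate per-chunk generation latency

        # Final chunk carries no new tokens, only the stop reason.
        yield {"token_ids": [], "finish_reason": "stop"}
```

You would pass `EchoBackendHandler().generate` to `endpoint.serve_endpoint` in the worker above instead of `RequestHandler(engine).generate`.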
Here are some example engines:
- [vllm simple](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_vllm.py)
- [sglang simple](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang.py)
- [vllm](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/vllm_inc.py)
- [sglang](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/sglang_inc.py)
### Defaults ### Defaults
The input defaults to `in=text`. The output will default to `mistralrs` engine. If not available whatever engine you have compiled in (so depending on `--features`). The input defaults to `in=text`. The output will default to `out=mistralrs` engine, unless it is disabled with `--no-default-features` in which case vllm is used.
### Extra engine arguments ### Extra engine arguments
...@@ -463,5 +476,5 @@ Put the arguments in a JSON file: ...@@ -463,5 +476,5 @@ Put the arguments in a JSON file:
Pass it like this: Pass it like this:
``` ```
dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json dynamo-run out=sglang ~/llms/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
``` ```
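As a concrete illustration of the JSON file's shape, the snippet below writes a hypothetical `sglang_extra.json`. The argument names (`context_length`, `mem_fraction_static`) are assumptions; verify them against sglang's server arguments before use.

```
import json

# Hypothetical extra arguments; check sglang's server arguments for the real names.
extra_args = {
    "context_length": 16384,
    "mem_fraction_static": 0.8,
}

with open("sglang_extra.json", "w") as f:
    json.dump(extra_args, f, indent=2)
```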
...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI" ...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI"
[features] [features]
# Build with `--no-default-features` to disable these defaults # Build with `--no-default-features` to disable these defaults
# We don't include llamacpp by default until we figure out when it needs external libraries default = ["mistralrs"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"] mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"] llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
python = ["dep:dynamo-engine-python"] python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"] cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true } ...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true } dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true } dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true } dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true } anyhow = { workspace = true }
...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] } ...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] }
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
futures-util = { version = "0.3" } futures-util = { version = "0.3" }
regex = "1" regex = "1"
[target.x86_64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.x86_64-unknown-linux-musl.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.aarch64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr;
use clap::ValueEnum; use clap::ValueEnum;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
...@@ -106,21 +105,6 @@ pub struct Flags { ...@@ -106,21 +105,6 @@ pub struct Flags {
#[arg(long, default_value = "round-robin")] #[arg(long, default_value = "round-robin")]
pub router_mode: RouterMode, pub router_mode: RouterMode,
/// Internal use only.
// Start the python vllm engine sub-process.
#[arg(long, hide = true, default_value = "false")]
pub internal_vllm_process: bool,
/// Internal use only.
/// Start the sglang Python sub-process.
/// The params in the tuple are:
/// - the fd of the write end of a pipe where sglang will signal that it's ready.
/// - the node rank (0 for first host, 1 for second host, etc)
/// - the workers' rank (globally unique)
/// - the GPU to use (locally unique)
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
pub internal_sglang_process: Option<SgLangFlags>,
/// Additional engine-specific arguments from a JSON file. /// Additional engine-specific arguments from a JSON file.
/// Contains a mapping of parameter names to values. /// Contains a mapping of parameter names to values.
#[arg(long)] #[arg(long)]
...@@ -200,30 +184,6 @@ impl Flags { ...@@ -200,30 +184,6 @@ impl Flags {
} }
} }
#[derive(Debug, Clone, Copy)]
pub struct SgLangFlags {
pub pipe_fd: u32,
pub tp_rank: u32,
pub gpu_id: u32,
}
fn parse_sglang_flags(s: &str) -> Result<SgLangFlags, String> {
let nums: Vec<u32> = s
.split(',')
.map(u32::from_str)
.collect::<Result<Vec<_>, _>>()
.map_err(|e| e.to_string())?;
if nums.len() != 3 {
return Err("Need exactly 3 numbers".into());
}
Ok(SgLangFlags {
pipe_fd: nums[0],
tp_rank: nums[1],
gpu_id: nums[2],
})
}
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)] #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)]
pub enum RouterMode { pub enum RouterMode {
#[default] #[default]
......
...@@ -183,7 +183,6 @@ pub async fn prepare_engine( ...@@ -183,7 +183,6 @@ pub async fn prepare_engine(
_cache_dir: None, _cache_dir: None,
}) })
} }
EngineConfig::None => unreachable!(),
} }
} }
......
...@@ -91,7 +91,6 @@ pub async fn run( ...@@ -91,7 +91,6 @@ pub async fn run(
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
} }
EngineConfig::None => unreachable!(),
}; };
tokio::select! { tokio::select! {
......
...@@ -97,7 +97,6 @@ pub async fn run( ...@@ -97,7 +97,6 @@ pub async fn run(
.await?; .await?;
manager.add_completions_model(model.service_name(), cmpl_pipeline)?; manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
} }
EngineConfig::None => unreachable!(),
} }
http_service.run(runtime.primary_token()).await?; http_service.run(runtime.primary_token()).await?;
runtime.shutdown(); // Cancel primary token runtime.shutdown(); // Cancel primary token
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use std::{io::Read, sync::Arc, time::Duration}; use std::{io::Read, sync::Arc, time::Duration};
use anyhow::Context; use anyhow::Context;
use dynamo_llm::{ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, LocalModel};
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
LocalModel,
};
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
mod input; mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt; mod opt;
pub use dynamo_llm::request_template::RequestTemplate; pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::{Input, Output}; pub use opt::{Input, Output};
...@@ -38,19 +20,12 @@ mod subprocess; ...@@ -38,19 +20,12 @@ mod subprocess;
/// the command line. Hence it's optional, and defaults to this. /// the command line. Hence it's optional, and defaults to this.
const INVISIBLE_MODEL_NAME: &str = "dynamo-run"; const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
/// The component name for the KV publisher, if used
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2); const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
/// How we identify a python string endpoint /// How we identify a python string endpoint
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_STR_SCHEME: &str = "pystr:"; const PYTHON_STR_SCHEME: &str = "pystr:";
/// How we identify a python token endpoint
#[cfg(feature = "python")]
const PYTHON_TOK_SCHEME: &str = "pytok:";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
...@@ -66,24 +41,13 @@ pub enum EngineConfig { ...@@ -66,24 +41,13 @@ pub enum EngineConfig {
engine: ExecutionContext, engine: ExecutionContext,
model: Box<LocalModel>, model: Box<LocalModel>,
}, },
/// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work.
None,
}
/// Distributed system values
struct DynInput {
endpoint_id: Endpoint,
distributed_runtime: DistributedRuntime,
} }
#[allow(unused_mut)]
pub async fn run( pub async fn run(
runtime: dynamo_runtime::Runtime, runtime: dynamo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it in_opt: Input,
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
#[allow(unused_variables)] zmq_socket_prefix: Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags let maybe_path = flags
...@@ -120,29 +84,6 @@ pub async fn run( ...@@ -120,29 +84,6 @@ pub async fn run(
} }
}; };
let dyn_input = match &in_opt {
Input::Endpoint(endpoint_path) => {
if maybe_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false)
&& flags.model_config.is_none()
{
// TODO We need to convert tokenizer extract from GGUF file into something we can
// publish to NATS. Ideally `tokenizer.json` directly, but otherwise an
// intermediate format.
tracing::error!("Serving GGUF files in a distributed system requires `--model-config <hf-repo-dir>` so that we can find the tokenzier config");
return Ok(());
}
// If we are in a distributed system, we need to know our component upfront
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
let endpoint_id: Endpoint = endpoint_path.parse()?;
Some(DynInput {
endpoint_id,
distributed_runtime,
})
}
_ => None,
};
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
let template = if let Some(path) = flags.request_template.as_ref() { let template = if let Some(path) = flags.request_template.as_ref() {
...@@ -183,13 +124,17 @@ pub async fn run( ...@@ -183,13 +124,17 @@ pub async fn run(
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?, engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
model: Box::new(local_model), model: Box::new(local_model),
}, },
Output::SgLang => { Output::SgLang => {
if !local_model.path().is_dir() { if !local_model.path().is_dir() {
// TODO Does sglang support GGUF? Can we make it work? // TODO Does sglang support GGUF? Can we make it work?
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
let (py_script, mut child) = match subprocess::start( let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let (py_script, child) = match subprocess::start(
subprocess::sglang::PY, subprocess::sglang::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -198,6 +143,11 @@ pub async fn run( ...@@ -198,6 +143,11 @@ pub async fn run(
} else { } else {
Some(flags.base_gpu_id) Some(flags.base_gpu_id)
}, },
if flags.num_nodes <= 1 {
None
} else {
Some(multi_node_conf)
},
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -216,151 +166,16 @@ pub async fn run( ...@@ -216,151 +166,16 @@ pub async fn run(
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?; let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
#[cfg(feature = "sglang")]
Output::SgLangLegacy => {
if !local_model.path().is_dir() {
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see 'gloo' errors from sglang try setting these environment variables:");
tracing::info!("export GLOO_SOCKET_IFNAME={if_name}");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Follower nodes take input from leader node over pytorch distributed, not
// from user.
in_opt = Input::None;
}
}
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.base_gpu_id,
flags.extra_engine_args.clone(),
)
.await?;
extra = Some(Box::pin(async move {
let _ = sglang_process.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_7 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see network errors from vllm try setting this environment variable:");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Only node 0 runs vllm, the others communicate over ray
in_opt = Input::None;
}
}
if node_conf.node_rank == 0 {
let kv_metrics_publisher = if let Some(dyn_input) = &dyn_input {
let kvp_component = dyn_input
.distributed_runtime
.namespace(dyn_input.endpoint_id.namespace.clone())?
.component(KV_PUBLISHER_COMPONENT)?;
let kvp = Arc::new(KvMetricsPublisher::new()?);
let kvp_inner = kvp.clone();
tokio::spawn(
async move { kvp_inner.create_endpoint(kvp_component, None).await },
);
Some(kvp)
} else {
None
};
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
kv_metrics_publisher,
)
.await?;
extra = Some(Box::pin(async move {
let _ = vllm_future.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future =
dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_8 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let engine = dynamo_engine_vllm0_8::make_engine(
cancel_token.clone(),
local_model.path(),
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
)
.await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
// No feature flag because it uses a sub-process, it's very cheap to include
Output::Vllm => { Output::Vllm => {
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let (py_script, mut child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::vllm::PY, subprocess::vllm::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
None, // multi-node config. vllm uses `ray`, see guide
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -405,18 +220,6 @@ pub async fn run( ...@@ -405,18 +220,6 @@ pub async fn run(
model: Box::new(local_model), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
let card = local_model.card();
let py_args = flags.as_vec(&path_str, &card.service_name);
let p = std::path::PathBuf::from(path_str);
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
}; };
match in_opt { match in_opt {
...@@ -443,16 +246,8 @@ pub async fn run( ...@@ -443,16 +246,8 @@ pub async fn run(
.await?; .await?;
} }
Input::Endpoint(path) => { Input::Endpoint(path) => {
let Some(dyn_input) = dyn_input else { let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
unreachable!("We set dyn_input earlier"); crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
};
crate::input::endpoint::run(dyn_input.distributed_runtime, path, engine_config).await?;
}
Input::None => {
// Multi-node setup. The engine sub-process has been started and is talking
// to it's node_rank 0 controller. We do nothing.
// TODO: Acquire an etcd lease, we are running
cancel_token.cancelled().await;
} }
} }
......
...@@ -24,15 +24,13 @@ const HELP: &str = r#" ...@@ -24,15 +24,13 @@ const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally. dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
Example: Example:
- cargo build --release --features mistralrs,cuda - cargo build --features cuda -p dynamo-run
- cd target/release - cd target/debug
- ./dynamo-run hf_checkouts/Llama-3.2-3B-Instruct/ - ./dynamo-run Qwen/Qwen2.5-3B-Instruct
- OR: ./dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf - OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#; "#;
const ZMQ_SOCKET_PREFIX: &str = "dyn"; const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=ENGINE_LIST [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> { ...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> {
logging::init(); logging::init();
// Call sub-processes before starting the Runtime machinery
// For anything except sub-process starting try_parse_from will error.
if let Ok(flags) = dynamo_run::Flags::try_parse_from(env::args()) {
#[allow(unused_variables)]
if let Some(sglang_flags) = flags.internal_sglang_process {
let Some(model_path) = flags.model_path_flag.as_ref() else {
anyhow::bail!("sglang subprocess requires --model-path");
};
if !model_path.is_dir() {
anyhow::bail!("sglang subprocess requires model path to be a directory containing the safetensors files");
}
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
};
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
node_config,
gpu_config,
flags.extra_engine_args,
);
}
} else {
panic!("Rebuild with --features=sglang");
}
}
#[allow(unused_variables)]
if flags.internal_vllm_process {
let Some(model_path) = flags.model_path_flag else {
anyhow::bail!("vllm subprocess requires --model-path flag");
};
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_vllm0_7::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
flags.tensor_parallel_size,
flags.extra_engine_args,
flags.router_mode.is_kv_routing(),
);
}
} else {
panic!("Rebuild with --features=vllm");
}
}
}
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?; let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
.chain(env::args().skip(non_flag_params)), .chain(env::args().skip(non_flag_params)),
)?; )?;
dynamo_run::run( dynamo_run::run(runtime, in_opt, out_opt, flags).await
runtime,
in_opt,
out_opt,
flags,
Some(ZMQ_SOCKET_PREFIX.to_string()),
)
.await
} }
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it. /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Mac build uses none of this
#![allow(dead_code)]
#[cfg(target_os = "linux")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
unix::get_primary_interface().await
}
#[cfg(target_os = "macos")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
Ok(None)
}
#[derive(Debug)]
pub struct LinkDataError {
kind: LinkDataErrorKind,
interface: Option<String>,
}
impl LinkDataError {
fn connection(connection_error: std::io::Error) -> Self {
let kind = LinkDataErrorKind::Connection(connection_error);
let interface = None;
Self { kind, interface }
}
#[cfg(target_os = "linux")]
fn communication(communication_error: rtnetlink::Error) -> Self {
let kind = LinkDataErrorKind::Communication(communication_error);
let interface = None;
Self { kind, interface }
}
}
impl std::fmt::Display for LinkDataError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let err_message = "could not get interface link data";
if let Some(interface) = self.interface.as_ref() {
write!(f, "{err_message} for {interface}")
} else {
write!(f, "{err_message}")
}
}
}
impl std::error::Error for LinkDataError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind {
LinkDataErrorKind::Connection(ref e) => Some(e),
#[cfg(target_os = "linux")]
LinkDataErrorKind::Communication(ref e) => Some(e),
}
}
}
#[derive(Debug)]
pub enum LinkDataErrorKind {
Connection(std::io::Error),
#[cfg(target_os = "linux")]
Communication(rtnetlink::Error),
}
#[cfg(target_os = "linux")]
mod unix {
use futures_util::TryStreamExt;
use netlink_packet_route::address::AddressAttribute;
use netlink_packet_route::link::LinkLayerType;
use netlink_packet_route::link::State as LinkState;
use netlink_packet_route::link::{LinkAttribute, LinkMessage};
use netlink_packet_route::AddressFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use std::collections::VecDeque;
pub async fn get_primary_interface() -> Result<Option<String>, super::LinkDataError> {
let mut candidates: VecDeque<String> = get_ipv4_interface_links()
.await?
.into_iter()
.filter(|(k, v)| {
v.is_ethernet() && v.link_is_up() && v.has_carrier() && k.starts_with("e")
})
.map(|(k, _)| k)
.collect();
Ok(candidates.pop_front())
}
#[derive(Clone, Debug)]
// Some of the fields are Option<T> because the netlink protocol allows them
// to be absent (even though we have no reason to believe they'd ever actually
// be missing).
struct InterfaceLinkData {
link_type: LinkLayerType,
state: Option<LinkState>,
has_carrier: bool,
}
impl InterfaceLinkData {
pub fn link_is_up(&self) -> bool {
self.state
.map(|state| matches!(state, LinkState::Up))
.unwrap_or(false)
}
pub fn is_ethernet(&self) -> bool {
matches!(self.link_type, LinkLayerType::Ether)
}
pub fn has_carrier(&self) -> bool {
self.has_carrier
}
}
impl From<LinkMessage> for InterfaceLinkData {
fn from(link_message: LinkMessage) -> Self {
let link_type = link_message.header.link_layer_type;
let state = link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::OperState(state) => Some(*state),
_ => None,
});
let has_carrier = link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::Carrier(1) => Some(true),
_ => None,
})
.unwrap_or(false);
InterfaceLinkData {
link_type,
state,
has_carrier,
}
}
}
// Retrieve the link data (state, MTU, etc.) for all interfaces, and return
// them as a HashMap keyed by interface name. This is roughly equivalent to `ip
// link show` since we're using the same netlink interface under the hood as
// that command.
async fn get_ipv4_interface_links(
) -> Result<HashMap<String, InterfaceLinkData>, super::LinkDataError> {
let (netlink_connection, rtnetlink_handle, _receiver) =
rtnetlink::new_connection().map_err(super::LinkDataError::connection)?;
// We have to spawn off the netlink connection because of the architecture
// of `netlink_proto::Connection`, which runs in the background and owns
// the socket. We communicate with it via channel messages, and it will exit
// when both `rtnetlink_handle` and `_receiver` go out of scope.
tokio::spawn(netlink_connection);
let address_handle = rtnetlink_handle.address().get().execute();
let ipv4s: HashSet<String> = address_handle
.try_filter_map(|addr_message| async move {
if matches!(addr_message.header.family, AddressFamily::Inet) {
Ok(addr_message
.attributes
.into_iter()
.find(|attr| matches!(attr, AddressAttribute::Label(_)))
.and_then(|x| match x {
AddressAttribute::Label(label) => Some(label),
_ => None,
}))
} else {
Ok(None)
}
})
.try_collect()
.await
.map_err(super::LinkDataError::communication)?;
let link_handle = rtnetlink_handle.link().get().execute();
link_handle
.try_filter_map(|link_message| async {
let maybe_interface_data = match extract_interface_name(&link_message) {
Some(interface_name) => {
if ipv4s.contains(&interface_name) {
Some((interface_name, InterfaceLinkData::from(link_message)))
} else {
None
}
}
None => {
let idx = link_message.header.index;
eprintln!(
"Network interface with index {idx} doesn't have a name (no IfName attribute)"
);
None
}
};
Ok(maybe_interface_data)
})
.try_collect()
.await
.map_err(super::LinkDataError::communication)
}
fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::IfName(name) => Some(name.clone()),
_ => None,
})
}
}
...@@ -35,11 +35,6 @@ pub enum Input { ...@@ -35,11 +35,6 @@ pub enum Input {
/// Batch mode. Run all the prompts, write the outputs, exit. /// Batch mode. Run all the prompts, write the outputs, exit.
Batch(PathBuf), Batch(PathBuf),
/// Start the engine but don't provide any way to talk to it.
/// For multi-node sglang, where the engine connects directly
/// to the co-ordinator via torch distributed / nccl.
None,
} }
impl TryFrom<&str> for Input { impl TryFrom<&str> for Input {
...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input { ...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input {
"http" => Ok(Input::Http), "http" => Ok(Input::Http),
"text" => Ok(Input::Text), "text" => Ok(Input::Text),
"stdin" => Ok(Input::Stdin), "stdin" => Ok(Input::Stdin),
"none" => Ok(Input::None),
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => { endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
Ok(Input::Endpoint(endpoint_path.to_string())) Ok(Input::Endpoint(endpoint_path.to_string()))
} }
...@@ -71,7 +65,6 @@ impl fmt::Display for Input { ...@@ -71,7 +65,6 @@ impl fmt::Display for Input {
Input::Stdin => "stdin", Input::Stdin => "stdin",
Input::Endpoint(path) => path, Input::Endpoint(path) => path,
Input::Batch(path) => &path.display().to_string(), Input::Batch(path) => &path.display().to_string(),
Input::None => "none",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -101,39 +94,21 @@ pub enum Output { ...@@ -101,39 +94,21 @@ pub enum Output {
/// Run inference on a model in a GGUF file using mistralrs w/ candle /// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs, MistralRs,
#[cfg(feature = "sglang")]
/// Deprecated
SgLangLegacy,
/// Run inference using sglang
SgLang,
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
/// Run inference using llama.cpp /// Run inference using llama.cpp
LlamaCpp, LlamaCpp,
/// Run inference using sglang
SgLang,
// Start vllm in a sub-process connecting via nats // Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>` // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm, Vllm,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.8.X+
Vllm0_8,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.7.X
Vllm0_7,
/// Run inference using a user supplied python file that accepts and returns /// Run inference using a user supplied python file that accepts and returns
/// strings. It does its own pre-processing. /// strings. It does its own pre-processing.
#[cfg(feature = "python")] #[cfg(feature = "python")]
PythonStr(String), PythonStr(String),
/// Run inference using a user supplied python file that accepts and returns
/// tokens. We do the pre-processing.
#[cfg(feature = "python")]
PythonTok(String),
//
// DEVELOPER NOTE // DEVELOPER NOTE
// If you add an engine, add it to `available_engines` below, and to Default if it makes sense // If you add an engine, add it to `available_engines` below, and to Default if it makes sense
} }
...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output { ...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
"mistralrs" => Ok(Output::MistralRs), "mistralrs" => Ok(Output::MistralRs),
#[cfg(feature = "sglang")]
"sglang_legacy" => Ok(Output::SgLangLegacy),
"sglang" => Ok(Output::SgLang),
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp), "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
"sglang" => Ok(Output::SgLang),
"vllm" => Ok(Output::Vllm), "vllm" => Ok(Output::Vllm),
#[cfg(feature = "vllm")]
"vllm0_8" => Ok(Output::Vllm0_8),
#[cfg(feature = "vllm")]
"vllm0_7" => Ok(Output::Vllm0_7),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output { ...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output {
Ok(Output::PythonStr(path.to_string())) Ok(Output::PythonStr(path.to_string()))
} }
#[cfg(feature = "python")]
python_tok_gen if python_tok_gen.starts_with(crate::PYTHON_TOK_SCHEME) => {
let path = python_tok_gen
.strip_prefix(crate::PYTHON_TOK_SCHEME)
.unwrap();
Ok(Output::PythonTok(path.to_string()))
}
e => Err(anyhow::anyhow!("Invalid out= option '{e}'")), e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
} }
} }
...@@ -196,21 +154,12 @@ impl fmt::Display for Output { ...@@ -196,21 +154,12 @@ impl fmt::Display for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => "mistralrs", Output::MistralRs => "mistralrs",
#[cfg(feature = "sglang")]
Output::SgLangLegacy => "sglang_legacy",
Output::SgLang => "sglang",
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => "llamacpp", Output::LlamaCpp => "llamacpp",
Output::SgLang => "sglang",
Output::Vllm => "vllm", Output::Vllm => "vllm",
#[cfg(feature = "vllm")]
Output::Vllm0_8 => "vllm0_8",
#[cfg(feature = "vllm")]
Output::Vllm0_7 => "vllm0_7",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
...@@ -218,9 +167,6 @@ impl fmt::Display for Output { ...@@ -218,9 +167,6 @@ impl fmt::Display for Output {
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(_) => "pystr", Output::PythonStr(_) => "pystr",
#[cfg(feature = "python")]
Output::PythonTok(_) => "pytok",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -258,22 +204,11 @@ impl Output { ...@@ -258,22 +204,11 @@ impl Output {
} }
out.push(Output::SgLang.to_string()); out.push(Output::SgLang.to_string());
#[cfg(feature = "sglang")]
{
out.push(Output::SgLangLegacy.to_string());
}
out.push(Output::Vllm.to_string()); out.push(Output::Vllm.to_string());
#[cfg(feature = "vllm")]
{
out.push(Output::Vllm0_7.to_string());
out.push(Output::Vllm0_8.to_string());
}
#[cfg(feature = "python")] #[cfg(feature = "python")]
{ {
out.push(Output::PythonStr("file.py".to_string()).to_string()); out.push(Output::PythonStr("file.py".to_string()).to_string());
out.push(Output::PythonTok("file.py".to_string()).to_string());
} }
out out
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow; use std::borrow::Cow;
use std::io::Write; use std::io::Write;
...@@ -23,6 +11,8 @@ use anyhow::Context; ...@@ -23,6 +11,8 @@ use anyhow::Context;
use regex::Regex; use regex::Regex;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use dynamo_llm::engines::MultiNodeConfig;
pub mod sglang; pub mod sglang;
pub mod vllm; pub mod vllm;
...@@ -39,6 +29,8 @@ pub async fn start( ...@@ -39,6 +29,8 @@ pub async fn start(
// sglang which GPU to start from, on a multi-GPU system // sglang which GPU to start from, on a multi-GPU system
// vllm uses CUDA_VISIBLE_DEVICES // vllm uses CUDA_VISIBLE_DEVICES
base_gpu_id: Option<u32>, base_gpu_id: Option<u32>,
// sglang multi-node config. vllm uses `ray` externally
multi_node_config: Option<MultiNodeConfig>,
// Path to a JSON file containing extra arguments to the backend engine // Path to a JSON file containing extra arguments to the backend engine
extra_engine_args: Option<&Path>, extra_engine_args: Option<&Path>,
) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> { ) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
...@@ -61,6 +53,15 @@ pub async fn start( ...@@ -61,6 +53,15 @@ pub async fn start(
args.push("--base-gpu-id".to_string()); args.push("--base-gpu-id".to_string());
args.push(base_gpu_id.to_string()); args.push(base_gpu_id.to_string());
} }
// sglang only
if let Some(multi_node_config) = multi_node_config {
args.push("--nnodes".to_string());
args.push(multi_node_config.num_nodes.to_string());
args.push("--node-rank".to_string());
args.push(multi_node_config.node_rank.to_string());
args.push("--dist-init-addr".to_string());
args.push(multi_node_config.leader_addr);
}
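// Illustrative note (values are hypothetical, not part of this change): a
// two-node run would extend the command line with
// `--nnodes 2 --node-rank 0 --dist-init-addr 192.168.0.2:25000`, which the
// Python sglang worker parses via argparse further down in this diff.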
if let Some(extra_engine_args) = extra_engine_args { if let Some(extra_engine_args) = extra_engine_args {
args.push("--extra-engine-args".to_string()); args.push("--extra-engine-args".to_string());
args.push(extra_engine_args.to_string_lossy().to_string()); args.push(extra_engine_args.to_string_lossy().to_string());
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# #
# A very basic example of sglang worker handling pre-processed requests. # A very basic example of sglang worker handling pre-processed requests.
...@@ -52,6 +39,9 @@ class Config: ...@@ -52,6 +39,9 @@ class Config:
model: str model: str
base_gpu_id: int base_gpu_id: int
tensor_parallel_size: int tensor_parallel_size: int
nnodes: int
node_rank: int
dist_init_addr: str
extra_engine_args: str extra_engine_args: str
...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config):
"tp_size": config.tensor_parallel_size, "tp_size": config.tensor_parallel_size,
"base_gpu_id": config.base_gpu_id, "base_gpu_id": config.base_gpu_id,
} }
if config.dist_init_addr != "":
arg_map["trust_remote_code"] = True
arg_map["nnodes"] = config.nnodes
arg_map["dist_init_addr"] = config.dist_init_addr
# In practice this is always 0 because Dynamo only manages the leader
arg_map["node_rank"] = config.node_rank
if config.extra_engine_args != "": if config.extra_engine_args != "":
json_map = {} json_map = {}
# extra_engine_args is a filename # extra_engine_args is a filename
...@@ -157,6 +154,21 @@ def cmd_line_args(): ...@@ -157,6 +154,21 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--nnodes", type=int, default=1, help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
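# Note: with the defaults above (nnodes=1, node_rank=0, dist_init_addr=""),
# the multi-node branch in init() is skipped and the worker behaves as a
# single-node sglang engine.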
parser.add_argument( parser.add_argument(
"--extra-engine-args", "--extra-engine-args",
type=str, type=str,
...@@ -183,6 +195,9 @@ def cmd_line_args(): ...@@ -183,6 +195,9 @@ def cmd_line_args():
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.base_gpu_id = args.base_gpu_id config.base_gpu_id = args.base_gpu_id
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.nnodes = args.nnodes
config.node_rank = args.node_rank
config.dist_init_addr = args.dist_init_addr
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
return config return config
......
...@@ -66,19 +66,20 @@ class RequestHandler: ...@@ -66,19 +66,20 @@ class RequestHandler:
Request handler for the generate endpoint Request handler for the generate endpoint
""" """
def __init__(self, engine): def __init__(self, engine, default_sampling_params):
self.engine_client = engine self.engine_client = engine
self.default_sampling_params = default_sampling_params
async def generate(self, request): async def generate(self, request):
request_id = "1" # hello_world example only request_id = "1" # hello_world example only
logging.debug(f"Received request: {request}") logging.debug(f"Received request: {request}")
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams(
temperature=request["sampling_options"]["temperature"], sampling_params = SamplingParams(**self.default_sampling_params)
# vllm defaults this to 16 sampling_params.temperature = request["sampling_options"]["temperature"]
max_tokens=request["stop_conditions"]["max_tokens"], sampling_params.max_tokens = request["stop_conditions"]["max_tokens"]
)
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
gen = self.engine_client.generate(prompt, sampling_params, request_id) gen = self.engine_client.generate(prompt, sampling_params, request_id)
async for res in gen: async for res in gen:
...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config):
arg_map = {**arg_map, **json_map} # json_map gets precedence arg_map = {**arg_map, **json_map} # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map) engine_args = AsyncEngineArgs(**arg_map)
model_config = engine_args.create_model_config()
# Load default sampling params from `generation_config.json`
default_sampling_params = model_config.get_diff_sampling_param()
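# get_diff_sampling_param() returns only the values that generation_config.json
# overrides (it may be an empty dict), so SamplingParams(**default_sampling_params)
# in RequestHandler.generate falls back to vLLM's defaults for anything not set there.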
engine_context = build_async_engine_client_from_engine_args(engine_args) engine_context = build_async_engine_client_from_engine_args(engine_args)
engine_client = await engine_context.__aenter__() engine_client = await engine_context.__aenter__()
// the server will gracefully shut down (i.e., let opened TCP streams finish) // the server will gracefully shut down (i.e., let opened TCP streams finish)
# after the lease is revoked # after the lease is revoked
await endpoint.serve_endpoint(RequestHandler(engine_client).generate, None) await endpoint.serve_endpoint(
RequestHandler(engine_client, default_sampling_params).generate, None
)
def cmd_line_args(): def cmd_line_args():
......
...@@ -36,7 +36,6 @@ use tokio::sync::mpsc; ...@@ -36,7 +36,6 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender; use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt}; use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::{EngineDispatcher, StreamingEngine}; use dynamo_llm::engines::{EngineDispatcher, StreamingEngine};
/// Python snippet to import a file as a module /// Python snippet to import a file as a module
...@@ -89,26 +88,6 @@ pub async fn make_string_engine( ...@@ -89,26 +88,6 @@ pub async fn make_string_engine(
Ok(engine) Ok(engine)
} }
/// An engine that takes and returns tokens.
pub async fn make_token_engine(
cancel_token: CancellationToken,
py_file: &Path,
py_args: Vec<String>,
) -> pipeline_error::Result<ExecutionContext> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
}
let engine = new_engine(cancel_token, py_file, py_args).await?;
let engine: ExecutionContext = Arc::new(engine);
Ok(engine)
}
#[derive(Clone)] #[derive(Clone)]
pub struct PythonServerStreamingEngine { pub struct PythonServerStreamingEngine {
_cancel_token: CancellationToken, _cancel_token: CancellationToken,
...@@ -128,17 +107,6 @@ async fn new_engine( ...@@ -128,17 +107,6 @@ async fn new_engine(
let user_module = let user_module =
python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?; python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?;
let generator = Python::with_gil(|py| { let generator = Python::with_gil(|py| {
/* Leave commented, `initialize` may be needed to match Triton
if let Ok(initialize) = user_module.getattr(py, "initialize") {
initialize
.call1(py, (py_args,))
.inspect_err(|err| {
println!();
err.display(py);
})
.with_context(|| "Failed calling python engine's initialize(args)")?;
};
*/
user_module user_module
.getattr(py, "generate") .getattr(py, "generate")
.with_context(|| "generate") .with_context(|| "generate")
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
pub struct SgLangEngine {
cancel_token: CancellationToken,
worker: super::worker::SgLangWorker,
}
impl SgLangEngine {
pub async fn new(
cancel_token: CancellationToken,
sock_code: &str,
model_path: &Path,
node_conf: MultiNodeConfig,
tensor_parallel_size: u32,
base_gpu_id: u32,
extra_engine_args: Option<PathBuf>,
) -> anyhow::Result<Self> {
let w = super::worker::start(
cancel_token.clone(),
sock_code,
model_path,
node_conf,
tensor_parallel_size,
base_gpu_id,
extra_engine_args,
)
.await?;
let engine = SgLangEngine {
cancel_token,
worker: w,
};
Ok(engine)
}
pub fn take_sglang_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
self.worker.take_sglang_handle()
}
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for SgLangEngine
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
let (request, context) = request.into_parts();
let ctx = context.context();
let request_id = ctx.id().to_string();
let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);
let work_req = super::worker::WorkRequest {
request_id: context.id().to_string(),
request,
response_channel: resp_tx,
};
self.worker.enqueue_request(work_req).await?;
let cancel_token = self.cancel_token.clone();
let output = stream! {
loop {
tokio::select! {
_ = cancel_token.cancelled() => {
break;
}
maybe_resp_rx = resp_rx.recv() => {
match maybe_resp_rx {
Some(out) => {
yield out;
},
None => {
tracing::trace!(request_id, "generate: response channel closed");
break;
}
}
}
}
}
};
Ok(ResponseStream::new(Box::pin(output), ctx))
}
}