Unverified Commit 42969800 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated docs on doing vllm and sglang multi-gpu (tensor parallel) and multi-node (pipeline parallel).
parent 5d89a0c8
...@@ -1533,68 +1533,6 @@ dependencies = [ ...@@ -1533,68 +1533,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "dynamo-engine-sglang"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_7"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_8"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"regex",
"serde",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.2.0" version = "0.2.0"
...@@ -1671,18 +1609,13 @@ dependencies = [ ...@@ -1671,18 +1609,13 @@ dependencies = [
"dynamo-engine-llamacpp", "dynamo-engine-llamacpp",
"dynamo-engine-mistralrs", "dynamo-engine-mistralrs",
"dynamo-engine-python", "dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-vllm0_7",
"dynamo-engine-vllm0_8",
"dynamo-llm", "dynamo-llm",
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
"futures-util", "futures-util",
"humantime", "humantime",
"libc", "libc",
"netlink-packet-route",
"regex", "regex",
"rtnetlink",
"serde", "serde",
"serde_json", "serde_json",
"tempfile", "tempfile",
...@@ -1718,7 +1651,7 @@ dependencies = [ ...@@ -1718,7 +1651,7 @@ dependencies = [
"local-ip-address", "local-ip-address",
"log", "log",
"nid", "nid",
"nix 0.29.0", "nix",
"nuid", "nuid",
"once_cell", "once_cell",
"prometheus", "prometheus",
...@@ -3165,12 +3098,6 @@ version = "1.70.1" ...@@ -3165,12 +3098,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.5" version = "0.10.5"
...@@ -3936,70 +3863,6 @@ dependencies = [ ...@@ -3936,70 +3863,6 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "netlink-packet-core"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4"
dependencies = [
"anyhow",
"byteorder",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-route"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74c171cd77b4ee8c7708da746ce392440cb7bcf618d122ec9ecc607b12938bf4"
dependencies = [
"anyhow",
"byteorder",
"libc",
"log",
"netlink-packet-core",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-utils"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34"
dependencies = [
"anyhow",
"byteorder",
"paste",
"thiserror 1.0.69",
]
[[package]]
name = "netlink-proto"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60"
dependencies = [
"bytes",
"futures",
"log",
"netlink-packet-core",
"netlink-sys",
"thiserror 2.0.12",
]
[[package]]
name = "netlink-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23"
dependencies = [
"bytes",
"futures",
"libc",
"log",
"tokio",
]
[[package]] [[package]]
name = "nibble_vec" name = "nibble_vec"
version = "0.1.0" version = "0.1.0"
...@@ -4020,17 +3883,6 @@ dependencies = [ ...@@ -4020,17 +3883,6 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
] ]
[[package]]
name = "nix"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
"bitflags 2.9.0",
"cfg-if 1.0.0",
"libc",
]
[[package]] [[package]]
name = "nix" name = "nix"
version = "0.29.0" version = "0.29.0"
...@@ -5357,24 +5209,6 @@ dependencies = [ ...@@ -5357,24 +5209,6 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "rtnetlink"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b684475344d8df1859ddb2d395dd3dac4f8f3422a1aa0725993cb375fc5caba5"
dependencies = [
"futures",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-packet-utils",
"netlink-proto",
"netlink-sys",
"nix 0.27.1",
"thiserror 1.0.69",
"tokio",
]
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
version = "0.1.24" version = "0.1.24"
...@@ -5697,19 +5531,6 @@ dependencies = [ ...@@ -5697,19 +5531,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.219" version = "1.0.219"
......
...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS ...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS
ENV CARGO_TARGET_DIR=/workspace/target ENV CARGO_TARGET_DIR=/workspace/target
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python && \ RUN cargo build --release --locked --features mistralrs,python && \
cargo doc --no-deps && \ cargo doc --no-deps && \
cp target/release/dynamo-run /usr/local/bin && \ cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \ cp target/release/http /usr/local/bin && \
......
...@@ -173,7 +173,7 @@ COPY launch /workspace/launch ...@@ -173,7 +173,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -59,7 +59,9 @@ RUN apt-get update -y && \ ...@@ -59,7 +59,9 @@ RUN apt-get update -y && \
ninja-build \ ninja-build \
pybind11-dev \ pybind11-dev \
# Rust build dependencies # Rust build dependencies
clang \
libclang-dev \ libclang-dev \
git \
# Install utilities # Install utilities
nvtop \ nvtop \
tmux \ tmux \
...@@ -305,7 +307,7 @@ COPY launch /workspace/launch ...@@ -305,7 +307,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
This diff is collapsed.
...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI" ...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI"
[features] [features]
# Build with `--no-default-features` to disable these defaults # Build with `--no-default-features` to disable these defaults
# We don't include llamacpp by default until we figure out when it needs external libraries default = ["mistralrs"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"] mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"] llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
python = ["dep:dynamo-engine-python"] python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"] cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true } ...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true } dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true } dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true } dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true } anyhow = { workspace = true }
...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] } ...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] }
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
futures-util = { version = "0.3" } futures-util = { version = "0.3" }
regex = "1" regex = "1"
[target.x86_64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.x86_64-unknown-linux-musl.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.aarch64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr;
use clap::ValueEnum; use clap::ValueEnum;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
...@@ -106,21 +105,6 @@ pub struct Flags { ...@@ -106,21 +105,6 @@ pub struct Flags {
#[arg(long, default_value = "round-robin")] #[arg(long, default_value = "round-robin")]
pub router_mode: RouterMode, pub router_mode: RouterMode,
/// Internal use only.
// Start the python vllm engine sub-process.
#[arg(long, hide = true, default_value = "false")]
pub internal_vllm_process: bool,
/// Internal use only.
/// Start the sglang Python sub-process.
/// The params in the tuple are:
/// - the fd of the write end of a pipe where sglang will signal that it's ready.
/// - the node rank (0 for first host, 1 for second host, etc)
/// - the workers' rank (globally unique)
/// - the GPU to use (locally unique)
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
pub internal_sglang_process: Option<SgLangFlags>,
/// Additional engine-specific arguments from a JSON file. /// Additional engine-specific arguments from a JSON file.
/// Contains a mapping of parameter names to values. /// Contains a mapping of parameter names to values.
#[arg(long)] #[arg(long)]
...@@ -200,30 +184,6 @@ impl Flags { ...@@ -200,30 +184,6 @@ impl Flags {
} }
} }
#[derive(Debug, Clone, Copy)]
pub struct SgLangFlags {
pub pipe_fd: u32,
pub tp_rank: u32,
pub gpu_id: u32,
}
fn parse_sglang_flags(s: &str) -> Result<SgLangFlags, String> {
let nums: Vec<u32> = s
.split(',')
.map(u32::from_str)
.collect::<Result<Vec<_>, _>>()
.map_err(|e| e.to_string())?;
if nums.len() != 3 {
return Err("Need exactly 3 numbers".into());
}
Ok(SgLangFlags {
pipe_fd: nums[0],
tp_rank: nums[1],
gpu_id: nums[2],
})
}
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)] #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)]
pub enum RouterMode { pub enum RouterMode {
#[default] #[default]
......
...@@ -183,7 +183,6 @@ pub async fn prepare_engine( ...@@ -183,7 +183,6 @@ pub async fn prepare_engine(
_cache_dir: None, _cache_dir: None,
}) })
} }
EngineConfig::None => unreachable!(),
} }
} }
......
...@@ -91,7 +91,6 @@ pub async fn run( ...@@ -91,7 +91,6 @@ pub async fn run(
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
} }
EngineConfig::None => unreachable!(),
}; };
tokio::select! { tokio::select! {
......
...@@ -97,7 +97,6 @@ pub async fn run( ...@@ -97,7 +97,6 @@ pub async fn run(
.await?; .await?;
manager.add_completions_model(model.service_name(), cmpl_pipeline)?; manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
} }
EngineConfig::None => unreachable!(),
} }
http_service.run(runtime.primary_token()).await?; http_service.run(runtime.primary_token()).await?;
runtime.shutdown(); // Cancel primary token runtime.shutdown(); // Cancel primary token
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use std::{io::Read, sync::Arc, time::Duration}; use std::{io::Read, sync::Arc, time::Duration};
use anyhow::Context; use anyhow::Context;
use dynamo_llm::{ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, LocalModel};
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
LocalModel,
};
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
mod input; mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt; mod opt;
pub use dynamo_llm::request_template::RequestTemplate; pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::{Input, Output}; pub use opt::{Input, Output};
...@@ -38,19 +20,12 @@ mod subprocess; ...@@ -38,19 +20,12 @@ mod subprocess;
/// the command line. Hence it's optional, and defaults to this. /// the command line. Hence it's optional, and defaults to this.
const INVISIBLE_MODEL_NAME: &str = "dynamo-run"; const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
/// The component name for the KV publisher, if used
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2); const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
/// How we identify a python string endpoint /// How we identify a python string endpoint
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_STR_SCHEME: &str = "pystr:"; const PYTHON_STR_SCHEME: &str = "pystr:";
/// How we identify a python token endpoint
#[cfg(feature = "python")]
const PYTHON_TOK_SCHEME: &str = "pytok:";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
...@@ -66,24 +41,13 @@ pub enum EngineConfig { ...@@ -66,24 +41,13 @@ pub enum EngineConfig {
engine: ExecutionContext, engine: ExecutionContext,
model: Box<LocalModel>, model: Box<LocalModel>,
}, },
/// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work.
None,
} }
/// Distributed system values
struct DynInput {
endpoint_id: Endpoint,
distributed_runtime: DistributedRuntime,
}
#[allow(unused_mut)]
pub async fn run( pub async fn run(
runtime: dynamo_runtime::Runtime, runtime: dynamo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it in_opt: Input,
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
#[allow(unused_variables)] zmq_socket_prefix: Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags let maybe_path = flags
...@@ -120,29 +84,6 @@ pub async fn run( ...@@ -120,29 +84,6 @@ pub async fn run(
} }
}; };
let dyn_input = match &in_opt {
Input::Endpoint(endpoint_path) => {
if maybe_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false)
&& flags.model_config.is_none()
{
// TODO We need to convert tokenizer extract from GGUF file into something we can
// publish to NATS. Ideally `tokenizer.json` directly, but otherwise an
// intermediate format.
tracing::error!("Serving GGUF files in a distributed system requires `--model-config <hf-repo-dir>` so that we can find the tokenzier config");
return Ok(());
}
// If we are in a distributed system, we need to know our component upfront
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
let endpoint_id: Endpoint = endpoint_path.parse()?;
Some(DynInput {
endpoint_id,
distributed_runtime,
})
}
_ => None,
};
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
let template = if let Some(path) = flags.request_template.as_ref() { let template = if let Some(path) = flags.request_template.as_ref() {
...@@ -183,13 +124,17 @@ pub async fn run( ...@@ -183,13 +124,17 @@ pub async fn run(
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?, engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
model: Box::new(local_model), model: Box::new(local_model),
}, },
Output::SgLang => { Output::SgLang => {
if !local_model.path().is_dir() { if !local_model.path().is_dir() {
// TODO Does sglang support GGUF? Can we make it work? // TODO Does sglang support GGUF? Can we make it work?
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
let (py_script, mut child) = match subprocess::start( let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let (py_script, child) = match subprocess::start(
subprocess::sglang::PY, subprocess::sglang::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -198,6 +143,11 @@ pub async fn run( ...@@ -198,6 +143,11 @@ pub async fn run(
} else { } else {
Some(flags.base_gpu_id) Some(flags.base_gpu_id)
}, },
if flags.num_nodes <= 1 {
None
} else {
Some(multi_node_conf)
},
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -216,151 +166,16 @@ pub async fn run( ...@@ -216,151 +166,16 @@ pub async fn run(
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?; let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
#[cfg(feature = "sglang")]
Output::SgLangLegacy => {
if !local_model.path().is_dir() {
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see 'gloo' errors from sglang try setting these environment variables:");
tracing::info!("export GLOO_SOCKET_IFNAME={if_name}");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Follower nodes take input from leader node over pytorch distributed, not
// from user.
in_opt = Input::None;
}
}
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.base_gpu_id,
flags.extra_engine_args.clone(),
)
.await?;
extra = Some(Box::pin(async move {
let _ = sglang_process.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_7 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see network errors from vllm try setting this environment variable:");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Only node 0 runs vllm, the others communicate over ray
in_opt = Input::None;
}
}
if node_conf.node_rank == 0 {
let kv_metrics_publisher = if let Some(dyn_input) = &dyn_input {
let kvp_component = dyn_input
.distributed_runtime
.namespace(dyn_input.endpoint_id.namespace.clone())?
.component(KV_PUBLISHER_COMPONENT)?;
let kvp = Arc::new(KvMetricsPublisher::new()?);
let kvp_inner = kvp.clone();
tokio::spawn(
async move { kvp_inner.create_endpoint(kvp_component, None).await },
);
Some(kvp)
} else {
None
};
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
kv_metrics_publisher,
)
.await?;
extra = Some(Box::pin(async move {
let _ = vllm_future.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future =
dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_8 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let engine = dynamo_engine_vllm0_8::make_engine(
cancel_token.clone(),
local_model.path(),
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
)
.await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
// No feature flag because it uses a sub-process, it's very cheap to include
Output::Vllm => { Output::Vllm => {
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let (py_script, mut child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::vllm::PY, subprocess::vllm::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
None, // multi-node config. vllm uses `ray`, see guide
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -405,18 +220,6 @@ pub async fn run( ...@@ -405,18 +220,6 @@ pub async fn run(
model: Box::new(local_model), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
let card = local_model.card();
let py_args = flags.as_vec(&path_str, &card.service_name);
let p = std::path::PathBuf::from(path_str);
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
}; };
match in_opt { match in_opt {
...@@ -443,16 +246,8 @@ pub async fn run( ...@@ -443,16 +246,8 @@ pub async fn run(
.await?; .await?;
} }
Input::Endpoint(path) => { Input::Endpoint(path) => {
let Some(dyn_input) = dyn_input else { let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
unreachable!("We set dyn_input earlier"); crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
};
crate::input::endpoint::run(dyn_input.distributed_runtime, path, engine_config).await?;
}
Input::None => {
// Multi-node setup. The engine sub-process has been started and is talking
// to it's node_rank 0 controller. We do nothing.
// TODO: Acquire an etcd lease, we are running
cancel_token.cancelled().await;
} }
} }
......
...@@ -24,15 +24,13 @@ const HELP: &str = r#" ...@@ -24,15 +24,13 @@ const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally. dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
Example: Example:
- cargo build --release --features mistralrs,cuda - cargo build --features cuda -p dynamo-run
- cd target/release - cd target/debug
- ./dynamo-run hf_checkouts/Llama-3.2-3B-Instruct/ - ./dynamo-run Qwen/Qwen2.5-3B-Instruct
- OR: ./dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf - OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#; "#;
const ZMQ_SOCKET_PREFIX: &str = "dyn"; const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=ENGINE_LIST [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> { ...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> {
logging::init(); logging::init();
// Call sub-processes before starting the Runtime machinery
// For anything except sub-process starting try_parse_from will error.
if let Ok(flags) = dynamo_run::Flags::try_parse_from(env::args()) {
#[allow(unused_variables)]
if let Some(sglang_flags) = flags.internal_sglang_process {
let Some(model_path) = flags.model_path_flag.as_ref() else {
anyhow::bail!("sglang subprocess requires --model-path");
};
if !model_path.is_dir() {
anyhow::bail!("sglang subprocess requires model path to be a directory containing the safetensors files");
}
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
};
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
node_config,
gpu_config,
flags.extra_engine_args,
);
}
} else {
panic!("Rebuild with --features=sglang");
}
}
#[allow(unused_variables)]
if flags.internal_vllm_process {
let Some(model_path) = flags.model_path_flag else {
anyhow::bail!("vllm subprocess requires --model-path flag");
};
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_vllm0_7::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
flags.tensor_parallel_size,
flags.extra_engine_args,
flags.router_mode.is_kv_routing(),
);
}
} else {
panic!("Rebuild with --features=vllm");
}
}
}
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?; let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
.chain(env::args().skip(non_flag_params)), .chain(env::args().skip(non_flag_params)),
)?; )?;
dynamo_run::run( dynamo_run::run(runtime, in_opt, out_opt, flags).await
runtime,
in_opt,
out_opt,
flags,
Some(ZMQ_SOCKET_PREFIX.to_string()),
)
.await
} }
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it. /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Mac build uses none of this
#![allow(dead_code)]
/// Best-effort detection of the host's primary network interface name.
///
/// On Linux this queries the kernel over netlink (see `unix` module below).
/// Returns `Ok(None)` when no suitable interface can be identified.
#[cfg(target_os = "linux")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    unix::get_primary_interface().await
}
/// macOS stub: interface discovery here is netlink-based and Linux-only,
/// so on macOS we simply report "no interface found" (see "Mac build uses
/// none of this" note above).
#[cfg(target_os = "macos")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    Ok(None)
}
/// Error produced while fetching interface link data from the kernel.
#[derive(Debug)]
pub struct LinkDataError {
    // Which phase failed: socket setup vs. the netlink exchange itself.
    kind: LinkDataErrorKind,
    // Interface the failure relates to, when known; surfaced by `Display`.
    // NOTE(review): nothing in this file ever sets it to `Some` — confirm
    // whether callers elsewhere populate it.
    interface: Option<String>,
}
impl LinkDataError {
    /// Wrap an I/O error from establishing the netlink socket.
    /// The failure is not tied to any particular interface.
    fn connection(connection_error: std::io::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Connection(connection_error),
            interface: None,
        }
    }

    /// Wrap an rtnetlink protocol error from an established connection.
    /// The failure is not tied to any particular interface.
    #[cfg(target_os = "linux")]
    fn communication(communication_error: rtnetlink::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Communication(communication_error),
            interface: None,
        }
    }
}
impl std::fmt::Display for LinkDataError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Mention the interface only when we know which one the failure
        // concerns; otherwise emit the bare message.
        match self.interface.as_deref() {
            Some(interface) => {
                write!(f, "could not get interface link data for {interface}")
            }
            None => write!(f, "could not get interface link data"),
        }
    }
}
impl std::error::Error for LinkDataError {
    /// Expose the wrapped OS / netlink error as the cause chain.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match &self.kind {
            LinkDataErrorKind::Connection(e) => Some(e),
            #[cfg(target_os = "linux")]
            LinkDataErrorKind::Communication(e) => Some(e),
        }
    }
}
/// The two failure classes when talking to the kernel over netlink.
#[derive(Debug)]
pub enum LinkDataErrorKind {
    /// Failed to open/establish the netlink socket.
    Connection(std::io::Error),
    /// Failed during a netlink request/response exchange (Linux only,
    /// since rtnetlink is a Linux interface).
    #[cfg(target_os = "linux")]
    Communication(rtnetlink::Error),
}
//! Linux implementation: query interface state via rtnetlink (the same
//! kernel interface `ip link` / `ip addr` use).
#[cfg(target_os = "linux")]
mod unix {
    use futures_util::TryStreamExt;
    use netlink_packet_route::address::AddressAttribute;
    use netlink_packet_route::link::LinkLayerType;
    use netlink_packet_route::link::State as LinkState;
    use netlink_packet_route::link::{LinkAttribute, LinkMessage};
    use netlink_packet_route::AddressFamily;
    use std::collections::HashMap;
    use std::collections::HashSet;
    use std::collections::VecDeque;

    /// Pick the "primary" interface: the first IPv4-addressed interface that
    /// is Ethernet, administratively up, has a physical carrier, and whose
    /// name starts with "e" (the systemd predictable-naming prefix for
    /// Ethernet devices, e.g. `eth0`, `enp3s0`).
    ///
    /// Returns `Ok(None)` when nothing matches. Note: iteration order comes
    /// from a `HashMap`, so if several interfaces qualify the choice among
    /// them is arbitrary.
    pub async fn get_primary_interface() -> Result<Option<String>, super::LinkDataError> {
        let mut candidates: VecDeque<String> = get_ipv4_interface_links()
            .await?
            .into_iter()
            .filter(|(k, v)| {
                v.is_ethernet() && v.link_is_up() && v.has_carrier() && k.starts_with("e")
            })
            .map(|(k, _)| k)
            .collect();
        Ok(candidates.pop_front())
    }

    #[derive(Clone, Debug)]
    // `state` is Option<T> because the netlink protocol allows the attribute
    // to be absent (even though we have no reason to believe it'd ever
    // actually be missing).
    struct InterfaceLinkData {
        // Layer-2 type of the link (Ethernet, loopback, ...).
        link_type: LinkLayerType,
        // Operational state as reported by the kernel (IF_OPER_*).
        state: Option<LinkState>,
        // Whether the kernel reports a physical carrier (cable plugged in).
        has_carrier: bool,
    }

    impl InterfaceLinkData {
        /// True only when the kernel reports the link operationally Up;
        /// a missing state attribute counts as "not up".
        pub fn link_is_up(&self) -> bool {
            self.state
                .map(|state| matches!(state, LinkState::Up))
                .unwrap_or(false)
        }

        /// True for Ethernet-type links.
        pub fn is_ethernet(&self) -> bool {
            matches!(self.link_type, LinkLayerType::Ether)
        }

        /// True when a physical carrier was reported.
        pub fn has_carrier(&self) -> bool {
            self.has_carrier
        }
    }

    impl From<LinkMessage> for InterfaceLinkData {
        /// Extract the fields we care about from a raw netlink link message.
        fn from(link_message: LinkMessage) -> Self {
            let link_type = link_message.header.link_layer_type;
            // First OperState attribute, if any.
            let state = link_message
                .attributes
                .iter()
                .find_map(|attribute| match attribute {
                    LinkAttribute::OperState(state) => Some(*state),
                    _ => None,
                });
            // Carrier(1) means "carrier present"; absent or any other value
            // is treated as no carrier.
            let has_carrier = link_message
                .attributes
                .iter()
                .find_map(|attribute| match attribute {
                    LinkAttribute::Carrier(1) => Some(true),
                    _ => None,
                })
                .unwrap_or(false);
            InterfaceLinkData {
                link_type,
                state,
                has_carrier,
            }
        }
    }

    // Retrieve the link data (state, MTU, etc.) for all interfaces, and return
    // them as a HashMap keyed by interface name. This is roughly equivalent to `ip
    // link show` since we're using the same netlink interface under the hood as
    // that command. Only interfaces that also carry an IPv4 address (matched by
    // the address message's Label attribute) are included.
    async fn get_ipv4_interface_links(
    ) -> Result<HashMap<String, InterfaceLinkData>, super::LinkDataError> {
        let (netlink_connection, rtnetlink_handle, _receiver) =
            rtnetlink::new_connection().map_err(super::LinkDataError::connection)?;
        // We have to spawn off the netlink connection because of the architecture
        // of `netlink_proto::Connection`, which runs in the background and owns
        // the socket. We communicate with it via channel messages, and it will exit
        // when both `rtnetlink_handle` and `_receiver` go out of scope.
        tokio::spawn(netlink_connection);
        // Pass 1: collect the names (labels) of interfaces with an IPv4 address
        // (roughly `ip -4 addr show`).
        let address_handle = rtnetlink_handle.address().get().execute();
        let ipv4s: HashSet<String> = address_handle
            .try_filter_map(|addr_message| async move {
                if matches!(addr_message.header.family, AddressFamily::Inet) {
                    Ok(addr_message
                        .attributes
                        .into_iter()
                        .find(|attr| matches!(attr, AddressAttribute::Label(_)))
                        .and_then(|x| match x {
                            AddressAttribute::Label(label) => Some(label),
                            _ => None,
                        }))
                } else {
                    Ok(None)
                }
            })
            .try_collect()
            .await
            .map_err(super::LinkDataError::communication)?;
        // Pass 2: fetch link data for every interface, keeping only those whose
        // name appeared in the IPv4 set above.
        let link_handle = rtnetlink_handle.link().get().execute();
        link_handle
            .try_filter_map(|link_message| async {
                let maybe_interface_data = match extract_interface_name(&link_message) {
                    Some(interface_name) => {
                        if ipv4s.contains(&interface_name) {
                            Some((interface_name, InterfaceLinkData::from(link_message)))
                        } else {
                            None
                        }
                    }
                    None => {
                        // Unnamed interfaces shouldn't happen; log and skip.
                        let idx = link_message.header.index;
                        eprintln!(
                            "Network interface with index {idx} doesn't have a name (no IfName attribute)"
                        );
                        None
                    }
                };
                Ok(maybe_interface_data)
            })
            .try_collect()
            .await
            .map_err(super::LinkDataError::communication)
    }

    /// Pull the interface name (IfName attribute) out of a link message, if present.
    fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
        link_message
            .attributes
            .iter()
            .find_map(|attribute| match attribute {
                LinkAttribute::IfName(name) => Some(name.clone()),
                _ => None,
            })
    }
}
...@@ -35,11 +35,6 @@ pub enum Input { ...@@ -35,11 +35,6 @@ pub enum Input {
/// Batch mode. Run all the prompts, write the outputs, exit. /// Batch mode. Run all the prompts, write the outputs, exit.
Batch(PathBuf), Batch(PathBuf),
/// Start the engine but don't provide any way to talk to it.
/// For multi-node sglang, where the engine connects directly
/// to the co-ordinator via torch distributed / nccl.
None,
} }
impl TryFrom<&str> for Input { impl TryFrom<&str> for Input {
...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input { ...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input {
"http" => Ok(Input::Http), "http" => Ok(Input::Http),
"text" => Ok(Input::Text), "text" => Ok(Input::Text),
"stdin" => Ok(Input::Stdin), "stdin" => Ok(Input::Stdin),
"none" => Ok(Input::None),
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => { endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
Ok(Input::Endpoint(endpoint_path.to_string())) Ok(Input::Endpoint(endpoint_path.to_string()))
} }
...@@ -71,7 +65,6 @@ impl fmt::Display for Input { ...@@ -71,7 +65,6 @@ impl fmt::Display for Input {
Input::Stdin => "stdin", Input::Stdin => "stdin",
Input::Endpoint(path) => path, Input::Endpoint(path) => path,
Input::Batch(path) => &path.display().to_string(), Input::Batch(path) => &path.display().to_string(),
Input::None => "none",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -101,39 +94,21 @@ pub enum Output { ...@@ -101,39 +94,21 @@ pub enum Output {
/// Run inference on a model in a GGUF file using mistralrs w/ candle /// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs, MistralRs,
#[cfg(feature = "sglang")]
/// Deprecated
SgLangLegacy,
/// Run inference using sglang
SgLang,
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
/// Run inference using llama.cpp /// Run inference using llama.cpp
LlamaCpp, LlamaCpp,
/// Run inference using sglang
SgLang,
// Start vllm in a sub-process connecting via nats // Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>` // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm, Vllm,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.8.X+
Vllm0_8,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.7.X
Vllm0_7,
/// Run inference using a user supplied python file that accepts and returns /// Run inference using a user supplied python file that accepts and returns
/// strings. It does it's own pre-processing. /// strings. It does it's own pre-processing.
#[cfg(feature = "python")] #[cfg(feature = "python")]
PythonStr(String), PythonStr(String),
/// Run inference using a user supplied python file that accepts and returns
/// tokens. We do the pre-processing.
#[cfg(feature = "python")]
PythonTok(String),
//
// DEVELOPER NOTE // DEVELOPER NOTE
// If you add an engine add it to `available_engines` below, and to Default if it makes sense // If you add an engine add it to `available_engines` below, and to Default if it makes sense
} }
...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output { ...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
"mistralrs" => Ok(Output::MistralRs), "mistralrs" => Ok(Output::MistralRs),
#[cfg(feature = "sglang")]
"sglang_legacy" => Ok(Output::SgLangLegacy),
"sglang" => Ok(Output::SgLang),
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp), "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
"sglang" => Ok(Output::SgLang),
"vllm" => Ok(Output::Vllm), "vllm" => Ok(Output::Vllm),
#[cfg(feature = "vllm")]
"vllm0_8" => Ok(Output::Vllm0_8),
#[cfg(feature = "vllm")]
"vllm0_7" => Ok(Output::Vllm0_7),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output { ...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output {
Ok(Output::PythonStr(path.to_string())) Ok(Output::PythonStr(path.to_string()))
} }
#[cfg(feature = "python")]
python_tok_gen if python_tok_gen.starts_with(crate::PYTHON_TOK_SCHEME) => {
let path = python_tok_gen
.strip_prefix(crate::PYTHON_TOK_SCHEME)
.unwrap();
Ok(Output::PythonTok(path.to_string()))
}
e => Err(anyhow::anyhow!("Invalid out= option '{e}'")), e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
} }
} }
...@@ -196,21 +154,12 @@ impl fmt::Display for Output { ...@@ -196,21 +154,12 @@ impl fmt::Display for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => "mistralrs", Output::MistralRs => "mistralrs",
#[cfg(feature = "sglang")]
Output::SgLangLegacy => "sglang_legacy",
Output::SgLang => "sglang",
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => "llamacpp", Output::LlamaCpp => "llamacpp",
Output::SgLang => "sglang",
Output::Vllm => "vllm", Output::Vllm => "vllm",
#[cfg(feature = "vllm")]
Output::Vllm0_8 => "vllm0_8",
#[cfg(feature = "vllm")]
Output::Vllm0_7 => "vllm0_7",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
...@@ -218,9 +167,6 @@ impl fmt::Display for Output { ...@@ -218,9 +167,6 @@ impl fmt::Display for Output {
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(_) => "pystr", Output::PythonStr(_) => "pystr",
#[cfg(feature = "python")]
Output::PythonTok(_) => "pytok",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -258,22 +204,11 @@ impl Output { ...@@ -258,22 +204,11 @@ impl Output {
} }
out.push(Output::SgLang.to_string()); out.push(Output::SgLang.to_string());
#[cfg(feature = "sglang")]
{
out.push(Output::SgLangLegacy.to_string());
}
out.push(Output::Vllm.to_string()); out.push(Output::Vllm.to_string());
#[cfg(feature = "vllm")]
{
out.push(Output::Vllm0_7.to_string());
out.push(Output::Vllm0_8.to_string());
}
#[cfg(feature = "python")] #[cfg(feature = "python")]
{ {
out.push(Output::PythonStr("file.py".to_string()).to_string()); out.push(Output::PythonStr("file.py".to_string()).to_string());
out.push(Output::PythonTok("file.py".to_string()).to_string());
} }
out out
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow; use std::borrow::Cow;
use std::io::Write; use std::io::Write;
...@@ -23,6 +11,8 @@ use anyhow::Context; ...@@ -23,6 +11,8 @@ use anyhow::Context;
use regex::Regex; use regex::Regex;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use dynamo_llm::engines::MultiNodeConfig;
pub mod sglang; pub mod sglang;
pub mod vllm; pub mod vllm;
...@@ -39,6 +29,8 @@ pub async fn start( ...@@ -39,6 +29,8 @@ pub async fn start(
// sglang which GPU to start from, on a multi-GPU system // sglang which GPU to start from, on a multi-GPU system
// vllm uses CUDA_VISIBLE_DEVICES // vllm uses CUDA_VISIBLE_DEVICES
base_gpu_id: Option<u32>, base_gpu_id: Option<u32>,
// sglang multi-node config. vllm uses `ray` externally
multi_node_config: Option<MultiNodeConfig>,
// Path to a JSON file containing extra arguments to the backend engine // Path to a JSON file containing extra arguments to the backend engine
extra_engine_args: Option<&Path>, extra_engine_args: Option<&Path>,
) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> { ) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
...@@ -61,6 +53,15 @@ pub async fn start( ...@@ -61,6 +53,15 @@ pub async fn start(
args.push("--base-gpu-id".to_string()); args.push("--base-gpu-id".to_string());
args.push(base_gpu_id.to_string()); args.push(base_gpu_id.to_string());
} }
// sglang only
if let Some(multi_node_config) = multi_node_config {
args.push("--nnodes".to_string());
args.push(multi_node_config.num_nodes.to_string());
args.push("--node-rank".to_string());
args.push(multi_node_config.node_rank.to_string());
args.push("--dist-init-addr".to_string());
args.push(multi_node_config.leader_addr);
}
if let Some(extra_engine_args) = extra_engine_args { if let Some(extra_engine_args) = extra_engine_args {
args.push("--extra-engine-args".to_string()); args.push("--extra-engine-args".to_string());
args.push(extra_engine_args.to_string_lossy().to_string()); args.push(extra_engine_args.to_string_lossy().to_string());
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# #
# A very basic example of sglang worker handling pre-processed requests. # A very basic example of sglang worker handling pre-processed requests.
...@@ -52,6 +39,9 @@ class Config: ...@@ -52,6 +39,9 @@ class Config:
model: str model: str
base_gpu_id: int base_gpu_id: int
tensor_parallel_size: int tensor_parallel_size: int
nnodes: int
node_rank: int
dist_init_addr: str
extra_engine_args: str extra_engine_args: str
...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config):
"tp_size": config.tensor_parallel_size, "tp_size": config.tensor_parallel_size,
"base_gpu_id": config.base_gpu_id, "base_gpu_id": config.base_gpu_id,
} }
if config.dist_init_addr != "":
arg_map["trust_remote_code"] = True
arg_map["nnodes"] = config.nnodes
arg_map["dist_init_addr"] = config.dist_init_addr
# In practice this is always 0 because Dynamo only manages the leader
arg_map["node_rank"] = config.node_rank
if config.extra_engine_args != "": if config.extra_engine_args != "":
json_map = {} json_map = {}
# extra_engine_args is a filename # extra_engine_args is a filename
...@@ -157,6 +154,21 @@ def cmd_line_args(): ...@@ -157,6 +154,21 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--nnodes", type=int, default=1, help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
parser.add_argument( parser.add_argument(
"--extra-engine-args", "--extra-engine-args",
type=str, type=str,
...@@ -183,6 +195,9 @@ def cmd_line_args(): ...@@ -183,6 +195,9 @@ def cmd_line_args():
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.base_gpu_id = args.base_gpu_id config.base_gpu_id = args.base_gpu_id
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.nnodes = args.nnodes
config.node_rank = args.node_rank
config.dist_init_addr = args.dist_init_addr
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
return config return config
......
...@@ -66,19 +66,20 @@ class RequestHandler: ...@@ -66,19 +66,20 @@ class RequestHandler:
Request handler for the generate endpoint Request handler for the generate endpoint
""" """
def __init__(self, engine): def __init__(self, engine, default_sampling_params):
self.engine_client = engine self.engine_client = engine
self.default_sampling_params = default_sampling_params
async def generate(self, request): async def generate(self, request):
request_id = "1" # hello_world example only request_id = "1" # hello_world example only
logging.debug(f"Received request: {request}") logging.debug(f"Received request: {request}")
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams(
temperature=request["sampling_options"]["temperature"], sampling_params = SamplingParams(**self.default_sampling_params)
# vllm defaults this to 16 sampling_params.temperature = request["sampling_options"]["temperature"]
max_tokens=request["stop_conditions"]["max_tokens"], sampling_params.max_tokens = request["stop_conditions"]["max_tokens"]
)
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
gen = self.engine_client.generate(prompt, sampling_params, request_id) gen = self.engine_client.generate(prompt, sampling_params, request_id)
async for res in gen: async for res in gen:
...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config):
arg_map = {**arg_map, **json_map} # json_map gets precedence arg_map = {**arg_map, **json_map} # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map) engine_args = AsyncEngineArgs(**arg_map)
model_config = engine_args.create_model_config()
# Load default sampling params from `generation_config.json`
default_sampling_params = model_config.get_diff_sampling_param()
engine_context = build_async_engine_client_from_engine_args(engine_args) engine_context = build_async_engine_client_from_engine_args(engine_args)
engine_client = await engine_context.__aenter__() engine_client = await engine_context.__aenter__()
# the server will gracefully shutdown (i.e., keep opened TCP streams finishes) # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
# after the lease is revoked # after the lease is revoked
await endpoint.serve_endpoint(RequestHandler(engine_client).generate, None) await endpoint.serve_endpoint(
RequestHandler(engine_client, default_sampling_params).generate, None
)
def cmd_line_args(): def cmd_line_args():
......
...@@ -36,7 +36,6 @@ use tokio::sync::mpsc; ...@@ -36,7 +36,6 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender; use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt}; use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::{EngineDispatcher, StreamingEngine}; use dynamo_llm::engines::{EngineDispatcher, StreamingEngine};
/// Python snippet to import a file as a module /// Python snippet to import a file as a module
...@@ -89,26 +88,6 @@ pub async fn make_string_engine( ...@@ -89,26 +88,6 @@ pub async fn make_string_engine(
Ok(engine) Ok(engine)
} }
/// An engine that takes and returns tokens.
pub async fn make_token_engine(
cancel_token: CancellationToken,
py_file: &Path,
py_args: Vec<String>,
) -> pipeline_error::Result<ExecutionContext> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
}
let engine = new_engine(cancel_token, py_file, py_args).await?;
let engine: ExecutionContext = Arc::new(engine);
Ok(engine)
}
#[derive(Clone)] #[derive(Clone)]
pub struct PythonServerStreamingEngine { pub struct PythonServerStreamingEngine {
_cancel_token: CancellationToken, _cancel_token: CancellationToken,
...@@ -128,17 +107,6 @@ async fn new_engine( ...@@ -128,17 +107,6 @@ async fn new_engine(
let user_module = let user_module =
python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?; python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?;
let generator = Python::with_gil(|py| { let generator = Python::with_gil(|py| {
/* Leave commented, `initialize` may be needed to match Triton
if let Ok(initialize) = user_module.getattr(py, "initialize") {
initialize
.call1(py, (py_args,))
.inspect_err(|err| {
println!();
err.display(py);
})
.with_context(|| "Failed calling python engine's initialize(args)")?;
};
*/
user_module user_module
.getattr(py, "generate") .getattr(py, "generate")
.with_context(|| "generate") .with_context(|| "generate")
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
/// Token-in/token-out streaming engine backed by an sglang worker
/// (see `super::worker`).
pub struct SgLangEngine {
    // Cancelling this token terminates all in-flight response streams.
    cancel_token: CancellationToken,
    // Bridge used to enqueue requests to the sglang worker and to take its
    // join handle.
    worker: super::worker::SgLangWorker,
}
impl SgLangEngine {
    /// Start the sglang worker and wrap it in an engine.
    ///
    /// * `sock_code` - identifier used to name the ZMQ socket pair.
    /// * `model_path` - model weights to load.
    /// * `node_conf` - multi-node (pipeline parallel) settings.
    /// * `tensor_parallel_size` - GPUs per node.
    /// * `base_gpu_id` - first GPU index to use on a multi-GPU system.
    /// * `extra_engine_args` - optional JSON file of extra engine arguments.
    ///
    /// # Errors
    /// Returns any error from starting the worker.
    pub async fn new(
        cancel_token: CancellationToken,
        sock_code: &str,
        model_path: &Path,
        node_conf: MultiNodeConfig,
        tensor_parallel_size: u32,
        base_gpu_id: u32,
        extra_engine_args: Option<PathBuf>,
    ) -> anyhow::Result<Self> {
        let worker = super::worker::start(
            cancel_token.clone(),
            sock_code,
            model_path,
            node_conf,
            tensor_parallel_size,
            base_gpu_id,
            extra_engine_args,
        )
        .await?;
        Ok(Self {
            cancel_token,
            worker,
        })
    }

    /// Take ownership of the worker's background task handle so the caller
    /// can await its completion. Delegates to the worker's `take_sglang_handle`.
    pub fn take_sglang_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
        self.worker.take_sglang_handle()
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for SgLangEngine
{
    /// Enqueue a pre-processed (tokenized) request on the sglang worker and
    /// return a stream of its outputs.
    ///
    /// # Errors
    /// Fails if the request cannot be enqueued on the worker.
    async fn generate(
        &self,
        request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        // Split the payload from its pipeline context; the context is
        // re-attached to the response stream at the end.
        let (request, context) = request.into_parts();
        let ctx = context.context();
        let request_id = ctx.id().to_string();

        // Bounded channel over which the worker pushes response chunks.
        let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);

        let work_req = super::worker::WorkRequest {
            // Same id as `request_id` above (both come from the request context).
            request_id: context.id().to_string(),
            request,
            response_channel: resp_tx,
        };
        self.worker.enqueue_request(work_req).await?;

        let cancel_token = self.cancel_token.clone();
        // Adapt the channel into an async stream. The stream ends either when
        // the engine is cancelled, or when the worker drops its sender
        // (channel closed: generation finished or worker gone).
        let output = stream! {
            loop {
                tokio::select! {
                    _ = cancel_token.cancelled() => {
                        break;
                    }
                    maybe_resp_rx = resp_rx.recv() => {
                        match maybe_resp_rx {
                            Some(out) => {
                                yield out;
                            },
                            None => {
                                tracing::trace!(request_id, "generate: response channel closed");
                                break;
                            }
                        }
                    }
                }
            }
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment