chore(dynamo-run): Remove out=sglang|vllm|trtllm (#1920)

19a77ae7 · Graham King · GitHub · 3c500ae7 · 19a77ae7 · 19a77ae7
Unverified Commit 19a77ae7 authored Jul 23, 2025 by Graham King Committed by GitHub Jul 23, 2025
18 changed files
--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -64,46 +64,6 @@ pub struct Flags {
    #[arg(long)]
    pub model_config: Option<PathBuf>,
-    /// sglang, vllm
-    ///
-    /// How many GPUs to use at once, total across all nodes.
-    /// This must divide by num_nodes, and each node must use the same number of GPUs.
-    #[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
-    pub tensor_parallel_size: u32,
-    /// sglang only
-    /// vllm uses CUDA_VISIBLE_DEVICES env var
-    ///
-    /// Use GPUs from this ID upwards.
-    /// If your machine has four GPUs but the first two (0 and 1) are in use,
-    /// pass --base-gpu-id 2 to use the third GPU (and up, if tensor_parallel_size > 1)
-    #[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..256))]
-    pub base_gpu_id: u32,
-    /// vllm and sglang only
-    ///
-    /// How many nodes/hosts to use
-    #[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
-    pub num_nodes: u32,
-    /// vllm and sglang only
-    ///
-    /// This nodes' unique ID, running from 0 to num_nodes.
-    #[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..255))]
-    pub node_rank: u32,
-    /// For multi-node / pipeline parallel this is the <host>:<port> of the first node.
-    ///
-    /// - vllm: The address/port of the Ray head node.
-    ///
-    /// - sglang: The Torch Distributed init method address, in format <host>:<port>.
-    ///   It becomes "tcp://<host>:<port>" when given to torch.distributed.init_process_group.
-    ///   This expects to use the nccl backend (transparently to us here).
-    ///   All nodes must use the same address here, which is node_rank == 0's address.
-    ///
-    #[arg(long)]
-    pub leader_addr: Option<String>,
    /// If using `out=dyn` with multiple instances, this says how to route the requests.
    ///
    /// Mostly interesting for KV-aware routing.
@@ -199,22 +159,6 @@ impl Flags {
            }
            #[cfg(feature = "mistralrs")]
            Output::MistralRs => {}
-            Output::SgLang => {
-                if !local_model.path().is_dir() {
-                    // TODO GGUF support for sglang: https://github.com/ai-dynamo/dynamo/issues/572
-                    anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
-                }
-            }
-            Output::Vllm => {
-                if self.base_gpu_id != 0 {
-                    anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
-                }
-            }
-            Output::Trtllm => {
-                if self.base_gpu_id != 0 {
-                    anyhow::bail!("TRTLLM does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
-                }
-            }
            #[cfg(feature = "llamacpp")]
            Output::LlamaCpp => {
                if !local_model.path().is_file() {

--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-use std::time::Duration;
-use std::{future::Future, pin::Pin};
 use anyhow::Context as _;
 use dynamo_llm::entrypoint::input::Input;
 use dynamo_llm::entrypoint::EngineConfig;
@@ -17,9 +14,6 @@ pub use flags::Flags;
 mod opt;
 pub use dynamo_llm::request_template::RequestTemplate;
 pub use opt::Output;
-mod subprocess;
-const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
 pub async fn run(
    runtime: Runtime,
@@ -48,6 +42,7 @@ pub async fn run(
        .request_template(flags.request_template.clone())
        .migration_limit(flags.migration_limit);
+    // TODO: old, address this later:
    // If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
    // If not, then the endpoint isn't exposed so we let LocalModel invent one.
    let mut rt = Either::Left(runtime.clone());
@@ -71,7 +66,7 @@ pub async fn run(
    flags.validate(&local_model, &out_opt)?;
    // Make an engine from the local_model, flags and output.
-    let (engine_config, extra) = engine_for(
+    let engine_config = engine_for(
        runtime.primary_token(),
        out_opt,
        flags.clone(),
@@ -85,17 +80,9 @@ pub async fn run(
    //
    dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;
-    // Allow engines to ask main thread to wait on an extra future.
-    // We use this to stop the vllm and sglang sub-process
-    if let Some(extra) = extra {
-        extra.await;
-    }
    Ok(())
 }
-type ExtraFuture = Pin<Box<dyn Future<Output = ()> + Send>>;
 /// Create the engine matching `out_opt`
 /// Note validation happens in Flags::validate. In here assume everything is going to work.
 async fn engine_for(
@@ -104,71 +91,27 @@ async fn engine_for(
    flags: Flags,
    local_model: LocalModel,
    rt: Either<Runtime, DistributedRuntime>,
-) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
+) -> anyhow::Result<EngineConfig> {
    match out_opt {
-        Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)),
+        Output::Dynamic => Ok(EngineConfig::Dynamic(Box::new(local_model))),
-        Output::EchoFull => Ok((
+        Output::EchoFull => Ok(EngineConfig::StaticFull {
-            EngineConfig::StaticFull {
+            model: Box::new(local_model),
-                model: Box::new(local_model),
+            engine: dynamo_llm::engines::make_engine_full(),
-                engine: dynamo_llm::engines::make_engine_full(),
+        }),
-            },
+        Output::EchoCore => Ok(EngineConfig::StaticCore {
-            None,
+            engine: dynamo_llm::engines::make_engine_core(),
-        )),
+            model: Box::new(local_model),
-        Output::EchoCore => Ok((
+        }),
-            EngineConfig::StaticCore {
-                engine: dynamo_llm::engines::make_engine_core(),
-                model: Box::new(local_model),
-            },
-            None,
-        )),
        #[cfg(feature = "mistralrs")]
-        Output::MistralRs => Ok((
+        Output::MistralRs => Ok(EngineConfig::StaticFull {
-            EngineConfig::StaticFull {
+            engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
-                engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
+            model: Box::new(local_model),
-                model: Box::new(local_model),
+        }),
-            },
-            None,
-        )),
        #[cfg(feature = "llamacpp")]
-        Output::LlamaCpp => Ok((
+        Output::LlamaCpp => Ok(EngineConfig::StaticCore {
-            EngineConfig::StaticCore {
+            engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
-                engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
+            model: Box::new(local_model),
-                model: Box::new(local_model),
+        }),
-            },
-            None,
-        )),
-        // For multi-node config. vllm uses `ray`, see guide
-        Output::Vllm => shell(subprocess::vllm::PY, cancel_token, local_model, flags, None).await,
-        // For multi-node config. trtlllm uses `mpi`, see guide
-        Output::Trtllm => {
-            shell(
-                subprocess::trtllm::PY,
-                cancel_token,
-                local_model,
-                flags,
-                None,
-            )
-            .await
-        }
-        Output::SgLang => {
-            let multi_node_config = if flags.num_nodes > 1 {
-                Some(dynamo_llm::engines::MultiNodeConfig {
-                    num_nodes: flags.num_nodes,
-                    node_rank: flags.node_rank,
-                    leader_addr: flags.leader_addr.clone().unwrap_or_default(),
-                })
-            } else {
-                None
-            };
-            shell(
-                subprocess::sglang::PY,
-                cancel_token,
-                local_model,
-                flags,
-                multi_node_config,
-            )
-            .await
-        }
        Output::Mocker => {
            let Either::Right(drt) = rt else {
                panic!("Mocker requires a distributed runtime to run.");
@@ -180,76 +123,12 @@ async fn engine_for(
            let engine =
                dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;
-            Ok((
+            Ok(EngineConfig::StaticCore {
-                EngineConfig::StaticCore {
+                engine,
-                    engine,
+                model: Box::new(local_model),
-                    model: Box::new(local_model),
+            })
-                },
-                None,
-            ))
-        }
-    }
-}
-async fn shell(
-    py_script: &'static str,
-    cancel_token: CancellationToken,
-    local_model: LocalModel,
-    flags: Flags,
-    multi_node_config: Option<dynamo_llm::engines::MultiNodeConfig>,
-) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
-    let (py_script, child) =
-        match subprocess::start(py_script, &local_model, flags.clone(), multi_node_config).await {
-            Ok(x) => x,
-            Err(err) => {
-                anyhow::bail!("Failed starting engine sub-process: {err}");
-            }
-        };
-    // Sub-process cleanup
-    let extra: ExtraFuture = Box::pin(async move {
-        stopper(cancel_token, child, py_script).await;
-    });
-    Ok((EngineConfig::Dynamic(Box::new(local_model)), Some(extra)))
-}
-/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
-/// Keeps the TempPath alive until the child is stopped.
-async fn stopper(
-    cancel_token: CancellationToken,
-    mut child: tokio::process::Child,
-    py_script: tempfile::TempPath,
-) {
-    cancel_token.cancelled().await;
-    // Ask subprocess to stop gracefully
-    if let Some(pid) = child.id() {
-        unsafe { libc::kill(pid as i32, libc::SIGTERM) };
-    }
-    tokio::select! {
-        exit = child.wait() => {
-            tracing::trace!("engine sub-process graceful exit");
-            match exit {
-                Ok(exit_status) if exit_status.success() => {}
-                Ok(exit_status) => {
-                    // This is nearly always 15 (SIGTERM)
-                    tracing::trace!("engine sub-process non-0 exit: {exit_status}");
-                }
-                Err(err) => {
-                    tracing::warn!("engine sub-process error getting exit status: {err}");
-                }
-            }
-        }
-        _ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
-            // It didn't stop in time, kill it
-            child.kill().await.expect("Failed killing engine subprocess");
-            let _ = child.wait().await;
        }
    }
-    // This temporary file contains the python script running the engine. It deletes on drop.
-    // Keep it alive until the engine has stopped.
-    drop(py_script);
 }
 /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.

--- a/launch/dynamo-run/src/main.rs
+++ b/launch/dynamo-run/src/main.rs
@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
                in_opt = Some(val.try_into()?);
            }
            "out" => {
+                if val == "sglang" || val == "trtllm" || val == "vllm" {
+                    tracing::error!("To run the {val} engine please use the Python interface, see root README or look in directory `components/backends/`.");
+                    std::process::exit(1);
+                }
                out_opt = Some(val.try_into()?);
            }
            _ => {

--- a/launch/dynamo-run/src/opt.rs
+++ b/launch/dynamo-run/src/opt.rs
@@ -22,16 +22,6 @@ pub enum Output {
    /// Run inference using llama.cpp
    LlamaCpp,
-    /// Run inference using sglang
-    SgLang,
-    /// Run inference using trtllm
-    Trtllm,
-    // Start vllm in a sub-process connecting via nats
-    // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
-    Vllm,
    Mocker,
 }
@@ -46,11 +36,7 @@ impl TryFrom<&str> for Output {
            #[cfg(feature = "llamacpp")]
            "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
-            "sglang" => Ok(Output::SgLang),
-            "trtllm" => Ok(Output::Trtllm),
-            "vllm" => Ok(Output::Vllm),
            "mocker" => Ok(Output::Mocker),
            "echo_full" => Ok(Output::EchoFull),
            "echo_core" => Ok(Output::EchoCore),
@@ -79,11 +65,7 @@ impl fmt::Display for Output {
            #[cfg(feature = "llamacpp")]
            Output::LlamaCpp => "llamacpp",
-            Output::SgLang => "sglang",
-            Output::Trtllm => "trtllm",
-            Output::Vllm => "vllm",
            Output::Mocker => "mocker",
            Output::EchoFull => "echo_full",
            Output::EchoCore => "echo_core",
@@ -96,7 +78,11 @@ impl fmt::Display for Output {
 impl Output {
    #[allow(unused_mut)]
    pub fn available_engines() -> Vec<String> {
-        let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
+        let mut out = vec![
+            "echo_core".to_string(),
+            "echo_full".to_string(),
+            Output::Mocker.to_string(),
+        ];
        #[cfg(feature = "mistralrs")]
        {
            out.push(Output::MistralRs.to_string());
@@ -107,11 +93,6 @@ impl Output {
            out.push(Output::LlamaCpp.to_string());
        }
-        out.push(Output::SgLang.to_string());
-        out.push(Output::Trtllm.to_string());
-        out.push(Output::Vllm.to_string());
-        out.push(Output::Mocker.to_string());
        out
    }
 }
--- a/launch/dynamo-run/src/subprocess.rs
+++ b/launch/dynamo-run/src/subprocess.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-use std::borrow::Cow;
-use std::io::Write;
-use std::process::Stdio;
-use std::sync::LazyLock;
-use anyhow::Context;
-use regex::Regex;
-use tokio::io::AsyncBufReadExt;
-use crate::flags::RouterMode;
-use dynamo_llm::engines::MultiNodeConfig;
-use dynamo_llm::local_model::LocalModel;
-pub mod sglang;
-pub mod trtllm;
-pub mod vllm;
-pub async fn start(
-    // The Python code to run
-    py_script: &'static str,
-    // Model info
-    local_model: &LocalModel,
-    // Command line flags for user overrides
-    flags: super::Flags,
-    // sglang multi-node config. vllm uses `ray` externally
-    multi_node_config: Option<MultiNodeConfig>,
-) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
-    let mut tmp = tempfile::NamedTempFile::new()?;
-    // Writes on Linux don't block
-    tmp.write_all(py_script.as_bytes())?;
-    let script_path = tmp.into_temp_path();
-    let card = local_model.card();
-    let mut args = vec![
-        script_path.to_string_lossy().to_string(),
-        "--endpoint".to_string(),
-        local_model.endpoint_id().as_url(),
-        "--model-path".to_string(),
-        local_model.path().to_string_lossy().to_string(),
-        "--model-name".to_string(),
-        local_model.display_name().to_string(),
-        "--tensor-parallel-size".to_string(),
-        flags.tensor_parallel_size.to_string(),
-        "--kv-block-size".to_string(),
-        card.kv_cache_block_size.to_string(),
-        "--context-length".to_string(),
-        card.context_length.to_string(),
-        "--migration-limit".to_string(),
-        card.migration_limit.to_string(),
-    ];
-    // TRTLLM only
-    // The worker node will only publish events and metrics if the router mode is KV
-    if flags.router_mode == RouterMode::KV {
-        args.push("--publish-events-and-metrics".to_string());
-    }
-    // sglang only
-    // vllm uses CUDA_VISIBLE_DEVICES
-    if flags.base_gpu_id != 0 {
-        args.push("--base-gpu-id".to_string());
-        args.push(flags.base_gpu_id.to_string());
-    }
-    // sglang only
-    if let Some(multi_node_config) = multi_node_config {
-        args.push("--nnodes".to_string());
-        args.push(multi_node_config.num_nodes.to_string());
-        args.push("--node-rank".to_string());
-        args.push(multi_node_config.node_rank.to_string());
-        args.push("--dist-init-addr".to_string());
-        args.push(multi_node_config.leader_addr);
-    }
-    if let Some(extra_engine_args) = flags.extra_engine_args {
-        args.push("--extra-engine-args".to_string());
-        args.push(extra_engine_args.to_string_lossy().to_string());
-    }
-    let mut cmd = tokio::process::Command::new("python3");
-    cmd.kill_on_drop(false)
-        .args(args)
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped());
-    let mut child = cmd
-        .spawn()
-        .with_context(|| format!("Failed running: '{}'", pretty_cmd(&cmd)))?;
-    // Safety: We set stdout/stderr a few lines above
-    let stdout = tokio::io::BufReader::new(child.stdout.take().unwrap());
-    let stderr = tokio::io::BufReader::new(child.stderr.take().unwrap());
-    tokio::spawn(async move {
-        let mut lines = stdout.lines();
-        while let Ok(Some(line)) = lines.next_line().await {
-            tracing::info!("{}", strip_log_prefix(&line));
-        }
-    });
-    tokio::spawn(async move {
-        let mut lines = stderr.lines();
-        while let Ok(Some(line)) = lines.next_line().await {
-            // FIXME: always logging INFO/DEBUG will hide real errors, but
-            // some libraries log non-errors to stderr, which confuses users
-            // when we log those as ERROR. Using WARN as a middle ground for
-            // now, but we can probably be smarter here.
-            tracing::warn!("{}", strip_log_prefix(&line));
-        }
-    });
-    // We must keep temp path alive, it deletes on drop
-    Ok((script_path, child))
-}
-pub fn pretty_cmd(c: &tokio::process::Command) -> String {
-    format!(
-        "{} {}",
-        c.as_std().get_program().to_string_lossy(),
-        c.as_std()
-            .get_args()
-            .map(|x| x.to_string_lossy())
-            .collect::<Vec<std::borrow::Cow<'_, str>>>()
-            .join(" ")
-    )
-}
-// Thanks Gemini
-static LOG_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| {
-    Regex::new(
-        r"^(?:(?:[A-Z]+ \d{2}-\d{2} \d{2}:\d{2}:\d{2})|(?:\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\])) (.*)$"
-    ).unwrap()
-    // ^                                  Start of the line.
-    // (?:                                Non-capturing group for the two prefix alternatives.
-    //      (?:                           Non-capturing group for the first prefix type.
-    //          [A-Z]+                    One or more uppercase letters (log level).
-    //            (single space)          A literal space.
-    //          \d{2}-\d{2}               Date like MM-DD.
-    //            (single space)          A literal space.
-    //          \d{2}:\d{2}:\d{2}         Time like HH:MM:SS.
-    //      )                             End of first prefix type group.
-    //      |                             OR
-    //      (?:                           Non-capturing group for the second prefix type.
-    //          \[                        A literal opening square bracket.
-    //          \d{4}-\d{2}-\d{2}         Date like YYYY-MM-DD.
-    //            (single space)          A literal space.
-    //          \d{2}:\d{2}:\d{2}         Time like HH:MM:SS.
-    //          \]                        A literal closing square bracket.
-    //      )                             End of second prefix type group.
-    // )                                  End of the alternatives group.
-    //   (single space)                   A literal space. This is the space BEFORE the message.
-    // (.*)                               Capture group 1: The rest of the line (the message).
-    // $                                  End of the line.
-});
-/// Strips the log level, date, and time from the start of a log line.
-///
-/// # Examples
-/// let line = "INFO 05-06 09:38:50 [async_llm.py:252] Added request 1";
-/// assert_eq!(strip_log_prefix(line), "[async_llm.py:252] Added request 1");
-///
-/// let line_no_prefix = "This is a normal line.";
-/// assert_eq!(strip_log_prefix(line_no_prefix), "This is a normal line.");
-fn strip_log_prefix(line: &str) -> Cow<'_, str> {
-    if let Some(captures) = LOG_PREFIX_RE.captures(line) {
-        // `captures.get(0)` would be the entire matched prefix + message.
-        // `captures.get(1)` is the first capture group, which is `(.*)`, the message itself.
-        if let Some(message_match) = captures.get(1) {
-            return Cow::Borrowed(message_match.as_str());
-        }
-    }
-    // If the regex doesn't match, or somehow the capture group is not found (shouldn't happen with (.*))
-    // return the original line.
-    Cow::Borrowed(line)
-}
-#[cfg(test)]
-mod tests {
-    use super::strip_log_prefix;
-    #[test]
-    fn test_strip_log_prefix() {
-        let line = "INFO 05-06 09:38:50 [async_llm.py:252] Added request 1";
-        let expected = "[async_llm.py:252] Added request 1";
-        assert_eq!(strip_log_prefix(line), expected);
-        let line = "Just a regular line.";
-        assert_eq!(strip_log_prefix(line), line);
-        let line = "INFO this is not a full prefix";
-        assert_eq!(strip_log_prefix(line), line);
-        let line = "[2025-05-06 11:58:51] Capture cuda graph bs [1, 2, 4, 8]";
-        assert_eq!(strip_log_prefix(line), "Capture cuda graph bs [1, 2, 4, 8]");
-    }
-}
--- a/launch/dynamo-run/src/subprocess/sglang.rs
+++ b/launch/dynamo-run/src/subprocess/sglang.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/// Source code of the SGLang sub-process
-pub const PY: &str = include_str!("sglang_inc.py");
--- a/launch/dynamo-run/src/subprocess/sglang_inc.py
+++ b/launch/dynamo-run/src/subprocess/sglang_inc.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# `dynamo-run out=sglang` runs this script
-# Can also be used standalone: `python3 sglang_inc.py` - lots of optional cmd line params
-import argparse
-import asyncio
-import json
-import logging
-import sys
-from typing import Optional
-import sglang
-import uvloop
-from sglang.srt.entrypoints.engine import EmbeddingReqInput
-from sglang.srt.server_args import ServerArgs
-from dynamo.llm import ModelType, register_llm
-from dynamo.runtime import DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-# Only used if you run it manually from the command line
-DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-configure_dynamo_logging()
-class Config:
-    """Command line parameters or defaults"""
-    namespace: str
-    component: str
-    endpoint: str
-    model_path: str
-    model_name: Optional[str]
-    base_gpu_id: int
-    tensor_parallel_size: int
-    kv_block_size: int
-    context_length: int
-    nnodes: int
-    node_rank: int
-    dist_init_addr: str
-    migration_limit: int
-    extra_engine_args: str
-class RequestHandler:
-    """
-    Request handler for the generate endpoint
-    """
-    def __init__(self, engine):
-        self.engine_client = engine
-    async def generate(self, request):
-        sampling_params = {}
-        if request["sampling_options"]["temperature"] is not None:
-            sampling_params["temperature"] = request["sampling_options"]["temperature"]
-        sampling_params = {
-            # sglang defaults this to 128
-            "max_new_tokens": request["stop_conditions"]["max_tokens"],
-        }
-        # Check if this is a batch request
-        is_batch = "batch_token_ids" in request and request["batch_token_ids"]
-        if is_batch:
-            # Track tokens separately for each batch item
-            num_output_tokens_so_far = {}
-            logging.debug("received batch token ids")
-            gen = await self.engine_client.async_generate(
-                input_ids=request["batch_token_ids"],
-                sampling_params=sampling_params,
-                stream=True,
-            )
-        else:
-            num_output_tokens_so_far = 0
-            logging.debug("received token ids")
-            gen = await self.engine_client.async_generate(
-                input_ids=request["token_ids"],
-                sampling_params=sampling_params,
-                stream=True,
-            )
-        async for res in gen:
-            # res is a dict
-            logging.debug(f"res: {res}")
-            finish_reason = res["meta_info"]["finish_reason"]
-            if is_batch:
-                # Handle batch response - get index from SGLang response
-                index = res.get("index", 0)
-                if index not in num_output_tokens_so_far:
-                    num_output_tokens_so_far[index] = 0
-                if finish_reason:
-                    logging.warning(f"finish_reason: {finish_reason}")
-                    # Final response for this batch item
-                    out = {
-                        "token_ids": [],
-                        "finish_reason": finish_reason["type"],
-                        "index": index,
-                    }
-                else:
-                    # Streaming response for this batch item
-                    next_total_toks = len(res["output_ids"])
-                    new_tokens = res["output_ids"][num_output_tokens_so_far[index] :]
-                    out = {
-                        "token_ids": new_tokens,
-                        "index": index,
-                    }
-                    num_output_tokens_so_far[index] = next_total_toks
-            else:
-                if finish_reason:
-                    out = {
-                        "token_ids": [],
-                        "finish_reason": finish_reason["type"],
-                    }
-                else:
-                    next_total_toks = len(res["output_ids"])
-                    new_tokens = res["output_ids"][num_output_tokens_so_far:]
-                    out = {
-                        "token_ids": new_tokens,
-                    }
-                    num_output_tokens_so_far = next_total_toks
-            yield out
-    async def encode(self, request):
-        obj = EmbeddingReqInput(input_ids=request["token_ids"])
-        generator = self.engine_client.tokenizer_manager.generate_request(obj, None)
-        engine_results = await anext(generator)
-        tokens = 0
-        embeddings = []
-        for result in engine_results:
-            embeddings.append(result["embedding"])
-            tokens += result["meta_info"]["prompt_tokens"]
-        out = {
-            "embeddings": embeddings,
-            "prompt_tokens": tokens,
-            "total_tokens": tokens,
-        }
-        yield out
-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
-    await init(runtime, cmd_line_args())
-async def init(runtime: DistributedRuntime, config: Config):
-    """
-    Instantiate and serve
-    """
-    arg_map = {
-        "model_path": config.model_path,
-        "skip_tokenizer_init": True,
-        "tp_size": config.tensor_parallel_size,
-        "base_gpu_id": config.base_gpu_id,
-    }
-    if config.kv_block_size:
-        arg_map["page_size"] = config.kv_block_size
-    if config.context_length:
-        arg_map["context_length"] = config.context_length
-    if config.dist_init_addr != "":
-        arg_map["trust_remote_code"] = True
-        arg_map["nnodes"] = config.nnodes
-        arg_map["dist_init_addr"] = config.dist_init_addr
-        # In practice this is always 0 because Dynamo only manages the leader
-        arg_map["node_rank"] = config.node_rank
-    if config.extra_engine_args != "":
-        json_map = {}
-        # extra_engine_args is a filename
-        try:
-            with open(config.extra_engine_args) as f:
-                json_map = json.load(f)
-        except FileNotFoundError:
-            logging.error(f"File {config.extra_engine_args} not found.")
-        except json.JSONDecodeError as e:
-            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
-        logging.debug(f"Adding extra engine arguments: {json_map}")
-        arg_map = {**arg_map, **json_map}  # json_map gets precedence
-    # TODO fetch default SamplingParams from generation_config.json
-    engine_args = ServerArgs(**arg_map)
-    engine_client = sglang.Engine(server_args=engine_args)
-    component = runtime.namespace(config.namespace).component(config.component)
-    await component.create_service()
-    endpoint = component.endpoint(config.endpoint)
-    model_type = (
-        ModelType.Backend if not engine_args.is_embedding else ModelType.Embedding
-    )
-    await register_llm(
-        model_type,
-        endpoint,
-        config.model_path,
-        config.model_name,
-        migration_limit=config.migration_limit,
-    )
-    # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
-    # after the lease is revoked
-    handler = RequestHandler(engine_client)
-    if engine_args.is_embedding:
-        await endpoint.serve_endpoint(handler.encode)
-    else:
-        await endpoint.serve_endpoint(handler.generate)
-def cmd_line_args():
-    parser = argparse.ArgumentParser(
-        description="SGLang server integrated with Dynamo LLM."
-    )
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default=DEFAULT_ENDPOINT,
-        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
-    )
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="",
-        help="Name to serve the model under. Defaults to deriving it from model path.",
-    )
-    parser.add_argument(
-        "--base-gpu-id",
-        type=int,
-        default=0,
-        help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
-    )
-    parser.add_argument(
-        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
-    )
-    parser.add_argument(
-        "--context-length",
-        type=int,
-        default=None,
-        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
-    )
-    parser.add_argument(
-        "--nnodes", type=int, default=1, help="The number of machines SGLang will use"
-    )
-    parser.add_argument(
-        "--node-rank",
-        type=int,
-        default=0,
-        help="Unique number for each node. 0 for the leader.",
-    )
-    parser.add_argument(
-        "--dist-init-addr",
-        type=str,
-        default="",
-        help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
-    )
-    parser.add_argument(
-        "--migration-limit",
-        type=int,
-        default=0,
-        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
-    )
-    parser.add_argument(
-        "--extra-engine-args",
-        type=str,
-        default="",
-        help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
-    )
-    args = parser.parse_args()
-    config = Config()
-    config.model_path = args.model_path
-    if args.model_name:
-        config.model_name = args.model_name
-    else:
-        # This becomes an `Option` on the Rust side
-        config.model_name = None
-    endpoint_str = args.endpoint.replace("dyn://", "", 1)
-    endpoint_parts = endpoint_str.split(".")
-    if len(endpoint_parts) != 3:
-        logging.error(
-            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
-        )
-        sys.exit(1)
-    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
-    config.namespace = parsed_namespace
-    config.component = parsed_component_name
-    config.endpoint = parsed_endpoint_name
-    config.base_gpu_id = args.base_gpu_id
-    config.tensor_parallel_size = args.tensor_parallel_size
-    config.kv_block_size = args.kv_block_size
-    config.context_length = args.context_length
-    config.nnodes = args.nnodes
-    config.node_rank = args.node_rank
-    config.dist_init_addr = args.dist_init_addr
-    config.migration_limit = args.migration_limit
-    config.extra_engine_args = args.extra_engine_args
-    return config
-if __name__ == "__main__":
-    uvloop.install()
-    asyncio.run(worker())
--- a/launch/dynamo-run/src/subprocess/trtllm.rs
+++ b/launch/dynamo-run/src/subprocess/trtllm.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/// Source code of the TRTLLM sub-process
-pub const PY: &str = include_str!("trtllm_inc.py");
--- a/launch/dynamo-run/src/subprocess/trtllm_config/sample.yaml
+++ b/launch/dynamo-run/src/subprocess/trtllm_config/sample.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This is a sample config for TensorRT-LLM engine.
-# The config provides smaller free_gpu_memory_fraction to ensure that the engine
-# does not use all the GPU memory and both prefill and decode workers can fit in
-# the GPU memory when running in disaggregated mode.
-# You might have to tweak this config based on your model size and GPU memory.
-backend: pytorch
-disable_overlap_scheduler: true
-kv_cache_config:
-  free_gpu_memory_fraction: 0.40
--- a/launch/dynamo-run/src/subprocess/trtllm_inc.py
+++ b/launch/dynamo-run/src/subprocess/trtllm_inc.py
--- a/launch/dynamo-run/src/subprocess/vllm.rs
+++ b/launch/dynamo-run/src/subprocess/vllm.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/// Source code of the VLLM sub-process
-pub const PY: &str = include_str!("vllm_inc.py");
--- a/launch/dynamo-run/src/subprocess/vllm_inc.py
+++ b/launch/dynamo-run/src/subprocess/vllm_inc.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# `dynamo-run out=vllm` runs this script
-# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
-# Setup checklist:
-# - We are in a virtualenv with vllm installed - and patched if using kv routing.
-# - `libdynamo_llm_capi.so` is in system lib path or it's containing folder is in LD_LIBRARY_PATH
-#   It builds in target/debug/ by default.
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import uuid
-from typing import Optional
-import uvloop
-from vllm import SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TokensPrompt
-from dynamo.llm import (
-    ForwardPassMetrics,
-    KvStats,
-    ModelType,
-    WorkerMetricsPublisher,
-    WorkerStats,
-    register_llm,
-)
-from dynamo.runtime import DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-from dynamo.sdk.lib.utils import get_capi_library_path
-# Only used if you run it manually from the command line
-DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-configure_dynamo_logging()
-class Config:
-    """Command line parameters or defaults"""
-    namespace: str
-    component: str
-    endpoint: str
-    model_path: str
-    model_name: Optional[str]
-    tensor_parallel_size: int
-    kv_block_size: int
-    context_length: int
-    migration_limit: int
-    extra_engine_args: str
-class RequestHandler:
-    """
-    Request handler for the generate endpoint
-    """
-    def __init__(self, component, engine, default_sampling_params):
-        self.component = component
-        self.engine_client = engine
-        self.default_sampling_params = default_sampling_params
-        self.metrics_publisher = WorkerMetricsPublisher()
-    def setup_kv_metrics(self):
-        if not hasattr(self.engine_client, "set_metrics_publisher"):
-            logging.debug("VLLM version does not support KV metrics")
-            return
-        self.engine_client.set_metrics_publisher(self.metrics_publisher)
-        # Initially send dummy metrics to kick start,
-        # vLLM will not update stat until forward pass is triggered
-        # Create the structured metrics objects
-        worker_stats = WorkerStats(
-            request_active_slots=0,
-            request_total_slots=1024,
-            num_requests_waiting=0,
-            data_parallel_rank=None,
-        )
-        kv_stats = KvStats(
-            kv_active_blocks=0,
-            kv_total_blocks=1024,
-            gpu_cache_usage_perc=0.0,
-            gpu_prefix_cache_hit_rate=0.0,
-        )
-        metrics = ForwardPassMetrics(
-            worker_stats=worker_stats, kv_stats=kv_stats, spec_decode_stats=None
-        )
-        # Publish the metrics as a single object
-        self.metrics_publisher.publish(metrics)
-        task = asyncio.create_task(self.create_metrics_publisher_endpoint())
-        task.add_done_callback(
-            lambda _: logging.debug("metrics publisher endpoint created")
-        )
-    async def create_metrics_publisher_endpoint(self):
-        logging.debug("Creating metrics publisher endpoint")
-        await self.metrics_publisher.create_endpoint(self.component)
-    async def generate(self, request):
-        # logging.debug(f"Received request: {request}")
-        request_id = str(uuid.uuid4().hex)
-        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
-        sampling_params = SamplingParams(**self.default_sampling_params)
-        for key, value in request["sampling_options"].items():
-            if not value:
-                continue
-            if hasattr(sampling_params, key):
-                setattr(sampling_params, key, value)
-        max_tokens = request["stop_conditions"]["max_tokens"]
-        if max_tokens:
-            sampling_params.max_tokens = max_tokens
-        num_output_tokens_so_far = 0
-        gen = self.engine_client.generate(prompt, sampling_params, request_id)
-        async for res in gen:
-            # res is vllm's RequestOutput
-            # This is the expected way for a request to end.
-            # The new token ID will be eos, don't forward it.
-            if res.finished:
-                yield {"finish_reason": "stop", "token_ids": []}
-                break
-            if not res.outputs:
-                yield {"finish_reason": "error", "token_ids": []}
-                break
-            output = res.outputs[0]
-            next_total_toks = len(output.token_ids)
-            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
-            if output.finish_reason:
-                out["finish_reason"] = output.finish_reason
-            if output.stop_reason:
-                out["stop_reason"] = output.stop_reason
-            yield out
-            num_output_tokens_so_far = next_total_toks
-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
-    await init(runtime, cmd_line_args())
-def _check_and_set_env_value(key, expected, allow_override=False):
-    if not allow_override and key in os.environ and os.environ[key] != expected:
-        raise ValueError(
-            f"{key} is set and doesn't equal expected {expected}. Please unset variable before launch."
-        )
-    os.environ.setdefault(key, expected)
-async def init(runtime: DistributedRuntime, config: Config):
-    """
-    Instantiate and serve
-    """
-    arg_map = {
-        "model": config.model_path,
-        "task": "generate",
-        "tensor_parallel_size": config.tensor_parallel_size,
-        "skip_tokenizer_init": True,
-        "disable_log_requests": True,
-        "enable_prefix_caching": True,
-        # KV routing relies on logging KV metrics
-        "disable_log_stats": False,
-    }
-    assert config.kv_block_size > 0, "Must use non-negative integer for KV Block Size"
-    arg_map["block_size"] = config.kv_block_size
-    if config.context_length:
-        # Usually we want it to default to the max (from tokenizer_config.json)
-        arg_map["max_model_len"] = config.context_length
-    if config.extra_engine_args != "":
-        json_map = {}
-        # extra_engine_args is a filename
-        try:
-            with open(config.extra_engine_args) as f:
-                json_map = json.load(f)
-        except FileNotFoundError:
-            logging.error(f"File {config.extra_engine_args} not found.")
-        except json.JSONDecodeError as e:
-            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
-        logging.debug(f"Adding extra engine arguments: {json_map}")
-        arg_map = {**arg_map, **json_map}  # json_map gets precedence
-    # Patch won't start KVCacheEventManager unless these four are set
-    component = runtime.namespace(config.namespace).component(config.component)
-    await component.create_service()
-    endpoint = component.endpoint(config.endpoint)
-    _check_and_set_env_value("VLLM_WORKER_ID", str(endpoint.lease_id()))
-    _check_and_set_env_value(
-        "VLLM_KV_CAPI_PATH", get_capi_library_path(), allow_override=True
-    )
-    _check_and_set_env_value("VLLM_KV_NAMESPACE", config.namespace)
-    _check_and_set_env_value("VLLM_KV_COMPONENT", config.component)
-    _check_and_set_env_value(
-        "VLLM_NO_USAGE_STATS", "1", allow_override=True
-    )  # Avoid internal HTTP requests
-    engine_args = AsyncEngineArgs(**arg_map)
-    model_config = engine_args.create_model_config()
-    # Load default sampling params from `generation_config.json`
-    default_sampling_params = model_config.get_diff_sampling_param()
-    engine_context = build_async_engine_client_from_engine_args(engine_args)
-    engine_client = await engine_context.__aenter__()
-    await register_llm(
-        ModelType.Backend,
-        endpoint,
-        config.model_path,
-        config.model_name,
-        context_length=arg_map.get(
-            "max_model_len", None
-        ),  # if None, takes length from tokenizer
-        kv_cache_block_size=arg_map["block_size"],
-        migration_limit=config.migration_limit,
-    )
-    handler = RequestHandler(component, engine_client, default_sampling_params)
-    handler.setup_kv_metrics()
-    # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
-    # after the lease is revoked
-    await endpoint.serve_endpoint(handler.generate)
-def cmd_line_args():
-    parser = argparse.ArgumentParser(
-        description="vLLM server integrated with Dynamo LLM."
-    )
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default=DEFAULT_ENDPOINT,
-        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
-    )
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="",
-        help="Name to serve the model under. Defaults to deriving it from model path.",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
-    )
-    parser.add_argument(
-        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
-    )
-    parser.add_argument(
-        "--context-length",
-        type=int,
-        default=None,
-        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
-    )
-    parser.add_argument(
-        "--migration-limit",
-        type=int,
-        default=0,
-        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
-    )
-    parser.add_argument(
-        "--extra-engine-args",
-        type=str,
-        default="",
-        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
-    )
-    args = parser.parse_args()
-    config = Config()
-    config.model_path = args.model_path
-    if args.model_name:
-        config.model_name = args.model_name
-    else:
-        # This becomes an `Option` on the Rust side
-        config.model_name = None
-    endpoint_str = args.endpoint.replace("dyn://", "", 1)
-    endpoint_parts = endpoint_str.split(".")
-    if len(endpoint_parts) != 3:
-        logging.error(
-            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
-        )
-        sys.exit(1)
-    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
-    config.namespace = parsed_namespace
-    config.component = parsed_component_name
-    config.endpoint = parsed_endpoint_name
-    config.tensor_parallel_size = args.tensor_parallel_size
-    config.kv_block_size = args.kv_block_size
-    config.context_length = args.context_length
-    config.migration_limit = args.migration_limit
-    config.extra_engine_args = args.extra_engine_args
-    return config
-if __name__ == "__main__":
-    uvloop.install()
-    asyncio.run(worker())
--- a/launch/dynamo-run/src/subprocess/vllm_v1_inc.py
+++ b/launch/dynamo-run/src/subprocess/vllm_v1_inc.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# `dynamo-run out=vllm` runs this script
-# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
-# Setup checklist:
-# - We are in a virtualenv with vllm installed. V1 is compatible with v0.9.0
-# Steps:
-# git clone https://github.com/vllm-project/vllm.git
-# cd vllm && git checkout v0.9.0
-# uv pip uninstall ai-dynamo-vllm
-# VLLM_USE_PRECOMPILED=1 uv pip install --editable .
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import uuid
-from typing import Optional
-import uvloop
-from vllm.config import VllmConfig
-from vllm.distributed.kv_events import KVEventsConfig
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.inputs import TokensPrompt
-from vllm.sampling_params import SamplingParams
-from vllm.usage.usage_lib import UsageContext
-from vllm.v1.engine.async_llm import AsyncLLM
-from vllm.v1.metrics.loggers import StatLoggerBase
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from dynamo.llm import (
-    ForwardPassMetrics,
-    KvStats,
-    ModelType,
-    SpecDecodeStats,
-    WorkerMetricsPublisher,
-    WorkerStats,
-    ZmqKvEventPublisher,
-    ZmqKvEventPublisherConfig,
-    register_llm,
-)
-from dynamo.runtime import Component, DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-# Only used if you run it manually from the command line
-DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-configure_dynamo_logging()
-logger = logging.getLogger(__name__)
-class Config:
-    """Command line parameters or defaults"""
-    namespace: str
-    component: str
-    endpoint: str
-    model_path: str
-    model_name: Optional[str]
-    tensor_parallel_size: int
-    kv_block_size: int
-    context_length: int
-    migration_limit: int
-    extra_engine_args: str
-class DynamoStatLoggerPublisher(StatLoggerBase):
-    """Stat logger publisher. Wrapper for the WorkerMetricsPublisher to match the StatLoggerBase interface."""
-    def __init__(self, component: Component, dp_rank: int) -> None:
-        self.inner = WorkerMetricsPublisher()
-        self.inner.create_endpoint(component)
-        self.dp_rank = dp_rank
-    def record(
-        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
-    ):
-        # request_total_slots and kv_total_blocks are properties of model + gpu
-        # we should only publish them once, not every metric update
-        # they should be part of some runtime metadata tied to MDC or put in etcd ?
-        hit_rate = 0
-        if scheduler_stats.prefix_cache_stats.queries > 0:
-            hit_rate = (
-                scheduler_stats.prefix_cache_stats.hits
-                / scheduler_stats.prefix_cache_stats.queries
-            )
-        worker_stats = WorkerStats(
-            request_active_slots=scheduler_stats.num_running_reqs,
-            request_total_slots=0,  # TODO - remove from metrics
-            num_requests_waiting=scheduler_stats.num_waiting_reqs,
-            data_parallel_rank=None,
-        )
-        kv_stats = KvStats(
-            kv_active_blocks=0,  # TODO - need to calculate this
-            kv_total_blocks=0,  # TODO - remove from metrics
-            gpu_cache_usage_perc=scheduler_stats.gpu_cache_usage,  # used in current cost function
-            gpu_prefix_cache_hit_rate=hit_rate,
-        )
-        spec_dec_stats = scheduler_stats.spec_decoding_stats
-        if spec_dec_stats:
-            spec_dec_stats = SpecDecodeStats(
-                num_spec_tokens=spec_dec_stats.num_spec_tokens,
-                num_drafts=spec_dec_stats.num_drafts,
-                num_draft_tokens=spec_dec_stats.num_draft_tokens,
-                num_accepted_tokens=spec_dec_stats.num_accepted_tokens,
-                num_accepted_tokens_per_pos=spec_dec_stats.num_accepted_tokens_per_pos,
-            )
-        metrics = ForwardPassMetrics(
-            worker_stats=worker_stats,
-            kv_stats=kv_stats,
-            spec_decode_stats=spec_dec_stats,
-        )
-        self.inner.publish(metrics)
-    def log_engine_initialized(self) -> None:
-        pass
-class StatLoggerFactory:
-    """Factory for creating stat logger publishers. Required by vLLM."""
-    def __init__(self, component: Component) -> None:
-        self.component = component
-    def create_stat_logger(self, dp_rank: int) -> StatLoggerBase:
-        return DynamoStatLoggerPublisher(self.component, dp_rank)
-    def __call__(self, vllm_config: VllmConfig, dp_rank: int) -> StatLoggerBase:
-        return self.create_stat_logger(dp_rank=dp_rank)
-class RequestHandler:
-    """
-    Request handler for the generate and clear_kv_blocks endpoints.
-    """
-    def __init__(self, component, engine, default_sampling_params):
-        self.component = component
-        self.engine_client = engine
-        self.default_sampling_params = default_sampling_params
-    async def clear_kv_blocks(self, request=None):
-        try:
-            await self.engine_client.reset_prefix_cache()
-            yield {"status": "success", "message": "KV cache cleared"}
-        except Exception as e:
-            yield {"status": "error", "message": str(e)}
-    async def generate(self, request):
-        request_id = str(uuid.uuid4().hex)
-        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
-        sampling_params = SamplingParams(**self.default_sampling_params)
-        for key, value in request["sampling_options"].items():
-            if not value:
-                continue
-            if hasattr(sampling_params, key):
-                setattr(sampling_params, key, value)
-        max_tokens = request["stop_conditions"]["max_tokens"]
-        if max_tokens:
-            sampling_params.max_tokens = max_tokens
-        num_output_tokens_so_far = 0
-        gen = self.engine_client.generate(prompt, sampling_params, request_id)
-        async for res in gen:
-            # res is vllm's RequestOutput
-            # This is the expected way for a request to end.
-            # The new token ID will be eos, don't forward it.
-            if res.finished:
-                yield {"finish_reason": "stop", "token_ids": []}
-                break
-            if not res.outputs:
-                yield {"finish_reason": "error", "token_ids": []}
-                break
-            output = res.outputs[0]
-            next_total_toks = len(output.token_ids)
-            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
-            if output.finish_reason:
-                out["finish_reason"] = output.finish_reason
-            if output.stop_reason:
-                out["stop_reason"] = output.stop_reason
-            yield out
-            num_output_tokens_so_far = next_total_toks
-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
-    await init(runtime, cmd_line_args())
-async def init(runtime: DistributedRuntime, config: Config):
-    """
-    Instantiate and serve
-    """
-    component = runtime.namespace(config.namespace).component(config.component)
-    await component.create_service()
-    generate_endpoint = component.endpoint(config.endpoint)
-    clear_endpoint = component.endpoint("clear_kv_blocks")
-    await register_llm(
-        ModelType.Backend,
-        generate_endpoint,
-        config.model_path,
-        config.model_name,
-        kv_cache_block_size=config.kv_block_size,
-        migration_limit=config.migration_limit,
-    )
-    arg_map = {
-        "model": config.model_path,
-        "task": "generate",
-        "tensor_parallel_size": config.tensor_parallel_size,
-        "skip_tokenizer_init": True,
-        "disable_log_requests": True,
-        "enable_prefix_caching": True,
-        # KV routing relies on logging KV metrics
-        "disable_log_stats": False,
-        "kv_events_config": KVEventsConfig(
-            enable_kv_cache_events=True, publisher="zmq"
-        ),
-    }
-    if config.context_length:
-        # Usually we want it to default to the max (from tokenizer_config.json)
-        arg_map["max_model_len"] = config.context_length
-    if config.kv_block_size > 0:
-        arg_map["block_size"] = config.kv_block_size
-    if config.extra_engine_args != "":
-        json_map = {}
-        # extra_engine_args is a filename
-        try:
-            with open(config.extra_engine_args) as f:
-                json_map = json.load(f)
-        except FileNotFoundError:
-            logging.error(f"File {config.extra_engine_args} not found.")
-        except json.JSONDecodeError as e:
-            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
-        logging.debug(f"Adding extra engine arguments: {json_map}")
-        arg_map = {**arg_map, **json_map}  # json_map gets precedence
-    logger.info(f"VLLM config: {arg_map}")
-    os.environ["VLLM_NO_USAGE_STATS"] = "1"  # Avoid internal HTTP requests
-    os.environ[
-        "VLLM_WORKER_MULTIPROC_METHOD"
-    ] = "spawn"  # Ensure our publisher makes it to the new process
-    engine_args = AsyncEngineArgs(**arg_map)
-    model_config = engine_args.create_model_config()
-    # Load default sampling params from `generation_config.json`
-    default_sampling_params = model_config.get_diff_sampling_param()
-    # Taken from build_async_engine_client_from_engine_args()
-    usage_context = UsageContext.OPENAI_API_SERVER
-    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
-    # Explicitly pass our custom stat logger for metrics
-    engine_client = AsyncLLM.from_vllm_config(
-        vllm_config=vllm_config,
-        usage_context=usage_context,
-        stat_loggers=[StatLoggerFactory(component)],
-        disable_log_requests=engine_args.disable_log_requests,
-        disable_log_stats=engine_args.disable_log_stats,
-    )
-    logger.info("VllmWorker has been initialized")
-    zmq_config = ZmqKvEventPublisherConfig(
-        worker_id=generate_endpoint.lease_id(), kv_block_size=engine_args.block_size
-    )
-    _ = ZmqKvEventPublisher(component=component, config=zmq_config)
-    handler = RequestHandler(component, engine_client, default_sampling_params)
-    try:
-        await asyncio.gather(
-            generate_endpoint.serve_endpoint(handler.generate),
-            clear_endpoint.serve_endpoint(handler.clear_kv_blocks),
-        )
-    except Exception as e:
-        logger.error(f"Failed to serve endpoints: {e}")
-        raise
-def cmd_line_args():
-    parser = argparse.ArgumentParser(
-        description="vLLM server integrated with Dynamo LLM."
-    )
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default=DEFAULT_ENDPOINT,
-        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
-    )
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="",
-        help="Name to serve the model under. Defaults to deriving it from model path.",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
-    )
-    parser.add_argument(
-        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
-    )
-    parser.add_argument(
-        "--context-length",
-        type=int,
-        default=None,
-        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
-    )
-    parser.add_argument(
-        "--migration-limit",
-        type=int,
-        default=0,
-        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
-    )
-    parser.add_argument(
-        "--extra-engine-args",
-        type=str,
-        default="",
-        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
-    )
-    args = parser.parse_args()
-    config = Config()
-    config.model_path = args.model_path
-    if args.model_name:
-        config.model_name = args.model_name
-    else:
-        # This becomes an `Option` on the Rust side
-        config.model_name = None
-    endpoint_str = args.endpoint.replace("dyn://", "", 1)
-    endpoint_parts = endpoint_str.split(".")
-    if len(endpoint_parts) != 3:
-        logging.error(
-            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
-        )
-        sys.exit(1)
-    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
-    config.namespace = parsed_namespace
-    config.component = parsed_component_name
-    config.endpoint = parsed_endpoint_name
-    config.tensor_parallel_size = args.tensor_parallel_size
-    config.kv_block_size = args.kv_block_size
-    config.context_length = args.context_length
-    config.migration_limit = args.migration_limit
-    config.extra_engine_args = args.extra_engine_args
-    return config
-if __name__ == "__main__":
-    uvloop.install()
-    asyncio.run(worker())
--- a/lib/bindings/python/examples/cli/cli.py
+++ b/lib/bindings/python/examples/cli/cli.py
@@ -22,7 +22,6 @@
 import argparse
 import asyncio
-import signal
 import sys
 from pathlib import Path
@@ -31,9 +30,6 @@ import uvloop
 from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
 from dynamo.runtime import DistributedRuntime
-subprocess_ref = None  # Global process reference for cleanup
-subprocess_task = None  # Global async task reference for cleanup
 def parse_args():
    in_mode = "text"
@@ -90,13 +86,6 @@ def parse_args():
    # http_port: Option<u16>
    parser.add_argument("--http-port", type=int, help="HTTP port for the engine (u16).")
-    # TODO: Not yet used here
-    parser.add_argument(
-        "--tensor-parallel-size",
-        type=int,
-        help="Tensor parallel size for the model (e.g., 4).",
-    )
    # Add the positional model argument.
    # It's made optional (nargs='?') because its requirement depends on 'out_mode',
    # which is handled in post-parsing validation.
@@ -131,39 +120,8 @@ def parse_args():
    return parsed_args
-async def cleanup_subprocess_async():
-    """Clean up the sglang/vllm/trtllm subprocess if it exists."""
-    global subprocess_ref
-    if subprocess_ref and subprocess_ref.returncode is None:
-        subprocess_ref.terminate()
-        try:
-            await asyncio.wait_for(subprocess_ref.wait(), timeout=2)
-        except asyncio.TimeoutError:
-            subprocess_ref.kill()
-            await subprocess_ref.wait()
-        # Only cleanup once
-        subprocess_ref = None
-def signal_handler():
-    """Handle signals in async context by cleaning up subprocess and exiting."""
-    asyncio.create_task(cleanup_subprocess_async())
-    sys.exit(0)
 async def run():
-    global subprocess_ref
-    global subprocess_task
-    # Register signal handlers
    loop = asyncio.get_running_loop()
-    loop.add_signal_handler(signal.SIGINT, signal_handler)  # Ctrl-C
-    loop.add_signal_handler(signal.SIGTERM, signal_handler)  # kill
-    # If we find cases where subprocess does not stop we may need this. Seem OK so far.
-    # atexit.register(cleanup_subprocess)
    runtime = DistributedRuntime(loop, False)
    args = parse_args()
@@ -174,57 +132,6 @@ async def run():
    }
    out_mode = args["out_mode"]
-    # Handle subprocess execution for sglang and vllm
-    if out_mode in ["sglang", "vllm", "trtllm"]:
-        # Determine which script to run
-        script_name = f"{out_mode}_inc.py"
-        script_path = Path(__file__).parent / script_name
-        if not script_path.exists():
-            print(f"Error: Script '{script_path}' not found")
-            sys.exit(1)
-        # Build command with all relevant arguments
-        cmd = [sys.executable, str(script_path)]
-        # Add arguments if they exist
-        if args["model_path"]:
-            cmd.extend(["--model-path", args["model_path"]])
-        flags = args["flags"]
-        if flags.model_name:
-            cmd.extend(["--model-name", flags.model_name])
-        if flags.context_length:
-            cmd.extend(["--context-length", str(flags.context_length)])
-        if flags.kv_cache_block_size:
-            cmd.extend(["--kv-cache-block-size", str(flags.kv_cache_block_size)])
-        # Start subprocess in background and stream output
-        print(f"Starting {out_mode} subprocess: {' '.join(cmd)}")
-        async def stream_subprocess_output():
-            global subprocess_ref
-            subprocess_ref = await asyncio.create_subprocess_exec(
-                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
-            )
-            try:
-                if subprocess_ref.stdout is not None:
-                    async for line in subprocess_ref.stdout:
-                        print(f"Engine: {line.decode().rstrip()}")
-                    await subprocess_ref.wait()
-            except asyncio.CancelledError:
-                # Task was cancelled, terminate the subprocess
-                await cleanup_subprocess_async()
-                raise
-        task = asyncio.create_task(stream_subprocess_output())
-        # Store the task reference for potential cleanup
-        subprocess_task = task
-        # Set out_mode to "dyn" because we talk to the subprocess over NATS
-        out_mode = "dyn"
    engine_type = engine_type_map.get(out_mode)
    if engine_type is None:
        print(f"Unsupported output type: {out_mode}")
@@ -249,19 +156,7 @@ async def run():
    e = EntrypointArgs(engine_type, **entrypoint_kwargs)
    engine = await make_engine(runtime, e)
-    try:
+    await run_input(runtime, args["in_mode"], engine)
-        await run_input(runtime, args["in_mode"], engine)
-    finally:
-        # Clean up subprocess when main execution finishes
-        await cleanup_subprocess_async()
-        # Cancel the subprocess task if it exists
-        if subprocess_task:
-            subprocess_task.cancel()
-            try:
-                await subprocess_task
-            except asyncio.CancelledError:
-                pass
 if __name__ == "__main__":

--- a/lib/bindings/python/examples/cli/sglang_inc.py
+++ b/lib/bindings/python/examples/cli/sglang_inc.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# `dynamo-run out=sglang` runs this script
-# Can also be used standalone: `python3 sglang_inc.py` - lots of optional cmd line params
-import argparse
-import asyncio
-import json
-import logging
-import sys
-from typing import Optional
-import sglang
-import uvloop
-from sglang.srt.entrypoints.engine import EmbeddingReqInput
-from sglang.srt.server_args import ServerArgs
-from dynamo.llm import ModelType, register_llm
-from dynamo.runtime import DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-# Only used if you run it manually from the command line
-DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-configure_dynamo_logging()
-class Config:
-    """Command line parameters or defaults"""
-    namespace: str
-    component: str
-    endpoint: str
-    model_path: str
-    model_name: Optional[str]
-    base_gpu_id: int
-    tensor_parallel_size: int
-    kv_block_size: int
-    context_length: int
-    nnodes: int
-    node_rank: int
-    dist_init_addr: str
-    extra_engine_args: str
-class RequestHandler:
-    """
-    Request handler for the generate endpoint
-    """
-    def __init__(self, engine):
-        self.engine_client = engine
-    async def generate(self, request):
-        sampling_params = {}
-        if request["sampling_options"]["temperature"] is not None:
-            sampling_params["temperature"] = request["sampling_options"]["temperature"]
-        # sglang defaults this to 128
-        sampling_params["max_new_tokens"] = request["stop_conditions"]["max_tokens"]
-        # Check if this is a batch request
-        is_batch = "batch_token_ids" in request and request["batch_token_ids"]
-        if is_batch:
-            # Track tokens separately for each batch item
-            num_output_tokens_so_far = {}
-            gen = await self.engine_client.async_generate(
-                input_ids=request["batch_token_ids"],
-                sampling_params=sampling_params,
-                stream=True,
-            )
-        else:
-            num_output_tokens_so_far = 0
-            gen = await self.engine_client.async_generate(
-                input_ids=request["token_ids"],
-                sampling_params=sampling_params,
-                stream=True,
-            )
-        async for res in gen:
-            # res is a dict
-            finish_reason = res["meta_info"]["finish_reason"]
-            if is_batch:
-                # Handle batch response - get index from SGLang response
-                index = res.get("index", 0)
-                if index not in num_output_tokens_so_far:
-                    num_output_tokens_so_far[index] = 0
-                if finish_reason:
-                    logging.warning(f"finish_reason: {finish_reason}")
-                    # Final response for this batch item
-                    out = {
-                        "token_ids": [],
-                        "finish_reason": finish_reason["type"],
-                        "index": index,
-                    }
-                else:
-                    # Streaming response for this batch item
-                    next_total_toks = len(res["output_ids"])
-                    new_tokens = res["output_ids"][num_output_tokens_so_far[index] :]
-                    out = {
-                        "token_ids": new_tokens,
-                        "index": index,
-                    }
-                    num_output_tokens_so_far[index] = next_total_toks
-            else:
-                if finish_reason:
-                    out = {
-                        "token_ids": [],
-                        "finish_reason": finish_reason["type"],
-                    }
-                else:
-                    next_total_toks = len(res["output_ids"])
-                    new_tokens = res["output_ids"][num_output_tokens_so_far:]
-                    out = {
-                        "token_ids": new_tokens,
-                    }
-                    num_output_tokens_so_far = next_total_toks
-            yield out
-    async def encode(self, request):
-        obj = EmbeddingReqInput(input_ids=request["token_ids"])
-        generator = self.engine_client.tokenizer_manager.generate_request(obj, None)
-        engine_results = await anext(generator)
-        tokens = 0
-        embeddings = []
-        for result in engine_results:
-            embeddings.append(result["embedding"])
-            tokens += result["meta_info"]["prompt_tokens"]
-        out = {
-            "embeddings": embeddings,
-            "prompt_tokens": tokens,
-            "total_tokens": tokens,
-        }
-        yield out
-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
-    await init(runtime, cmd_line_args())
-async def init(runtime: DistributedRuntime, config: Config):
-    """
-    Instantiate and serve
-    """
-    arg_map = {
-        "model_path": config.model_path,
-        "skip_tokenizer_init": True,
-        "tp_size": config.tensor_parallel_size,
-        "base_gpu_id": config.base_gpu_id,
-    }
-    if config.kv_block_size:
-        arg_map["page_size"] = config.kv_block_size
-    if config.context_length:
-        arg_map["context_length"] = config.context_length
-    if config.dist_init_addr != "":
-        arg_map["trust_remote_code"] = True
-        arg_map["nnodes"] = config.nnodes
-        arg_map["dist_init_addr"] = config.dist_init_addr
-        # In practice this is always 0 because Dynamo only manages the leader
-        arg_map["node_rank"] = config.node_rank
-    if config.extra_engine_args != "":
-        json_map = {}
-        # extra_engine_args is a filename
-        try:
-            with open(config.extra_engine_args) as f:
-                json_map = json.load(f)
-        except FileNotFoundError:
-            logging.error(f"File {config.extra_engine_args} not found.")
-        except json.JSONDecodeError as e:
-            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
-        logging.debug(f"Adding extra engine arguments: {json_map}")
-        arg_map = {**arg_map, **json_map}  # json_map gets precedence
-    # TODO fetch default SamplingParams from generation_config.json
-    engine_args = ServerArgs(**arg_map)
-    engine_client = sglang.Engine(server_args=engine_args)
-    component = runtime.namespace(config.namespace).component(config.component)
-    await component.create_service()
-    endpoint = component.endpoint(config.endpoint)
-    model_type = (
-        ModelType.Backend if not engine_args.is_embedding else ModelType.Embedding
-    )
-    await register_llm(model_type, endpoint, config.model_path, config.model_name)
-    # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
-    # after the lease is revoked
-    handler = RequestHandler(engine_client)
-    if engine_args.is_embedding:
-        await endpoint.serve_endpoint(handler.encode)
-    else:
-        await endpoint.serve_endpoint(handler.generate)
-def cmd_line_args():
-    parser = argparse.ArgumentParser(
-        description="SGLang server integrated with Dynamo LLM."
-    )
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default=DEFAULT_ENDPOINT,
-        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
-    )
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="",
-        help="Name to serve the model under. Defaults to deriving it from model path.",
-    )
-    parser.add_argument(
-        "--base-gpu-id",
-        type=int,
-        default=0,
-        help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
-    )
-    parser.add_argument(
-        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
-    )
-    parser.add_argument(
-        "--context-length",
-        type=int,
-        default=None,
-        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
-    )
-    parser.add_argument(
-        "--nnodes", type=int, default=1, help="The number of machines SGLang will use"
-    )
-    parser.add_argument(
-        "--node-rank",
-        type=int,
-        default=0,
-        help="Unique number for each node. 0 for the leader.",
-    )
-    parser.add_argument(
-        "--dist-init-addr",
-        type=str,
-        default="",
-        help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
-    )
-    parser.add_argument(
-        "--extra-engine-args",
-        type=str,
-        default="",
-        help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
-    )
-    args = parser.parse_args()
-    config = Config()
-    config.model_path = args.model_path
-    if args.model_name:
-        config.model_name = args.model_name
-    else:
-        # This becomes an `Option` on the Rust side
-        config.model_name = None
-    endpoint_str = args.endpoint.replace("dyn://", "", 1)
-    endpoint_parts = endpoint_str.split(".")
-    if len(endpoint_parts) != 3:
-        logging.error(
-            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
-        )
-        sys.exit(1)
-    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
-    config.namespace = parsed_namespace
-    config.component = parsed_component_name
-    config.endpoint = parsed_endpoint_name
-    config.base_gpu_id = args.base_gpu_id
-    config.tensor_parallel_size = args.tensor_parallel_size
-    config.kv_block_size = args.kv_block_size
-    config.context_length = args.context_length
-    config.nnodes = args.nnodes
-    config.node_rank = args.node_rank
-    config.dist_init_addr = args.dist_init_addr
-    config.extra_engine_args = args.extra_engine_args
-    return config
-if __name__ == "__main__":
-    uvloop.install()
-    asyncio.run(worker())
--- a/lib/bindings/python/examples/cli/trtllm_inc.py
+++ b/lib/bindings/python/examples/cli/trtllm_inc.py
--- a/lib/bindings/python/examples/cli/vllm_inc.py
+++ b/lib/bindings/python/examples/cli/vllm_inc.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-# `dynamo-run out=vllm` runs this script
-# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
-# Setup checklist:
-# - We are in a virtualenv with vllm installed - and patched if using kv routing.
-# - `libdynamo_llm_capi.so` is in system lib path or it's containing folder is in LD_LIBRARY_PATH
-#   It builds in target/debug/ by default.
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import uuid
-from typing import Optional
-import uvloop
-from vllm import SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TokensPrompt
-from dynamo.llm import (
-    ForwardPassMetrics,
-    KvStats,
-    ModelType,
-    WorkerMetricsPublisher,
-    WorkerStats,
-    register_llm,
-)
-from dynamo.runtime import DistributedRuntime, dynamo_worker
-from dynamo.runtime.logging import configure_dynamo_logging
-# Only used if you run it manually from the command line
-DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-configure_dynamo_logging()
-class Config:
-    """Command line parameters or defaults"""
-    namespace: str
-    component: str
-    endpoint: str
-    model_path: str
-    model_name: Optional[str]
-    tensor_parallel_size: int
-    kv_block_size: int
-    context_length: int
-    extra_engine_args: str
-class RequestHandler:
-    """
-    Request handler for the generate endpoint
-    """
-    def __init__(self, component, engine, default_sampling_params):
-        self.component = component
-        self.engine_client = engine
-        self.default_sampling_params = default_sampling_params
-        self.metrics_publisher = WorkerMetricsPublisher()
-    def setup_kv_metrics(self):
-        if not hasattr(self.engine_client, "set_metrics_publisher"):
-            logging.debug("VLLM version does not support KV metrics")
-            return
-        self.engine_client.set_metrics_publisher(self.metrics_publisher)
-        # Initially send dummy metrics to kick start,
-        # vLLM will not update stat until forward pass is triggered
-        # Create the structured metrics objects
-        worker_stats = WorkerStats(
-            request_active_slots=0,
-            request_total_slots=1024,
-            num_requests_waiting=0,
-            data_parallel_rank=None,
-        )
-        kv_stats = KvStats(
-            kv_active_blocks=0,
-            kv_total_blocks=1024,
-            gpu_cache_usage_perc=0.0,
-            gpu_prefix_cache_hit_rate=0.0,
-        )
-        metrics = ForwardPassMetrics(
-            worker_stats=worker_stats, kv_stats=kv_stats, spec_decode_stats=None
-        )
-        # Publish the metrics as a single object
-        self.metrics_publisher.publish(metrics)
-        task = asyncio.create_task(self.create_metrics_publisher_endpoint())
-        task.add_done_callback(
-            lambda _: logging.debug("metrics publisher endpoint created")
-        )
-    async def create_metrics_publisher_endpoint(self):
-        logging.debug("Creating metrics publisher endpoint")
-        await self.metrics_publisher.create_endpoint(self.component)
-    async def generate(self, request):
-        # logging.debug(f"Received request: {request}")
-        request_id = str(uuid.uuid4().hex)
-        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
-        sampling_params = SamplingParams(**self.default_sampling_params)
-        for key, value in request["sampling_options"].items():
-            if not value:
-                continue
-            if hasattr(sampling_params, key):
-                setattr(sampling_params, key, value)
-        max_tokens = request["stop_conditions"]["max_tokens"]
-        if max_tokens:
-            sampling_params.max_tokens = max_tokens
-        num_output_tokens_so_far = 0
-        gen = self.engine_client.generate(prompt, sampling_params, request_id)
-        async for res in gen:
-            # res is vllm's RequestOutput
-            # This is the expected way for a request to end.
-            # The new token ID will be eos, don't forward it.
-            if res.finished:
-                yield {"finish_reason": "stop", "token_ids": []}
-                break
-            if not res.outputs:
-                yield {"finish_reason": "error", "token_ids": []}
-                break
-            output = res.outputs[0]
-            next_total_toks = len(output.token_ids)
-            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
-            if output.finish_reason:
-                out["finish_reason"] = output.finish_reason
-            if output.stop_reason:
-                out["stop_reason"] = output.stop_reason
-            yield out
-            num_output_tokens_so_far = next_total_toks
-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
-    await init(runtime, cmd_line_args())
-def _check_and_set_env_value(key, expected, allow_override=False):
-    if not allow_override and key in os.environ and os.environ[key] != expected:
-        raise ValueError(
-            f"{key} is set and doesn't equal expected {expected}. Please unset variable before launch."
-        )
-    os.environ.setdefault(key, expected)
-async def init(runtime: DistributedRuntime, config: Config):
-    """
-    Instantiate and serve
-    """
-    arg_map = {
-        "model": config.model_path,
-        "task": "generate",
-        "tensor_parallel_size": config.tensor_parallel_size,
-        "skip_tokenizer_init": True,
-        "disable_log_requests": True,
-        "enable_prefix_caching": True,
-        # KV routing relies on logging KV metrics
-        "disable_log_stats": False,
-    }
-    assert config.kv_block_size > 0, "Must use non-negative integer for KV Block Size"
-    arg_map["block_size"] = config.kv_block_size
-    if config.context_length:
-        # Usually we want it to default to the max (from tokenizer_config.json)
-        arg_map["max_model_len"] = config.context_length
-    if config.extra_engine_args != "":
-        json_map = {}
-        # extra_engine_args is a filename
-        try:
-            with open(config.extra_engine_args) as f:
-                json_map = json.load(f)
-        except FileNotFoundError:
-            logging.error(f"File {config.extra_engine_args} not found.")
-        except json.JSONDecodeError as e:
-            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
-        logging.debug(f"Adding extra engine arguments: {json_map}")
-        arg_map = {**arg_map, **json_map}  # json_map gets precedence
-    # Patch won't start KVCacheEventManager unless these four are set
-    component = runtime.namespace(config.namespace).component(config.component)
-    await component.create_service()
-    endpoint = component.endpoint(config.endpoint)
-    _check_and_set_env_value("VLLM_WORKER_ID", str(endpoint.lease_id()))
-    _check_and_set_env_value(
-        "VLLM_KV_CAPI_PATH", "libdynamo_llm_capi.so", allow_override=True
-    )
-    _check_and_set_env_value("VLLM_KV_NAMESPACE", config.namespace)
-    _check_and_set_env_value("VLLM_KV_COMPONENT", config.component)
-    _check_and_set_env_value(
-        "VLLM_NO_USAGE_STATS", "1", allow_override=True
-    )  # Avoid internal HTTP requests
-    engine_args = AsyncEngineArgs(**arg_map)
-    model_config = engine_args.create_model_config()
-    # Load default sampling params from `generation_config.json`
-    default_sampling_params = model_config.get_diff_sampling_param()
-    engine_context = build_async_engine_client_from_engine_args(engine_args)
-    engine_client = await engine_context.__aenter__()
-    await register_llm(
-        ModelType.Backend,
-        endpoint,
-        config.model_path,
-        config.model_name,
-        context_length=arg_map.get(
-            "max_model_len", None
-        ),  # if None, takes length from tokenizer
-        kv_cache_block_size=arg_map["block_size"],
-    )
-    handler = RequestHandler(component, engine_client, default_sampling_params)
-    handler.setup_kv_metrics()
-    # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
-    # after the lease is revoked
-    await endpoint.serve_endpoint(handler.generate)
-def cmd_line_args():
-    parser = argparse.ArgumentParser(
-        description="vLLM server integrated with Dynamo LLM."
-    )
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default=DEFAULT_ENDPOINT,
-        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
-    )
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL,
-        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="",
-        help="Name to serve the model under. Defaults to deriving it from model path.",
-    )
-    parser.add_argument(
-        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
-    )
-    parser.add_argument(
-        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
-    )
-    parser.add_argument(
-        "--context-length",
-        type=int,
-        default=None,
-        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
-    )
-    parser.add_argument(
-        "--extra-engine-args",
-        type=str,
-        default="",
-        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
-    )
-    args = parser.parse_args()
-    config = Config()
-    config.model_path = args.model_path
-    if args.model_name:
-        config.model_name = args.model_name
-    else:
-        # This becomes an `Option` on the Rust side
-        config.model_name = None
-    endpoint_str = args.endpoint.replace("dyn://", "", 1)
-    endpoint_parts = endpoint_str.split(".")
-    if len(endpoint_parts) != 3:
-        logging.error(
-            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
-        )
-        sys.exit(1)
-    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
-    config.namespace = parsed_namespace
-    config.component = parsed_component_name
-    config.endpoint = parsed_endpoint_name
-    config.tensor_parallel_size = args.tensor_parallel_size
-    config.kv_block_size = args.kv_block_size
-    config.context_length = args.context_length
-    config.extra_engine_args = args.extra_engine_args
-    return config
-if __name__ == "__main__":
-    uvloop.install()
-    asyncio.run(worker())
--- a/lib/llm/src/entrypoint/input/endpoint.rs
+++ b/lib/llm/src/entrypoint/input/endpoint.rs
@@ -80,10 +80,7 @@ pub async fn run(
            (Box::pin(fut), Some(model.card().clone()))
        }
        EngineConfig::Dynamic(_) => {
-            // We can only get here for in=dyn out=vllm|sglang`, because vllm and sglang are a
+            unreachable!("An endpoint input will never have a Dynamic engine");
-            // subprocess that we talk to like a remote endpoint.
-            // That means the vllm/sglang subprocess is doing all the work, we are idle.
-            (never_ready(), None)
        }
    };
@@ -107,7 +104,3 @@ pub async fn run(
    Ok(())
 }
-fn never_ready() -> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send + 'static>> {
-    Box::pin(std::future::pending::<anyhow::Result<()>>())
-}