Unverified Commit 42969800 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated the docs on running vllm and sglang in multi-GPU (tensor parallel) and multi-node (pipeline parallel) configurations.
parent 5d89a0c8
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use std::sync::Arc;
use dynamo_llm::backend::ExecutionContext;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use pyo3::prelude::*;
mod worker;
mod engine;
use engine::SgLangEngine;
mod subprocess;
pub use subprocess::run_subprocess;
/// Build the sglang execution engine and hand back both the engine and the
/// join handle of the sglang worker it spawned.
///
/// The `ExecutionContext` is the engine behind an `Arc`, ready to plug into
/// the pipeline; the `JoinHandle` resolves when the sglang worker exits.
pub async fn make_engine(
    cancel_token: CancellationToken,
    // Full path to the model directory
    model_path: &Path,
    // Unique string to name zmq sockets
    sock_code: &str,
    // Multi node settings
    node_conf: dynamo_llm::engines::MultiNodeConfig,
    // How many GPUs to use
    tensor_parallel_size: u32,
    // The base GPU ID to start allocating GPUs from
    base_gpu_id: u32,
    // Extra arguments to pass directly as sglang ServerArgs
    extra_engine_args: Option<PathBuf>,
) -> pipeline_error::Result<(ExecutionContext, tokio::task::JoinHandle<()>)> {
    let mut sglang = SgLangEngine::new(
        cancel_token,
        sock_code,
        model_path,
        node_conf,
        tensor_parallel_size,
        base_gpu_id,
        extra_engine_args,
    )
    .await?;
    // Detach the worker handle before freezing the engine behind an Arc;
    // take_sglang_worker_handle needs &mut self.
    let worker_handle = sglang.take_sglang_worker_handle();
    let execution_ctx: ExecutionContext = Arc::new(sglang);
    Ok((execution_ctx, worker_handle))
}
/// Per-process GPU / tensor-parallel placement settings.
#[derive(Debug, Clone, Copy)]
pub struct MultiGPUConfig {
    /// How many GPUs we are using / how many processes
    pub tp_size: u32,
    /// Tensor Parallel Rank. Must be unique across all nodes and GPUs.
    pub tp_rank: u32,
    /// GPU ID. Which GPU to run on. In single-node setup this is the same as tp_rank.
    pub gpu_id: u32,
}

impl Default for MultiGPUConfig {
    /// Single-process defaults: one tensor-parallel shard, rank 0, GPU 0.
    fn default() -> Self {
        Self {
            tp_size: 1,
            tp_rank: 0,
            gpu_id: 0,
        }
    }
}
/// On macOS the embedded interpreter does not pick up the active virtualenv,
/// so add the venv's site-packages directory to Python's `sys.path`.
/// No-op on other platforms (see the non-macos variant below).
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert ahead of the system site-packages so the venv's packages win
    // when both provide the same module (resolves the previous TODO, which
    // noted that `append` put the venv last).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
#
import json
import logging
import tempfile
from multiprocessing.connection import Connection
from sglang.srt.entrypoints.engine import _set_envs_and_config
from sglang.srt.managers.scheduler import run_scheduler_process
from sglang.srt.server_args import PortArgs, ServerArgs
# Verbose logging: this script runs inside the sglang worker subprocess, so
# its log output is the main debugging signal the parent process sees.
logging.basicConfig(
    level="DEBUG",
    force=True,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="[%(asctime)s] %(message)s",
)

# ServerArgs assembled from globals injected by the Rust caller
# (model_path, tp_size_str, ... — see subprocess.rs).
# These can all be overridden by --extra-engine-args json file
arg_map = {
    "model_path": f"{model_path}",
    "enable_metrics": False,
    "log_level": "debug",
    "log_requests": True,
    "tp_size": int(tp_size_str),
    # Multi-node
    "dist_init_addr": dist_init_addr if dist_init_addr != "" else None,
    "nnodes": int(nnodes_str),
    "node_rank": int(node_rank_str),
}
json_map = {}
if extra_engine_args != "":
    # extra_engine_args is a filename; a missing file or bad JSON is logged
    # and otherwise ignored (best effort).
    try:
        with open(extra_engine_args) as f:
            json_map = json.load(f)
    except FileNotFoundError:
        logging.debug(f"File {extra_engine_args} not found.")
    except json.JSONDecodeError as e:
        logging.debug(f"Invalid JSON in {extra_engine_args}: {e}")
    logging.debug(f"Adding extra engine arguments: {json_map}")
arg_map = {**arg_map, **json_map}  # json_map gets precedence

server_args = ServerArgs(**arg_map)
_set_envs_and_config(server_args)
logging.debug(server_args)

ipc_path = f"ipc:///tmp/{socket_id}"
# These must match worker.rs zmq_sockets, which is the other side
port_args = PortArgs(
    # we don't use this one so use anything
    tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
    # Us -> sglang
    scheduler_input_ipc_name=f"{ipc_path}_input_socket",
    # sglang -> us
    detokenizer_ipc_name=f"{ipc_path}_output_socket",
    # The port for nccl initialization (torch.dist), which we don't use
    nccl_port=9876,
)

# Rank must be globally unique across nodes
tp_rank = int(tp_rank_str)
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
# In a single-node setup this is the same as rank
gpu_id = int(gpu_id_str)

# The Rust parent passed us the write end of a pipe; the scheduler signals
# readiness through it. This call blocks for the life of the engine.
pipe_fd_int = int(pipe_fd)
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use pyo3::{types::IntoPyDict, Python};
use std::{
env,
ffi::CString,
os::fd::RawFd,
path::{Path, PathBuf},
};
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
/// Start the Python sglang engine that listens on zmq socket
/// This is called by running `nio --internal-sglang-process`
/// This does not return until the subprocess exits.
pub fn run_subprocess(
    // The prefix to put on the zmq socket names
    socket_id: &str,
    // Directory containing an HF repo with safetensors files, tokenizer, etc
    model_path: &Path,
    // The write half of a pipe, where sglang will signal when it's ready
    notify_pipe_fd: RawFd,
    // Multi node. Usually Default::default
    node_config: MultiNodeConfig,
    // Multi GPU. Usually Default::default
    gpu_config: super::MultiGPUConfig,
    // Allow passing any arguments to sglang
    extra_engine_args: Option<PathBuf>,
) -> anyhow::Result<()> {
    pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
    // fix_venv works around the embedded interpreter missing the active
    // virtualenv; it is a no-op except on macOS.
    if let Ok(venv) = env::var("VIRTUAL_ENV") {
        let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
    }
    let dir = model_path.display().to_string();
    let extra_engine_args_str = &extra_engine_args
        .map(|p| p.display().to_string())
        .unwrap_or_default();
    Python::with_gil(|py| {
        // These names become Python globals read by sglang_inc.py.
        let locals = [
            ("socket_id", socket_id),
            ("model_path", dir.as_str()),
            ("pipe_fd", &notify_pipe_fd.to_string()),
            // to_string because slice must all be the same type
            ("tp_size_str", &gpu_config.tp_size.to_string()),
            ("tp_rank_str", &gpu_config.tp_rank.to_string()),
            ("gpu_id_str", &gpu_config.gpu_id.to_string()),
            ("nnodes_str", &node_config.num_nodes.to_string()),
            ("node_rank_str", &node_config.node_rank.to_string()),
            ("dist_init_addr", &node_config.leader_addr),
            ("extra_engine_args", extra_engine_args_str),
        ]
        .into_py_dict(py)
        .unwrap();
        // py.run blocks until the embedded script (and thus sglang) finishes.
        if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
            anyhow::bail!("sglang engine run error: {err}");
        }
        tracing::info!("sglang subprocess exit");
        Ok(())
    })
}
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the vllm 0.7 sub-process engine crate.
[package]
name = "dynamo-engine-vllm0_7"
# Shared metadata is inherited from the workspace root manifest.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[dependencies]
# Versions managed by the workspace root
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
# Crate-local version pins
async-openai = "0.27.2"
# Embedded Python interpreter hosting the vllm engine
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
regex = "1"
serde-pickle = "1.2.0"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::kv_router::publisher::KvMetricsPublisher;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::worker;
/// Engine facade that forwards generation requests to the vllm worker
/// started by `worker::start` (see worker.rs).
pub struct VllmEngine {
    // Used to end in-flight response streams when the runtime shuts down.
    cancel_token: CancellationToken,
    // Connection to the vllm worker; requests are enqueued on it.
    worker: worker::VllmWorker,
}
impl VllmEngine {
    /// Start the vllm worker and wrap it in a `VllmEngine`.
    ///
    /// Parameters mirror `worker::start`; see `make_leader_engine` for what
    /// each one means.
    pub async fn new(
        cancel_token: CancellationToken,
        sock_code: &str,
        model_path: &Path,
        node_conf: MultiNodeConfig,
        tensor_parallel_size: u32,
        extra_engine_args: Option<PathBuf>,
        kv_metrics_publisher: Option<Arc<KvMetricsPublisher>>,
    ) -> anyhow::Result<Self> {
        let worker = worker::start(
            cancel_token.clone(),
            sock_code,
            model_path,
            node_conf,
            tensor_parallel_size,
            extra_engine_args,
            kv_metrics_publisher,
        )
        .await?;
        Ok(VllmEngine {
            cancel_token,
            worker,
        })
    }

    /// Take ownership of the join handle for the vllm worker task.
    /// The handle resolves when vllm exits.
    pub fn take_vllm_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
        self.worker.take_vllm_handle()
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for VllmEngine
{
    /// Enqueue `request` on the vllm worker and return a stream of its
    /// responses. The stream ends when the worker closes the response
    /// channel or the engine's cancel token fires.
    async fn generate(
        &self,
        request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        let (request, context) = request.into_parts();
        let ctx = context.context();
        let request_id = ctx.id().to_string();
        // Bounded channel (128) between the worker and this response stream.
        let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);
        let work_req = worker::WorkRequest {
            request_id: context.id().to_string(),
            request,
            response_channel: resp_tx,
        };
        self.worker.enqueue_request(work_req).await?;
        let cancel_token = self.cancel_token.clone();
        let output = stream! {
            loop {
                // Stop streaming as soon as the engine is cancelled, even if
                // the worker is still producing output.
                let maybe_resp = tokio::select!{
                    _ = cancel_token.cancelled() => {
                        break;
                    }
                    maybe_resp = resp_rx.recv() => {
                        maybe_resp
                    }
                };
                match maybe_resp {
                    Some(out) => {
                        yield out;
                    },
                    None => {
                        // Sender dropped: normal end of generation.
                        tracing::trace!(request_id, "generate: response channel closed");
                        break;
                    }
                }
            }
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::future::Future;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use pyo3::prelude::*;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::kv_router::publisher::KvMetricsPublisher;
mod engine;
use engine::VllmEngine;
mod ray;
use ray::Ray;
mod subprocess;
pub use subprocess::run_subprocess;
mod worker;
/// Start the vllm engine on the leader (or only) node.
///
/// For multi-node runs (`node_conf.num_nodes > 1`) this first starts a Ray
/// head node and blocks until all follower nodes have registered. Returns
/// the engine plus a tear-down future that completes once the vllm worker
/// has exited and, if Ray was started, `ray stop` has run.
pub async fn make_leader_engine(
    cancel_token: CancellationToken,
    // Full path to the model, either a GGUF file or an HF repo dir
    model_path: &Path,
    // Unique string to name zmq sockets
    sock_code: &str,
    // Multi node settings
    node_conf: MultiNodeConfig,
    // How many GPUs to use
    tensor_parallel_size: u32,
    // Path to extra engine args file
    extra_engine_args: Option<PathBuf>,
    // When using our vllm fork, this is how we publish its KV metrics for the KV router
    kv_metrics_publisher: Option<Arc<KvMetricsPublisher>>,
) -> pipeline_error::Result<(ExecutionContext, impl Future<Output = ()>)> {
    // Ray is only needed for multi-node (pipeline parallel) runs.
    let ray_obj = if node_conf.num_nodes > 1 {
        let r = ray::start_leader(node_conf.leader_addr.parse()?)?;
        tracing::info!("Leader waiting for {} total nodes", node_conf.num_nodes);
        r.wait_for(cancel_token.clone(), node_conf.num_nodes)
            .await?;
        tracing::info!("All nodes registered");
        Some(r)
    } else {
        None
    };
    let mut engine = VllmEngine::new(
        cancel_token,
        sock_code,
        model_path,
        node_conf,
        tensor_parallel_size,
        extra_engine_args,
        kv_metrics_publisher,
    )
    .await?;
    let vllm_process = engine.take_vllm_worker_handle();
    // Tear-down future: wait for the vllm worker, then stop Ray if we
    // started it. Errors are logged, not propagated — we're shutting down.
    let vllm_future = async move {
        if let Err(err) = vllm_process.await {
            tracing::error!("Failed stopping vllm process: {err:#}");
        }
        if let Some(r) = ray_obj {
            if let Err(err) = r.stop().await {
                tracing::error!("Failed stopping ray: {err:#}");
            }
        }
    };
    let engine: ExecutionContext = Arc::new(engine);
    Ok((engine, vllm_future))
}
/// Join a multi-node vllm run as a follower: start `ray` pointing at the
/// leader and wait until all `num_nodes` nodes have registered.
///
/// Returns a [`StopFuture`] that runs `ray stop` when awaited.
pub async fn start_follower(
    cancel_token: CancellationToken,
    node_conf: MultiNodeConfig,
) -> pipeline_error::Result<StopFuture> {
    let r = ray::start_follower(node_conf.leader_addr.parse()?)?;
    tracing::info!("Follower waiting for {} total nodes", node_conf.num_nodes);
    r.wait_for(cancel_token, node_conf.num_nodes).await?;
    tracing::info!("All nodes registered");
    Ok(StopFuture {
        state: Some(StopFutureState::New(r)),
    })
}
/// A future that runs `ray stop` the first time it is polled and resolves
/// when the stop command completes.
pub struct StopFuture {
    // None once the future has completed; polling again then returns Ready.
    state: Option<StopFutureState>,
}

// State machine behind StopFuture's lazy `ray stop`.
enum StopFutureState {
    // Not yet polled: still holding the Ray handle.
    New(Ray),
    // `ray stop` is in flight.
    Running(Pin<Box<dyn Future<Output = ()> + Send>>),
}
impl Future for StopFuture {
    type Output = ();
    /// First poll converts the held [`Ray`] handle into a `ray stop` future;
    /// subsequent polls drive that future to completion.
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        // Take the state out; it is put back below unless we're finished.
        let state = match self.state.take() {
            None => return Poll::Ready(()),
            Some(state) => state,
        };
        match state {
            StopFutureState::New(obj) => {
                // Convert object to a stop future. Errors are logged, not
                // surfaced — this runs during shutdown.
                let future = Box::pin(async move {
                    if let Err(err) = obj.stop().await {
                        tracing::error!("Failed calling 'ray stop': {err:#}");
                    }
                });
                self.state = Some(StopFutureState::Running(future));
                // Recurse to poll the new future immediately
                self.poll(cx)
            }
            StopFutureState::Running(mut future) => {
                // Poll the stop future
                match future.as_mut().poll(cx) {
                    Poll::Ready(()) => {
                        // Done, leave state as None
                        Poll::Ready(())
                    }
                    Poll::Pending => {
                        // Not ready yet, preserve the future
                        self.state = Some(StopFutureState::Running(future));
                        Poll::Pending
                    }
                }
            }
        }
    }
}
/// On macOS the embedded interpreter does not pick up the active virtualenv,
/// so add the venv's site-packages directory to Python's `sys.path`.
/// No-op on other platforms (see the non-macos variant below).
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert ahead of the system site-packages so the venv's packages win
    // when both provide the same module (resolves the previous TODO, which
    // noted that `append` put the venv last).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use regex::Regex;
use std::io::{BufRead, BufReader};
use std::net::SocketAddrV4;
use std::process::{Command, Stdio};
use std::time::Duration;
use thiserror::Error;
use tokio::io::AsyncBufReadExt;
use tokio::select;
use tokio::time;
use dynamo_runtime::CancellationToken;
/// Grace period (seconds) passed to `ray stop`.
/// Ray's default is 16 seconds, we make it a bit shorter.
const RAY_STOP_TIMEOUT_SECS: u32 = 10;
/// How long to wait (seconds) for all the nodes to start.
/// This is either done manually or through some orchestration system, so either way it
/// can take some time.
const RAY_WAIT_SECS: u32 = 60 * 5;
/// Errors produced while driving the external `ray` CLI.
#[derive(Debug, Error)]
pub enum RayError {
    /// Spawning or communicating with the `ray` binary failed at the OS level.
    #[error("Failed to execute Ray command: {0}")]
    CommandExecution(#[from] std::io::Error),
    /// `ray` ran but exited non-zero (-1 when no exit code was available).
    #[error("Ray command failed with exit code: {0}")]
    CommandFailed(i32),
    /// `ray status` output did not match the expected layout.
    #[error("Failed to parse Ray status output")]
    StatusParseError,
    /// Not all nodes became active within RAY_WAIT_SECS.
    #[error("Timeout waiting for nodes to become active")]
    WaitTimeout,
    /// The caller's CancellationToken fired while waiting.
    #[error("Operation cancelled")]
    Cancelled,
}
/// Snapshot of cluster membership parsed from `ray status` output.
#[derive(Debug, PartialEq)]
pub struct RayStatus {
    /// IDs (the `node_<hex>` strings) of currently active nodes.
    pub active_nodes: Vec<String>,
    /// Sum of the counts listed in the "Pending:" section.
    pub pending_nodes_count: usize,
    /// Sum of the counts listed in the "Recent failures:" section.
    pub recent_failures_count: usize,
}
/// Handle to a locally started `ray` node (leader or follower).
/// Dropping it does not stop ray; call [`Ray::stop`] explicitly.
pub struct Ray {
    #[allow(dead_code)]
    leader_address: SocketAddrV4,
}
/// Start the Ray head node (`ray start --head`) on `leader_address`.
///
/// Runs the command to completion, re-logging its output line by line, and
/// returns a [`Ray`] handle on success.
pub fn start_leader(leader_address: SocketAddrV4) -> Result<Ray, RayError> {
    let ip = leader_address.ip().to_string();
    let port = leader_address.port().to_string();
    let mut cmd = Command::new("ray");
    cmd.args([
        "start",
        "--head",
        "--disable-usage-stats",
        "--log-style=record",
        &format!("--node-ip-address={}", ip),
        &format!("--port={}", port),
    ]);
    // Capture both streams with `output()` rather than draining the stdout
    // pipe to EOF before touching stderr: sequential pipe reads can deadlock
    // if the child fills the un-drained pipe's buffer first.
    let output = cmd.output()?;
    for line in String::from_utf8_lossy(&output.stdout).lines() {
        tracing::info!("RAY: {line}");
    }
    for line in String::from_utf8_lossy(&output.stderr).lines() {
        tracing::info!("RAY: {line}");
    }
    if !output.status.success() {
        return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
    }
    Ok(Ray { leader_address })
}
/// Join an existing Ray cluster (`ray start --address=<leader>`).
///
/// Runs the command to completion, re-logging its output line by line, and
/// returns a [`Ray`] handle on success.
pub fn start_follower(leader_address: SocketAddrV4) -> Result<Ray, RayError> {
    let address = leader_address.to_string();
    let mut cmd = Command::new("ray");
    cmd.args(["start", &format!("--address={address}")]);
    // Capture both streams with `output()` rather than draining the stdout
    // pipe to EOF before touching stderr: sequential pipe reads can deadlock
    // if the child fills the un-drained pipe's buffer first.
    let output = cmd.output()?;
    for line in String::from_utf8_lossy(&output.stdout).lines() {
        tracing::info!("RAY: {line}");
    }
    for line in String::from_utf8_lossy(&output.stderr).lines() {
        tracing::info!("RAY: {line}");
    }
    if !output.status.success() {
        return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
    }
    Ok(Ray { leader_address })
}
impl Ray {
    /// Run `ray status` and parse its output into a [`RayStatus`].
    pub fn status(&self) -> Result<RayStatus, RayError> {
        let output = Command::new("ray").arg("status").output()?;
        if !output.status.success() {
            return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
        }
        let output_str = String::from_utf8_lossy(&output.stdout);
        parse_ray_status(&output_str).ok_or(RayError::StatusParseError)
    }

    /// Wait until `num_nodes` nodes are active, failing on cancellation or
    /// after RAY_WAIT_SECS.
    pub async fn wait_for(
        &self,
        cancel_token: CancellationToken,
        num_nodes: u32,
    ) -> Result<(), RayError> {
        let timeout = time::sleep(Duration::from_secs(RAY_WAIT_SECS as u64));
        select! {
            _ = cancel_token.cancelled() => {
                Err(RayError::Cancelled)
            }
            _ = timeout => {
                Err(RayError::WaitTimeout)
            }
            result = self.wait_for_nodes(num_nodes) => {
                result
            }
        }
    }

    /// Poll `ray status` every 100ms until the active node count matches.
    // NOTE(review): `status()` shells out synchronously, blocking the async
    // executor for the duration of each call — consider spawn_blocking.
    // Also `==` never completes if MORE nodes than expected register;
    // confirm whether `>=` is intended.
    async fn wait_for_nodes(&self, num_nodes: u32) -> Result<(), RayError> {
        loop {
            let status = self.status()?;
            if status.active_nodes.len() as u32 == num_nodes {
                return Ok(());
            }
            time::sleep(Duration::from_millis(100)).await;
        }
    }

    /// Run `ray stop` asynchronously, re-logging its output line by line.
    pub async fn stop(&self) -> Result<(), RayError> {
        let mut cmd = tokio::process::Command::new("ray");
        cmd.args([
            "stop",
            &format!("--grace-period={RAY_STOP_TIMEOUT_SECS}"),
            "--log-style=record",
        ]);
        cmd.stdout(Stdio::piped());
        cmd.stderr(Stdio::piped());
        let mut child = cmd.spawn()?;
        // Process stdout
        // NOTE(review): stdout is drained to EOF before stderr; if the child
        // writes enough to stderr first, both sides can stall on a full pipe.
        if let Some(stdout) = child.stdout.take() {
            let reader = tokio::io::BufReader::new(stdout);
            let mut lines = reader.lines();
            while let Ok(Some(line)) = lines.next_line().await {
                tracing::info!("RAY: {line}");
            }
        }
        // Process stderr
        if let Some(stderr) = child.stderr.take() {
            let reader = tokio::io::BufReader::new(stderr);
            let mut lines = reader.lines();
            while let Ok(Some(line)) = lines.next_line().await {
                tracing::info!("RAY: {line}");
            }
        }
        let status = child.wait().await?;
        if !status.success() {
            return Err(RayError::CommandFailed(status.code().unwrap_or(-1)));
        }
        Ok(())
    }
}
/// Parse the output of "ray status" command into a RayStatus struct
fn parse_ray_status(output: &str) -> Option<RayStatus> {
let mut active_nodes = Vec::new();
let mut pending_nodes_count = 0;
let mut recent_failures_count = 0;
// Flags to track which section we're in
let mut in_active_section = false;
let mut in_pending_section = false;
let mut in_failures_section = false;
// Regex to match node IDs
let node_regex = Regex::new(r"(\d+)\s+(node_[a-f0-9]+)").unwrap();
let num_regex = Regex::new(r"(\d+)").unwrap();
for line in output.lines() {
let trimmed = line.trim();
if trimmed == "Active:" {
in_active_section = true;
in_pending_section = false;
in_failures_section = false;
continue;
} else if trimmed == "Pending:" {
in_active_section = false;
in_pending_section = true;
in_failures_section = false;
continue;
} else if trimmed == "Recent failures:" {
in_active_section = false;
in_pending_section = false;
in_failures_section = true;
continue;
} else if trimmed.starts_with("Resources") {
// We've reached the end of the node status section
break;
}
if in_active_section {
if let Some(captures) = node_regex.captures(trimmed) {
if let Some(node_id) = captures.get(2) {
active_nodes.push(node_id.as_str().to_string());
}
}
} else if in_pending_section && trimmed != "(no pending nodes)" {
// Count pending nodes
if let Some(captures) = num_regex.captures(trimmed) {
if let Some(count) = captures.get(1) {
if let Ok(count) = count.as_str().parse::<usize>() {
pending_nodes_count += count;
}
}
}
} else if in_failures_section && trimmed != "(no failures)" {
// Count failures
if let Some(captures) = num_regex.captures(trimmed) {
if let Some(count) = captures.get(1) {
if let Ok(count) = count.as_str().parse::<usize>() {
recent_failures_count += count;
}
}
}
}
}
Some(RayStatus {
active_nodes,
pending_nodes_count,
recent_failures_count,
})
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Healthy cluster: two active nodes, nothing pending, no failures.
    #[test]
    fn test_parse_ray_status() {
        let sample_output = r#"======== Autoscaler status: 2025-03-04 13:13:59.104771 ========
Node status
---------------------------------------------------------------
Active:
1 node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f
1 node_035ea3b640e13f3603d3debd97de8c569ed8c8b10e19ce00ea4fd070
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
0.0/256.0 CPU
0.0/16.0 GPU
0B/1.58TiB memory
0B/372.53GiB object_store_memory
Demands:
(no resource demands)
"#;
        let expected = RayStatus {
            active_nodes: vec![
                "node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f".to_string(),
                "node_035ea3b640e13f3603d3debd97de8c569ed8c8b10e19ce00ea4fd070".to_string(),
            ],
            pending_nodes_count: 0,
            recent_failures_count: 0,
        };
        let result = parse_ray_status(sample_output);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), expected);
    }

    /// Test with pending nodes and failures
    /// (counts within each section are summed).
    #[test]
    fn test_parse_ray_status_with_failing() {
        let sample_output_with_pending = r#"======== Autoscaler status: 2025-03-04 13:13:59.104771 ========
Node status
---------------------------------------------------------------
Active:
1 node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f
Pending:
2 node_pending_1
3 node_pending_2
Recent failures:
1 node_failure_1
4 node_failure_2
Resources
---------------------------------------------------------------
Usage:
0.0/256.0 CPU
"#;
        let expected_with_pending = RayStatus {
            active_nodes: vec![
                "node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f".to_string(),
            ],
            pending_nodes_count: 5, // 2 + 3
            recent_failures_count: 5, // 1 + 4
        };
        let result = parse_ray_status(sample_output_with_pending);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), expected_with_pending);
    }

    /// Test with empty output
    /// (parses as a cluster with no nodes — never None today).
    #[test]
    fn test_parse_ray_status_empty() {
        let empty_output = "";
        let result = parse_ray_status(empty_output);
        assert!(result.is_some());
        assert_eq!(result.unwrap().active_nodes.len(), 0);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use pyo3::{types::IntoPyDict, Python};
use std::env;
use std::ffi::CString;
use std::path::{Path, PathBuf};
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("vllm_inc.py");
/// Start the Python vllm engine that listens on zmq socket
/// This is called by running `<bin> --internal-vllm-process`
/// This does not return until vllm exits.
pub fn run_subprocess(
    // The prefix to put on the zmq socket names
    socket_id: &str,
    // Full path to the model directory
    model_path: &Path,
    // Multi node settings. Usually Default::default
    node_config: MultiNodeConfig,
    // How many GPUs to use (tensor parallel)
    tp_size: u32,
    // Optional JSON file of extra vllm engine arguments
    extra_engine_args: Option<PathBuf>,
    // Whether to configure our vllm fork to emit KV routing events
    with_kv_routing: bool,
) -> anyhow::Result<()> {
    // Must run before Python starts so vllm sees the env vars.
    if with_kv_routing {
        set_kv_routing_vars()?;
    }
    pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
    // fix_venv works around the embedded interpreter missing the active
    // virtualenv; it is a no-op except on macOS.
    if let Ok(venv) = env::var("VIRTUAL_ENV") {
        let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
    }
    let model_path_str = model_path.display().to_string();
    let extra_engine_args_str = &extra_engine_args
        .map(|p| p.display().to_string())
        .unwrap_or_default();
    Python::with_gil(|py| {
        // These names become Python globals read by vllm_inc.py.
        let locals = [
            ("socket_id", socket_id),
            ("model_path", model_path_str.as_str()),
            ("tp_size_str", &tp_size.to_string()),
            ("nnodes_str", &node_config.num_nodes.to_string()),
            ("extra_engine_args", extra_engine_args_str),
            ("enable_prefix_caching", &with_kv_routing.to_string()),
        ]
        .into_py_dict(py)
        .unwrap();
        // py.run blocks until the embedded script (and thus vllm) finishes.
        if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
            anyhow::bail!("vllm engine run error: {err}");
        }
        tracing::info!("vllm subprocess exit");
        Ok(())
    })
}
// These environment variables trigger our vllm patch to emit KV routing events
fn set_kv_routing_vars() -> anyhow::Result<()> {
let exe = env::current_exe()?;
let exe_dir = exe
.parent()
.ok_or(anyhow::anyhow!("Current binary has no directory"))?;
let mut lib = PathBuf::from(exe_dir);
lib.set_file_name("libdynamo_llm_capi.so");
let vars = [
// Path to the C API Library
("VLLM_KV_CAPI_PATH", lib.display().to_string()),
// Identifiers to publish KV related information
("VLLM_KV_NAMESPACE", "dynamo".to_string()),
("VLLM_KV_COMPONENT", "vllm".to_string()),
// Worker ID used for identifying workers in distributed settings
("VLLM_WORKER_ID", "0".to_string()),
];
for (kvar, default_v) in vars {
if env::var(kvar).is_err() {
env::set_var(kvar, default_v);
}
}
Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
#
import json
import logging
import multiprocessing
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.multiprocessing.engine import run_mp_engine
from vllm.usage.usage_lib import UsageContext

# The names used below (model_path, tp_size_str, nnodes_str,
# enable_prefix_caching, extra_engine_args, socket_id) are injected as
# Python locals by the Rust caller (subprocess.rs).
arg_map = {
    "model": f"{model_path}",
    "served_model_name": None,
    "task": "generate",
    "skip_tokenizer_init": True,
    "seed": 0,
    "max_model_len": 8192,
    "max_seq_len_to_capture": 8192,
    "tensor_parallel_size": int(tp_size_str),
    "pipeline_parallel_size": int(nnodes_str),
    # enable_prefix_caching arrives as the string "true"/"false" from Rust
    "enable_prefix_caching": enable_prefix_caching.lower() == "true",
}
json_map = {}
if extra_engine_args != "":
    # extra_engine_args is a filename of a JSON file whose keys are extra
    # AsyncEngineArgs fields.
    try:
        with open(extra_engine_args) as f:
            json_map = json.load(f)
    except FileNotFoundError:
        # The user explicitly asked for this file: surface the problem at
        # error level (matches the embedded engine script) instead of
        # silently ignoring it at debug level.
        logging.error(f"File {extra_engine_args} not found.")
    except json.JSONDecodeError as e:
        logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
logging.debug(f"Adding extra engine arguments: {json_map}")
arg_map = {**arg_map, **json_map}  # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map)
# ZMQ IPC endpoint the Rust side connects to; socket_id keeps it unique.
ipc_path = f"ipc:///tmp/{socket_id}"
# NOTE(review): presumably a liveness flag shared with run_mp_engine — confirm
# against the vllm version in use.
engine_alive = multiprocessing.Value("b", True, lock=False)
run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive)
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the embedded vllm (v0.8) engine crate. Most package fields are
# inherited from the workspace-level Cargo.toml via `.workspace = true`.
[package]
name = "dynamo-engine-vllm0_8"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
# Dynamo workspace crates plus the async/serde stack, and the pyo3 toolchain
# used to embed and drive the Python vllm engine in-process.
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
# pyo3 with default features off; async/inspect features are experimental.
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
# Bridges pyo3 with the tokio runtime for awaiting Python coroutines.
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
    "attributes",
    "testing",
    "tokio-runtime",
    "unstable-streams",
] }
pythonize = { version = "0.23" }
regex = "1"
serde-pickle = "1.2.0"
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in lib.rs. Most work should be done in the Rust caller.
#
# Module-level imports for the embedded vllm engine script (included as a
# string in lib.rs; the Rust caller drives main()/run_response() below).
import json
import logging
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt
# TODO this should match DYN_LOG level
logging.basicConfig(level=logging.INFO)
async def main(request_queue, ready_event, extra_engine_args, **kwargs):
    """Build a vllm engine client and serve generation requests until stopped.

    Args:
        request_queue: Queue of (request_id, request, sampling_params,
            response_queue) tuples; a None entry is the stop sentinel.
        ready_event: Set once the engine client is up and requests can be sent.
        extra_engine_args: Filename of a JSON file with extra AsyncEngineArgs
            fields (takes precedence over kwargs), or "" for none.
        **kwargs: Base AsyncEngineArgs fields supplied by the Rust caller.
    """
    arg_map = kwargs
    if extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename
        try:
            with open(extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence
    engine_args = AsyncEngineArgs(**arg_map)
    # Main loop
    try:
        async with build_async_engine_client_from_engine_args(
            engine_args
        ) as engine_client:
            ready_event.set()
            while True:
                req = await request_queue.get()
                if req is None:  # Stop sentinel
                    break
                (request_id, request, sampling_params, response_queue) = req
                prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
                gen = engine_client.generate(prompt, sampling_params, request_id)
                try:
                    async for res in gen:
                        await response_queue.put(res)
                finally:
                    # Always deliver the end-of-stream sentinel, even when
                    # generation raises, so run_response() cannot block
                    # forever waiting for it.
                    await response_queue.put(None)
                    request_queue.task_done()
    except Exception as e:
        # NOTE: also reached for errors raised while serving a request,
        # not only for engine construction failures.
        logging.error(f"vllm init failed: {e}")
    finally:
        logging.debug("vllm worker stopped")
async def run_response(response_queue):
    """Yield every item from response_queue, ending with (and including) the
    None end-of-stream sentinel; each consumed item is marked task_done."""
    try:
        item = await response_queue.get()
        while item is not None:
            yield item
            response_queue.task_done()
            item = await response_queue.get()
        # Forward the sentinel itself so the consumer sees end-of-stream.
        yield None
        response_queue.task_done()
    except Exception as e:
        logging.error(f"failed reading response from vllm: {e}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment