Unverified Commit 42969800 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated the docs on running vllm and sglang in multi-GPU (tensor parallel) and multi-node (pipeline parallel) configurations.
parent 5d89a0c8
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use std::sync::Arc;
use dynamo_llm::backend::ExecutionContext;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use pyo3::prelude::*;
mod worker;
mod engine;
use engine::SgLangEngine;
mod subprocess;
pub use subprocess::run_subprocess;
/// Build the sglang execution engine and hand back both the engine and the
/// join handle of the sglang worker it spawned.
///
/// The `ExecutionContext` is the engine behind an `Arc`, ready to plug into
/// the pipeline; the `JoinHandle` resolves when the sglang worker exits.
pub async fn make_engine(
    cancel_token: CancellationToken,
    // Full path to the model directory
    model_path: &Path,
    // Unique string to name zmq sockets
    sock_code: &str,
    // Multi node settings
    node_conf: dynamo_llm::engines::MultiNodeConfig,
    // How many GPUs to use
    tensor_parallel_size: u32,
    // The base GPU ID to start allocating GPUs from
    base_gpu_id: u32,
    // Extra arguments to pass directly as sglang ServerArgs
    extra_engine_args: Option<PathBuf>,
) -> pipeline_error::Result<(ExecutionContext, tokio::task::JoinHandle<()>)> {
    let mut sglang = SgLangEngine::new(
        cancel_token,
        sock_code,
        model_path,
        node_conf,
        tensor_parallel_size,
        base_gpu_id,
        extra_engine_args,
    )
    .await?;
    // Detach the worker handle before freezing the engine behind an Arc;
    // take_sglang_worker_handle needs &mut self.
    let worker_handle = sglang.take_sglang_worker_handle();
    let execution_ctx: ExecutionContext = Arc::new(sglang);
    Ok((execution_ctx, worker_handle))
}
/// Per-process GPU / tensor-parallel placement settings.
#[derive(Debug, Clone, Copy)]
pub struct MultiGPUConfig {
    /// How many GPUs we are using / how many processes
    pub tp_size: u32,
    /// Tensor Parallel Rank. Must be unique across all nodes and GPUs.
    pub tp_rank: u32,
    /// GPU ID. Which GPU to run on. In single-node setup this is the same as tp_rank.
    pub gpu_id: u32,
}

impl Default for MultiGPUConfig {
    /// Single-process defaults: one tensor-parallel shard, rank 0, GPU 0.
    fn default() -> Self {
        Self {
            tp_size: 1,
            tp_rank: 0,
            gpu_id: 0,
        }
    }
}
/// On macOS the embedded interpreter does not pick up the active virtualenv,
/// so add the venv's site-packages directory to Python's `sys.path`.
/// No-op on other platforms (see the non-macos variant below).
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert ahead of the system site-packages so the venv's packages win
    // when both provide the same module (resolves the previous TODO, which
    // noted that `append` put the venv last).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
#
import json
import logging
import tempfile
from multiprocessing.connection import Connection
from sglang.srt.entrypoints.engine import _set_envs_and_config
from sglang.srt.managers.scheduler import run_scheduler_process
from sglang.srt.server_args import PortArgs, ServerArgs
# Verbose logging: this script runs inside the sglang worker subprocess, so
# its log output is the main debugging signal the parent process sees.
logging.basicConfig(
    level="DEBUG",
    force=True,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="[%(asctime)s] %(message)s",
)

# ServerArgs assembled from globals injected by the Rust caller
# (model_path, tp_size_str, ... — see subprocess.rs).
# These can all be overridden by --extra-engine-args json file
arg_map = {
    "model_path": f"{model_path}",
    "enable_metrics": False,
    "log_level": "debug",
    "log_requests": True,
    "tp_size": int(tp_size_str),
    # Multi-node
    "dist_init_addr": dist_init_addr if dist_init_addr != "" else None,
    "nnodes": int(nnodes_str),
    "node_rank": int(node_rank_str),
}
json_map = {}
if extra_engine_args != "":
    # extra_engine_args is a filename; a missing file or bad JSON is logged
    # and otherwise ignored (best effort).
    try:
        with open(extra_engine_args) as f:
            json_map = json.load(f)
    except FileNotFoundError:
        logging.debug(f"File {extra_engine_args} not found.")
    except json.JSONDecodeError as e:
        logging.debug(f"Invalid JSON in {extra_engine_args}: {e}")
    logging.debug(f"Adding extra engine arguments: {json_map}")
arg_map = {**arg_map, **json_map}  # json_map gets precedence

server_args = ServerArgs(**arg_map)
_set_envs_and_config(server_args)
logging.debug(server_args)

ipc_path = f"ipc:///tmp/{socket_id}"
# These must match worker.rs zmq_sockets, which is the other side
port_args = PortArgs(
    # we don't use this one so use anything
    tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
    # Us -> sglang
    scheduler_input_ipc_name=f"{ipc_path}_input_socket",
    # sglang -> us
    detokenizer_ipc_name=f"{ipc_path}_output_socket",
    # The port for nccl initialization (torch.dist), which we don't use
    nccl_port=9876,
)

# Rank must be globally unique across nodes
tp_rank = int(tp_rank_str)
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
# In a single-node setup this is the same as rank
gpu_id = int(gpu_id_str)

# The Rust parent passed us the write end of a pipe; the scheduler signals
# readiness through it. This call blocks for the life of the engine.
pipe_fd_int = int(pipe_fd)
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use pyo3::{types::IntoPyDict, Python};
use std::{
env,
ffi::CString,
os::fd::RawFd,
path::{Path, PathBuf},
};
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
/// Start the Python sglang engine that listens on zmq socket
/// This is called by running `nio --internal-sglang-process`
/// This does not return until the subprocess exits.
pub fn run_subprocess(
    // The prefix to put on the zmq socket names
    socket_id: &str,
    // Directory containing an HF repo with safetensors files, tokenizer, etc
    model_path: &Path,
    // The write half of a pipe, where sglang will signal when it's ready
    notify_pipe_fd: RawFd,
    // Multi node. Usually Default::default
    node_config: MultiNodeConfig,
    // Multi GPU. Usually Default::default
    gpu_config: super::MultiGPUConfig,
    // Allow passing any arguments to sglang
    extra_engine_args: Option<PathBuf>,
) -> anyhow::Result<()> {
    pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
    // fix_venv works around the embedded interpreter missing the active
    // virtualenv; it is a no-op except on macOS.
    if let Ok(venv) = env::var("VIRTUAL_ENV") {
        let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
    }
    let dir = model_path.display().to_string();
    let extra_engine_args_str = &extra_engine_args
        .map(|p| p.display().to_string())
        .unwrap_or_default();
    Python::with_gil(|py| {
        // These names become Python globals read by sglang_inc.py.
        let locals = [
            ("socket_id", socket_id),
            ("model_path", dir.as_str()),
            ("pipe_fd", &notify_pipe_fd.to_string()),
            // to_string because slice must all be the same type
            ("tp_size_str", &gpu_config.tp_size.to_string()),
            ("tp_rank_str", &gpu_config.tp_rank.to_string()),
            ("gpu_id_str", &gpu_config.gpu_id.to_string()),
            ("nnodes_str", &node_config.num_nodes.to_string()),
            ("node_rank_str", &node_config.node_rank.to_string()),
            ("dist_init_addr", &node_config.leader_addr),
            ("extra_engine_args", extra_engine_args_str),
        ]
        .into_py_dict(py)
        .unwrap();
        // py.run blocks until the embedded script (and thus sglang) finishes.
        if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
            anyhow::bail!("sglang engine run error: {err}");
        }
        tracing::info!("sglang subprocess exit");
        Ok(())
    })
}
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the vllm 0.7 sub-process engine crate.
[package]
name = "dynamo-engine-vllm0_7"
# Shared metadata is inherited from the workspace root manifest.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[dependencies]
# Versions managed by the workspace root
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
# Crate-local version pins
async-openai = "0.27.2"
# Embedded Python interpreter hosting the vllm engine
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
regex = "1"
serde-pickle = "1.2.0"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::kv_router::publisher::KvMetricsPublisher;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::worker;
/// Engine facade that forwards generation requests to the vllm worker
/// started by `worker::start` (see worker.rs).
pub struct VllmEngine {
    // Used to end in-flight response streams when the runtime shuts down.
    cancel_token: CancellationToken,
    // Connection to the vllm worker; requests are enqueued on it.
    worker: worker::VllmWorker,
}
impl VllmEngine {
    /// Start the vllm worker and wrap it in a `VllmEngine`.
    ///
    /// Parameters mirror `worker::start`; see `make_leader_engine` for what
    /// each one means.
    pub async fn new(
        cancel_token: CancellationToken,
        sock_code: &str,
        model_path: &Path,
        node_conf: MultiNodeConfig,
        tensor_parallel_size: u32,
        extra_engine_args: Option<PathBuf>,
        kv_metrics_publisher: Option<Arc<KvMetricsPublisher>>,
    ) -> anyhow::Result<Self> {
        let worker = worker::start(
            cancel_token.clone(),
            sock_code,
            model_path,
            node_conf,
            tensor_parallel_size,
            extra_engine_args,
            kv_metrics_publisher,
        )
        .await?;
        Ok(VllmEngine {
            cancel_token,
            worker,
        })
    }

    /// Take ownership of the join handle for the vllm worker task.
    /// The handle resolves when vllm exits.
    pub fn take_vllm_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
        self.worker.take_vllm_handle()
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for VllmEngine
{
    /// Enqueue `request` on the vllm worker and return a stream of its
    /// responses. The stream ends when the worker closes the response
    /// channel or the engine's cancel token fires.
    async fn generate(
        &self,
        request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        let (request, context) = request.into_parts();
        let ctx = context.context();
        let request_id = ctx.id().to_string();
        // Bounded channel (128) between the worker and this response stream.
        let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);
        let work_req = worker::WorkRequest {
            request_id: context.id().to_string(),
            request,
            response_channel: resp_tx,
        };
        self.worker.enqueue_request(work_req).await?;
        let cancel_token = self.cancel_token.clone();
        let output = stream! {
            loop {
                // Stop streaming as soon as the engine is cancelled, even if
                // the worker is still producing output.
                let maybe_resp = tokio::select!{
                    _ = cancel_token.cancelled() => {
                        break;
                    }
                    maybe_resp = resp_rx.recv() => {
                        maybe_resp
                    }
                };
                match maybe_resp {
                    Some(out) => {
                        yield out;
                    },
                    None => {
                        // Sender dropped: normal end of generation.
                        tracing::trace!(request_id, "generate: response channel closed");
                        break;
                    }
                }
            }
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::future::Future;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use pyo3::prelude::*;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::kv_router::publisher::KvMetricsPublisher;
mod engine;
use engine::VllmEngine;
mod ray;
use ray::Ray;
mod subprocess;
pub use subprocess::run_subprocess;
mod worker;
/// Start the vllm engine on the leader (or only) node.
///
/// For multi-node runs (`node_conf.num_nodes > 1`) this first starts a Ray
/// head node and blocks until all follower nodes have registered. Returns
/// the engine plus a tear-down future that completes once the vllm worker
/// has exited and, if Ray was started, `ray stop` has run.
pub async fn make_leader_engine(
    cancel_token: CancellationToken,
    // Full path to the model, either a GGUF file or an HF repo dir
    model_path: &Path,
    // Unique string to name zmq sockets
    sock_code: &str,
    // Multi node settings
    node_conf: MultiNodeConfig,
    // How many GPUs to use
    tensor_parallel_size: u32,
    // Path to extra engine args file
    extra_engine_args: Option<PathBuf>,
    // When using our vllm fork, this is how we publish its KV metrics for the KV router
    kv_metrics_publisher: Option<Arc<KvMetricsPublisher>>,
) -> pipeline_error::Result<(ExecutionContext, impl Future<Output = ()>)> {
    // Ray is only needed for multi-node (pipeline parallel) runs.
    let ray_obj = if node_conf.num_nodes > 1 {
        let r = ray::start_leader(node_conf.leader_addr.parse()?)?;
        tracing::info!("Leader waiting for {} total nodes", node_conf.num_nodes);
        r.wait_for(cancel_token.clone(), node_conf.num_nodes)
            .await?;
        tracing::info!("All nodes registered");
        Some(r)
    } else {
        None
    };
    let mut engine = VllmEngine::new(
        cancel_token,
        sock_code,
        model_path,
        node_conf,
        tensor_parallel_size,
        extra_engine_args,
        kv_metrics_publisher,
    )
    .await?;
    let vllm_process = engine.take_vllm_worker_handle();
    // Tear-down future: wait for the vllm worker, then stop Ray if we
    // started it. Errors are logged, not propagated — we're shutting down.
    let vllm_future = async move {
        if let Err(err) = vllm_process.await {
            tracing::error!("Failed stopping vllm process: {err:#}");
        }
        if let Some(r) = ray_obj {
            if let Err(err) = r.stop().await {
                tracing::error!("Failed stopping ray: {err:#}");
            }
        }
    };
    let engine: ExecutionContext = Arc::new(engine);
    Ok((engine, vllm_future))
}
/// Join a multi-node vllm run as a follower: start `ray` pointing at the
/// leader and wait until all `num_nodes` nodes have registered.
///
/// Returns a [`StopFuture`] that runs `ray stop` when awaited.
pub async fn start_follower(
    cancel_token: CancellationToken,
    node_conf: MultiNodeConfig,
) -> pipeline_error::Result<StopFuture> {
    let r = ray::start_follower(node_conf.leader_addr.parse()?)?;
    tracing::info!("Follower waiting for {} total nodes", node_conf.num_nodes);
    r.wait_for(cancel_token, node_conf.num_nodes).await?;
    tracing::info!("All nodes registered");
    Ok(StopFuture {
        state: Some(StopFutureState::New(r)),
    })
}
/// A future that runs `ray stop` the first time it is polled and resolves
/// when the stop command completes.
pub struct StopFuture {
    // None once the future has completed; polling again then returns Ready.
    state: Option<StopFutureState>,
}

// State machine behind StopFuture's lazy `ray stop`.
enum StopFutureState {
    // Not yet polled: still holding the Ray handle.
    New(Ray),
    // `ray stop` is in flight.
    Running(Pin<Box<dyn Future<Output = ()> + Send>>),
}
impl Future for StopFuture {
    type Output = ();
    /// First poll converts the held [`Ray`] handle into a `ray stop` future;
    /// subsequent polls drive that future to completion.
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        // Take the state out; it is put back below unless we're finished.
        let state = match self.state.take() {
            None => return Poll::Ready(()),
            Some(state) => state,
        };
        match state {
            StopFutureState::New(obj) => {
                // Convert object to a stop future. Errors are logged, not
                // surfaced — this runs during shutdown.
                let future = Box::pin(async move {
                    if let Err(err) = obj.stop().await {
                        tracing::error!("Failed calling 'ray stop': {err:#}");
                    }
                });
                self.state = Some(StopFutureState::Running(future));
                // Recurse to poll the new future immediately
                self.poll(cx)
            }
            StopFutureState::Running(mut future) => {
                // Poll the stop future
                match future.as_mut().poll(cx) {
                    Poll::Ready(()) => {
                        // Done, leave state as None
                        Poll::Ready(())
                    }
                    Poll::Pending => {
                        // Not ready yet, preserve the future
                        self.state = Some(StopFutureState::Running(future));
                        Poll::Pending
                    }
                }
            }
        }
    }
}
/// On macOS the embedded interpreter does not pick up the active virtualenv,
/// so add the venv's site-packages directory to Python's `sys.path`.
/// No-op on other platforms (see the non-macos variant below).
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert ahead of the system site-packages so the venv's packages win
    // when both provide the same module (resolves the previous TODO, which
    // noted that `append` put the venv last).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use regex::Regex;
use std::io::{BufRead, BufReader};
use std::net::SocketAddrV4;
use std::process::{Command, Stdio};
use std::time::Duration;
use thiserror::Error;
use tokio::io::AsyncBufReadExt;
use tokio::select;
use tokio::time;
use dynamo_runtime::CancellationToken;
/// Grace period (seconds) passed to `ray stop`.
/// Ray's default is 16 seconds, we make it a bit shorter.
const RAY_STOP_TIMEOUT_SECS: u32 = 10;
/// How long to wait (seconds) for all the nodes to start.
/// This is either done manually or through some orchestration system, so either way it
/// can take some time.
const RAY_WAIT_SECS: u32 = 60 * 5;
/// Errors produced while driving the external `ray` CLI.
#[derive(Debug, Error)]
pub enum RayError {
    /// Spawning or communicating with the `ray` binary failed at the OS level.
    #[error("Failed to execute Ray command: {0}")]
    CommandExecution(#[from] std::io::Error),
    /// `ray` ran but exited non-zero (-1 when no exit code was available).
    #[error("Ray command failed with exit code: {0}")]
    CommandFailed(i32),
    /// `ray status` output did not match the expected layout.
    #[error("Failed to parse Ray status output")]
    StatusParseError,
    /// Not all nodes became active within RAY_WAIT_SECS.
    #[error("Timeout waiting for nodes to become active")]
    WaitTimeout,
    /// The caller's CancellationToken fired while waiting.
    #[error("Operation cancelled")]
    Cancelled,
}
/// Snapshot of cluster membership parsed from `ray status` output.
#[derive(Debug, PartialEq)]
pub struct RayStatus {
    /// IDs (the `node_<hex>` strings) of currently active nodes.
    pub active_nodes: Vec<String>,
    /// Sum of the counts listed in the "Pending:" section.
    pub pending_nodes_count: usize,
    /// Sum of the counts listed in the "Recent failures:" section.
    pub recent_failures_count: usize,
}
/// Handle to a locally started `ray` node (leader or follower).
/// Dropping it does not stop ray; call [`Ray::stop`] explicitly.
pub struct Ray {
    #[allow(dead_code)]
    leader_address: SocketAddrV4,
}
/// Start the Ray head node (`ray start --head`) on `leader_address`.
///
/// Runs the command to completion, re-logging its output line by line, and
/// returns a [`Ray`] handle on success.
pub fn start_leader(leader_address: SocketAddrV4) -> Result<Ray, RayError> {
    let ip = leader_address.ip().to_string();
    let port = leader_address.port().to_string();
    let mut cmd = Command::new("ray");
    cmd.args([
        "start",
        "--head",
        "--disable-usage-stats",
        "--log-style=record",
        &format!("--node-ip-address={}", ip),
        &format!("--port={}", port),
    ]);
    // Capture both streams with `output()` rather than draining the stdout
    // pipe to EOF before touching stderr: sequential pipe reads can deadlock
    // if the child fills the un-drained pipe's buffer first.
    let output = cmd.output()?;
    for line in String::from_utf8_lossy(&output.stdout).lines() {
        tracing::info!("RAY: {line}");
    }
    for line in String::from_utf8_lossy(&output.stderr).lines() {
        tracing::info!("RAY: {line}");
    }
    if !output.status.success() {
        return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
    }
    Ok(Ray { leader_address })
}
/// Join an existing Ray cluster (`ray start --address=<leader>`).
///
/// Runs the command to completion, re-logging its output line by line, and
/// returns a [`Ray`] handle on success.
pub fn start_follower(leader_address: SocketAddrV4) -> Result<Ray, RayError> {
    let address = leader_address.to_string();
    let mut cmd = Command::new("ray");
    cmd.args(["start", &format!("--address={address}")]);
    // Capture both streams with `output()` rather than draining the stdout
    // pipe to EOF before touching stderr: sequential pipe reads can deadlock
    // if the child fills the un-drained pipe's buffer first.
    let output = cmd.output()?;
    for line in String::from_utf8_lossy(&output.stdout).lines() {
        tracing::info!("RAY: {line}");
    }
    for line in String::from_utf8_lossy(&output.stderr).lines() {
        tracing::info!("RAY: {line}");
    }
    if !output.status.success() {
        return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
    }
    Ok(Ray { leader_address })
}
impl Ray {
    /// Run `ray status` and parse its output into a [`RayStatus`].
    pub fn status(&self) -> Result<RayStatus, RayError> {
        let output = Command::new("ray").arg("status").output()?;
        if !output.status.success() {
            return Err(RayError::CommandFailed(output.status.code().unwrap_or(-1)));
        }
        let output_str = String::from_utf8_lossy(&output.stdout);
        parse_ray_status(&output_str).ok_or(RayError::StatusParseError)
    }

    /// Wait until `num_nodes` nodes are active, failing on cancellation or
    /// after RAY_WAIT_SECS.
    pub async fn wait_for(
        &self,
        cancel_token: CancellationToken,
        num_nodes: u32,
    ) -> Result<(), RayError> {
        let timeout = time::sleep(Duration::from_secs(RAY_WAIT_SECS as u64));
        select! {
            _ = cancel_token.cancelled() => {
                Err(RayError::Cancelled)
            }
            _ = timeout => {
                Err(RayError::WaitTimeout)
            }
            result = self.wait_for_nodes(num_nodes) => {
                result
            }
        }
    }

    /// Poll `ray status` every 100ms until the active node count matches.
    // NOTE(review): `status()` shells out synchronously, blocking the async
    // executor for the duration of each call — consider spawn_blocking.
    // Also `==` never completes if MORE nodes than expected register;
    // confirm whether `>=` is intended.
    async fn wait_for_nodes(&self, num_nodes: u32) -> Result<(), RayError> {
        loop {
            let status = self.status()?;
            if status.active_nodes.len() as u32 == num_nodes {
                return Ok(());
            }
            time::sleep(Duration::from_millis(100)).await;
        }
    }

    /// Run `ray stop` asynchronously, re-logging its output line by line.
    pub async fn stop(&self) -> Result<(), RayError> {
        let mut cmd = tokio::process::Command::new("ray");
        cmd.args([
            "stop",
            &format!("--grace-period={RAY_STOP_TIMEOUT_SECS}"),
            "--log-style=record",
        ]);
        cmd.stdout(Stdio::piped());
        cmd.stderr(Stdio::piped());
        let mut child = cmd.spawn()?;
        // Process stdout
        // NOTE(review): stdout is drained to EOF before stderr; if the child
        // writes enough to stderr first, both sides can stall on a full pipe.
        if let Some(stdout) = child.stdout.take() {
            let reader = tokio::io::BufReader::new(stdout);
            let mut lines = reader.lines();
            while let Ok(Some(line)) = lines.next_line().await {
                tracing::info!("RAY: {line}");
            }
        }
        // Process stderr
        if let Some(stderr) = child.stderr.take() {
            let reader = tokio::io::BufReader::new(stderr);
            let mut lines = reader.lines();
            while let Ok(Some(line)) = lines.next_line().await {
                tracing::info!("RAY: {line}");
            }
        }
        let status = child.wait().await?;
        if !status.success() {
            return Err(RayError::CommandFailed(status.code().unwrap_or(-1)));
        }
        Ok(())
    }
}
/// Parse the output of "ray status" command into a RayStatus struct
fn parse_ray_status(output: &str) -> Option<RayStatus> {
let mut active_nodes = Vec::new();
let mut pending_nodes_count = 0;
let mut recent_failures_count = 0;
// Flags to track which section we're in
let mut in_active_section = false;
let mut in_pending_section = false;
let mut in_failures_section = false;
// Regex to match node IDs
let node_regex = Regex::new(r"(\d+)\s+(node_[a-f0-9]+)").unwrap();
let num_regex = Regex::new(r"(\d+)").unwrap();
for line in output.lines() {
let trimmed = line.trim();
if trimmed == "Active:" {
in_active_section = true;
in_pending_section = false;
in_failures_section = false;
continue;
} else if trimmed == "Pending:" {
in_active_section = false;
in_pending_section = true;
in_failures_section = false;
continue;
} else if trimmed == "Recent failures:" {
in_active_section = false;
in_pending_section = false;
in_failures_section = true;
continue;
} else if trimmed.starts_with("Resources") {
// We've reached the end of the node status section
break;
}
if in_active_section {
if let Some(captures) = node_regex.captures(trimmed) {
if let Some(node_id) = captures.get(2) {
active_nodes.push(node_id.as_str().to_string());
}
}
} else if in_pending_section && trimmed != "(no pending nodes)" {
// Count pending nodes
if let Some(captures) = num_regex.captures(trimmed) {
if let Some(count) = captures.get(1) {
if let Ok(count) = count.as_str().parse::<usize>() {
pending_nodes_count += count;
}
}
}
} else if in_failures_section && trimmed != "(no failures)" {
// Count failures
if let Some(captures) = num_regex.captures(trimmed) {
if let Some(count) = captures.get(1) {
if let Ok(count) = count.as_str().parse::<usize>() {
recent_failures_count += count;
}
}
}
}
}
Some(RayStatus {
active_nodes,
pending_nodes_count,
recent_failures_count,
})
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Healthy cluster: two active nodes, nothing pending, no failures.
    #[test]
    fn test_parse_ray_status() {
        let sample_output = r#"======== Autoscaler status: 2025-03-04 13:13:59.104771 ========
Node status
---------------------------------------------------------------
Active:
1 node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f
1 node_035ea3b640e13f3603d3debd97de8c569ed8c8b10e19ce00ea4fd070
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
0.0/256.0 CPU
0.0/16.0 GPU
0B/1.58TiB memory
0B/372.53GiB object_store_memory
Demands:
(no resource demands)
"#;
        let expected = RayStatus {
            active_nodes: vec![
                "node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f".to_string(),
                "node_035ea3b640e13f3603d3debd97de8c569ed8c8b10e19ce00ea4fd070".to_string(),
            ],
            pending_nodes_count: 0,
            recent_failures_count: 0,
        };
        let result = parse_ray_status(sample_output);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), expected);
    }

    /// Test with pending nodes and failures
    /// (counts within each section are summed).
    #[test]
    fn test_parse_ray_status_with_failing() {
        let sample_output_with_pending = r#"======== Autoscaler status: 2025-03-04 13:13:59.104771 ========
Node status
---------------------------------------------------------------
Active:
1 node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f
Pending:
2 node_pending_1
3 node_pending_2
Recent failures:
1 node_failure_1
4 node_failure_2
Resources
---------------------------------------------------------------
Usage:
0.0/256.0 CPU
"#;
        let expected_with_pending = RayStatus {
            active_nodes: vec![
                "node_b09a7440bd0987680f97c35206b2475251907d0c928fdd0f52b1b38f".to_string(),
            ],
            pending_nodes_count: 5, // 2 + 3
            recent_failures_count: 5, // 1 + 4
        };
        let result = parse_ray_status(sample_output_with_pending);
        assert!(result.is_some());
        assert_eq!(result.unwrap(), expected_with_pending);
    }

    /// Test with empty output
    /// (parses as a cluster with no nodes — never None today).
    #[test]
    fn test_parse_ray_status_empty() {
        let empty_output = "";
        let result = parse_ray_status(empty_output);
        assert!(result.is_some());
        assert_eq!(result.unwrap().active_nodes.len(), 0);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use pyo3::{types::IntoPyDict, Python};
use std::env;
use std::ffi::CString;
use std::path::{Path, PathBuf};
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("vllm_inc.py");
/// Start the Python vllm engine that listens on zmq socket
/// This is called by running `<bin> --internal-vllm-process`
/// This does not return until vllm exits.
pub fn run_subprocess(
    // The prefix to put on the zmq socket names
    socket_id: &str,
    // Full path to the model directory
    model_path: &Path,
    // Multi node settings. Usually Default::default
    node_config: MultiNodeConfig,
    // How many GPUs to use (tensor parallel)
    tp_size: u32,
    // Optional JSON file of extra vllm engine arguments
    extra_engine_args: Option<PathBuf>,
    // Whether to configure our vllm fork to emit KV routing events
    with_kv_routing: bool,
) -> anyhow::Result<()> {
    // Must run before Python starts so vllm sees the env vars.
    if with_kv_routing {
        set_kv_routing_vars()?;
    }
    pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
    // fix_venv works around the embedded interpreter missing the active
    // virtualenv; it is a no-op except on macOS.
    if let Ok(venv) = env::var("VIRTUAL_ENV") {
        let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
    }
    let model_path_str = model_path.display().to_string();
    let extra_engine_args_str = &extra_engine_args
        .map(|p| p.display().to_string())
        .unwrap_or_default();
    Python::with_gil(|py| {
        // These names become Python globals read by vllm_inc.py.
        let locals = [
            ("socket_id", socket_id),
            ("model_path", model_path_str.as_str()),
            ("tp_size_str", &tp_size.to_string()),
            ("nnodes_str", &node_config.num_nodes.to_string()),
            ("extra_engine_args", extra_engine_args_str),
            ("enable_prefix_caching", &with_kv_routing.to_string()),
        ]
        .into_py_dict(py)
        .unwrap();
        // py.run blocks until the embedded script (and thus vllm) finishes.
        if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
            anyhow::bail!("vllm engine run error: {err}");
        }
        tracing::info!("vllm subprocess exit");
        Ok(())
    })
}
// These environment variables trigger our vllm patch to emit KV routing events
fn set_kv_routing_vars() -> anyhow::Result<()> {
let exe = env::current_exe()?;
let exe_dir = exe
.parent()
.ok_or(anyhow::anyhow!("Current binary has no directory"))?;
let mut lib = PathBuf::from(exe_dir);
lib.set_file_name("libdynamo_llm_capi.so");
let vars = [
// Path to the C API Library
("VLLM_KV_CAPI_PATH", lib.display().to_string()),
// Identifiers to publish KV related information
("VLLM_KV_NAMESPACE", "dynamo".to_string()),
("VLLM_KV_COMPONENT", "vllm".to_string()),
// Worker ID used for identifying workers in distributed settings
("VLLM_WORKER_ID", "0".to_string()),
];
for (kvar, default_v) in vars {
if env::var(kvar).is_err() {
env::set_var(kvar, default_v);
}
}
Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
#
import json
import logging
import multiprocessing
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.multiprocessing.engine import run_mp_engine
from vllm.usage.usage_lib import UsageContext

# The names used below (model_path, tp_size_str, nnodes_str,
# enable_prefix_caching, extra_engine_args, socket_id) are injected as
# Python locals by the Rust caller (subprocess.rs).
arg_map = {
    "model": f"{model_path}",
    "served_model_name": None,
    "task": "generate",
    "skip_tokenizer_init": True,
    "seed": 0,
    "max_model_len": 8192,
    "max_seq_len_to_capture": 8192,
    "tensor_parallel_size": int(tp_size_str),
    "pipeline_parallel_size": int(nnodes_str),
    # enable_prefix_caching arrives as the string "true"/"false" from Rust
    "enable_prefix_caching": enable_prefix_caching.lower() == "true",
}
json_map = {}
if extra_engine_args != "":
    # extra_engine_args is a filename of a JSON file whose keys are extra
    # AsyncEngineArgs fields.
    try:
        with open(extra_engine_args) as f:
            json_map = json.load(f)
    except FileNotFoundError:
        # The user explicitly asked for this file: surface the problem at
        # error level (matches the embedded engine script) instead of
        # silently ignoring it at debug level.
        logging.error(f"File {extra_engine_args} not found.")
    except json.JSONDecodeError as e:
        logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
logging.debug(f"Adding extra engine arguments: {json_map}")
arg_map = {**arg_map, **json_map}  # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map)
# ZMQ IPC endpoint the Rust side connects to; socket_id keeps it unique.
ipc_path = f"ipc:///tmp/{socket_id}"
# NOTE(review): presumably a liveness flag shared with run_mp_engine — confirm
# against the vllm version in use.
engine_alive = multiprocessing.Value("b", True, lock=False)
run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive)
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the embedded vllm (v0.8) engine crate. Most package fields are
# inherited from the workspace-level Cargo.toml via `.workspace = true`.
[package]
name = "dynamo-engine-vllm0_8"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
# Dynamo workspace crates plus the async/serde stack, and the pyo3 toolchain
# used to embed and drive the Python vllm engine in-process.
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
# pyo3 with default features off; async/inspect features are experimental.
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
# Bridges pyo3 with the tokio runtime for awaiting Python coroutines.
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
    "attributes",
    "testing",
    "tokio-runtime",
    "unstable-streams",
] }
pythonize = { version = "0.23" }
regex = "1"
serde-pickle = "1.2.0"
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in lib.rs. Most work should be done in the Rust caller.
#
# Module-level imports for the embedded vllm engine script (included as a
# string in lib.rs; the Rust caller drives main()/run_response() below).
import json
import logging
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt
# TODO this should match DYN_LOG level
logging.basicConfig(level=logging.INFO)
async def main(request_queue, ready_event, extra_engine_args, **kwargs):
    """Build a vllm engine client and serve generation requests until stopped.

    Args:
        request_queue: Queue of (request_id, request, sampling_params,
            response_queue) tuples; a None entry is the stop sentinel.
        ready_event: Set once the engine client is up and requests can be sent.
        extra_engine_args: Filename of a JSON file with extra AsyncEngineArgs
            fields (takes precedence over kwargs), or "" for none.
        **kwargs: Base AsyncEngineArgs fields supplied by the Rust caller.
    """
    arg_map = kwargs
    if extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename
        try:
            with open(extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence
    engine_args = AsyncEngineArgs(**arg_map)
    # Main loop
    try:
        async with build_async_engine_client_from_engine_args(
            engine_args
        ) as engine_client:
            ready_event.set()
            while True:
                req = await request_queue.get()
                if req is None:  # Stop sentinel
                    break
                (request_id, request, sampling_params, response_queue) = req
                prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
                gen = engine_client.generate(prompt, sampling_params, request_id)
                try:
                    async for res in gen:
                        await response_queue.put(res)
                finally:
                    # Always deliver the end-of-stream sentinel, even when
                    # generation raises, so run_response() cannot block
                    # forever waiting for it.
                    await response_queue.put(None)
                    request_queue.task_done()
    except Exception as e:
        # NOTE: also reached for errors raised while serving a request,
        # not only for engine construction failures.
        logging.error(f"vllm init failed: {e}")
    finally:
        logging.debug("vllm worker stopped")
async def run_response(response_queue):
    """Yield every item from response_queue, ending with (and including) the
    None end-of-stream sentinel; each consumed item is marked task_done."""
    try:
        item = await response_queue.get()
        while item is not None:
            yield item
            response_queue.task_done()
            item = await response_queue.get()
        # Forward the sentinel itself so the consumer sees end-of-stream.
        yield None
        response_queue.task_done()
    except Exception as e:
        logging.error(f"failed reading response from vllm: {e}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment