Unverified Commit 42969800 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated docs on doing vllm and sglang multi-gpu (tensor parallel) and multi-node (pipeline parallel).
parent 5d89a0c8
...@@ -1533,68 +1533,6 @@ dependencies = [ ...@@ -1533,68 +1533,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "dynamo-engine-sglang"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_7"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_8"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"regex",
"serde",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.2.0" version = "0.2.0"
...@@ -1671,18 +1609,13 @@ dependencies = [ ...@@ -1671,18 +1609,13 @@ dependencies = [
"dynamo-engine-llamacpp", "dynamo-engine-llamacpp",
"dynamo-engine-mistralrs", "dynamo-engine-mistralrs",
"dynamo-engine-python", "dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-vllm0_7",
"dynamo-engine-vllm0_8",
"dynamo-llm", "dynamo-llm",
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
"futures-util", "futures-util",
"humantime", "humantime",
"libc", "libc",
"netlink-packet-route",
"regex", "regex",
"rtnetlink",
"serde", "serde",
"serde_json", "serde_json",
"tempfile", "tempfile",
...@@ -1718,7 +1651,7 @@ dependencies = [ ...@@ -1718,7 +1651,7 @@ dependencies = [
"local-ip-address", "local-ip-address",
"log", "log",
"nid", "nid",
"nix 0.29.0", "nix",
"nuid", "nuid",
"once_cell", "once_cell",
"prometheus", "prometheus",
...@@ -3165,12 +3098,6 @@ version = "1.70.1" ...@@ -3165,12 +3098,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.5" version = "0.10.5"
...@@ -3936,70 +3863,6 @@ dependencies = [ ...@@ -3936,70 +3863,6 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "netlink-packet-core"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4"
dependencies = [
"anyhow",
"byteorder",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-route"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74c171cd77b4ee8c7708da746ce392440cb7bcf618d122ec9ecc607b12938bf4"
dependencies = [
"anyhow",
"byteorder",
"libc",
"log",
"netlink-packet-core",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-utils"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34"
dependencies = [
"anyhow",
"byteorder",
"paste",
"thiserror 1.0.69",
]
[[package]]
name = "netlink-proto"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60"
dependencies = [
"bytes",
"futures",
"log",
"netlink-packet-core",
"netlink-sys",
"thiserror 2.0.12",
]
[[package]]
name = "netlink-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23"
dependencies = [
"bytes",
"futures",
"libc",
"log",
"tokio",
]
[[package]] [[package]]
name = "nibble_vec" name = "nibble_vec"
version = "0.1.0" version = "0.1.0"
...@@ -4020,17 +3883,6 @@ dependencies = [ ...@@ -4020,17 +3883,6 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
] ]
[[package]]
name = "nix"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
"bitflags 2.9.0",
"cfg-if 1.0.0",
"libc",
]
[[package]] [[package]]
name = "nix" name = "nix"
version = "0.29.0" version = "0.29.0"
...@@ -5357,24 +5209,6 @@ dependencies = [ ...@@ -5357,24 +5209,6 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "rtnetlink"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b684475344d8df1859ddb2d395dd3dac4f8f3422a1aa0725993cb375fc5caba5"
dependencies = [
"futures",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-packet-utils",
"netlink-proto",
"netlink-sys",
"nix 0.27.1",
"thiserror 1.0.69",
"tokio",
]
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
version = "0.1.24" version = "0.1.24"
...@@ -5697,19 +5531,6 @@ dependencies = [ ...@@ -5697,19 +5531,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.219" version = "1.0.219"
......
...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS ...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS
ENV CARGO_TARGET_DIR=/workspace/target ENV CARGO_TARGET_DIR=/workspace/target
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python && \ RUN cargo build --release --locked --features mistralrs,python && \
cargo doc --no-deps && \ cargo doc --no-deps && \
cp target/release/dynamo-run /usr/local/bin && \ cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \ cp target/release/http /usr/local/bin && \
......
...@@ -173,7 +173,7 @@ COPY launch /workspace/launch ...@@ -173,7 +173,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -59,7 +59,9 @@ RUN apt-get update -y && \ ...@@ -59,7 +59,9 @@ RUN apt-get update -y && \
ninja-build \ ninja-build \
pybind11-dev \ pybind11-dev \
# Rust build dependencies # Rust build dependencies
clang \
libclang-dev \ libclang-dev \
git \
# Install utilities # Install utilities
nvtop \ nvtop \
tmux \ tmux \
...@@ -305,7 +307,7 @@ COPY launch /workspace/launch ...@@ -305,7 +307,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
This diff is collapsed.
...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI" ...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI"
[features] [features]
# Build with `--no-default-features` to disable these defaults # Build with `--no-default-features` to disable these defaults
# We don't include llamacpp by default until we figure out when it needs external libraries default = ["mistralrs"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"] mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"] llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
python = ["dep:dynamo-engine-python"] python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"] cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true } ...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true } dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true } dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true } dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true } anyhow = { workspace = true }
...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] } ...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] }
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
futures-util = { version = "0.3" } futures-util = { version = "0.3" }
regex = "1" regex = "1"
[target.x86_64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.x86_64-unknown-linux-musl.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.aarch64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr;
use clap::ValueEnum; use clap::ValueEnum;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
...@@ -106,21 +105,6 @@ pub struct Flags { ...@@ -106,21 +105,6 @@ pub struct Flags {
#[arg(long, default_value = "round-robin")] #[arg(long, default_value = "round-robin")]
pub router_mode: RouterMode, pub router_mode: RouterMode,
/// Internal use only.
// Start the python vllm engine sub-process.
#[arg(long, hide = true, default_value = "false")]
pub internal_vllm_process: bool,
/// Internal use only.
/// Start the sglang Python sub-process.
/// The params in the tuple are:
/// - the fd of the write end of a pipe where sglang will signal that it's ready.
/// - the node rank (0 for first host, 1 for second host, etc)
/// - the workers' rank (globally unique)
/// - the GPU to use (locally unique)
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
pub internal_sglang_process: Option<SgLangFlags>,
/// Additional engine-specific arguments from a JSON file. /// Additional engine-specific arguments from a JSON file.
/// Contains a mapping of parameter names to values. /// Contains a mapping of parameter names to values.
#[arg(long)] #[arg(long)]
...@@ -200,30 +184,6 @@ impl Flags { ...@@ -200,30 +184,6 @@ impl Flags {
} }
} }
#[derive(Debug, Clone, Copy)]
pub struct SgLangFlags {
pub pipe_fd: u32,
pub tp_rank: u32,
pub gpu_id: u32,
}
fn parse_sglang_flags(s: &str) -> Result<SgLangFlags, String> {
let nums: Vec<u32> = s
.split(',')
.map(u32::from_str)
.collect::<Result<Vec<_>, _>>()
.map_err(|e| e.to_string())?;
if nums.len() != 3 {
return Err("Need exactly 3 numbers".into());
}
Ok(SgLangFlags {
pipe_fd: nums[0],
tp_rank: nums[1],
gpu_id: nums[2],
})
}
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)] #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)]
pub enum RouterMode { pub enum RouterMode {
#[default] #[default]
......
...@@ -183,7 +183,6 @@ pub async fn prepare_engine( ...@@ -183,7 +183,6 @@ pub async fn prepare_engine(
_cache_dir: None, _cache_dir: None,
}) })
} }
EngineConfig::None => unreachable!(),
} }
} }
......
...@@ -91,7 +91,6 @@ pub async fn run( ...@@ -91,7 +91,6 @@ pub async fn run(
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
} }
EngineConfig::None => unreachable!(),
}; };
tokio::select! { tokio::select! {
......
...@@ -97,7 +97,6 @@ pub async fn run( ...@@ -97,7 +97,6 @@ pub async fn run(
.await?; .await?;
manager.add_completions_model(model.service_name(), cmpl_pipeline)?; manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
} }
EngineConfig::None => unreachable!(),
} }
http_service.run(runtime.primary_token()).await?; http_service.run(runtime.primary_token()).await?;
runtime.shutdown(); // Cancel primary token runtime.shutdown(); // Cancel primary token
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use std::{io::Read, sync::Arc, time::Duration}; use std::{io::Read, sync::Arc, time::Duration};
use anyhow::Context; use anyhow::Context;
use dynamo_llm::{ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, LocalModel};
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
LocalModel,
};
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
mod input; mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt; mod opt;
pub use dynamo_llm::request_template::RequestTemplate; pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::{Input, Output}; pub use opt::{Input, Output};
...@@ -38,19 +20,12 @@ mod subprocess; ...@@ -38,19 +20,12 @@ mod subprocess;
/// the command line. Hence it's optional, and defaults to this. /// the command line. Hence it's optional, and defaults to this.
const INVISIBLE_MODEL_NAME: &str = "dynamo-run"; const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
/// The component name for the KV publisher, if used
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2); const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
/// How we identify a python string endpoint /// How we identify a python string endpoint
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_STR_SCHEME: &str = "pystr:"; const PYTHON_STR_SCHEME: &str = "pystr:";
/// How we identify a python token endpoint
#[cfg(feature = "python")]
const PYTHON_TOK_SCHEME: &str = "pytok:";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
...@@ -66,24 +41,13 @@ pub enum EngineConfig { ...@@ -66,24 +41,13 @@ pub enum EngineConfig {
engine: ExecutionContext, engine: ExecutionContext,
model: Box<LocalModel>, model: Box<LocalModel>,
}, },
/// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work.
None,
} }
/// Distributed system values
struct DynInput {
endpoint_id: Endpoint,
distributed_runtime: DistributedRuntime,
}
#[allow(unused_mut)]
pub async fn run( pub async fn run(
runtime: dynamo_runtime::Runtime, runtime: dynamo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it in_opt: Input,
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
#[allow(unused_variables)] zmq_socket_prefix: Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags let maybe_path = flags
...@@ -120,29 +84,6 @@ pub async fn run( ...@@ -120,29 +84,6 @@ pub async fn run(
} }
}; };
let dyn_input = match &in_opt {
Input::Endpoint(endpoint_path) => {
if maybe_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false)
&& flags.model_config.is_none()
{
// TODO We need to convert tokenizer extract from GGUF file into something we can
// publish to NATS. Ideally `tokenizer.json` directly, but otherwise an
// intermediate format.
tracing::error!("Serving GGUF files in a distributed system requires `--model-config <hf-repo-dir>` so that we can find the tokenzier config");
return Ok(());
}
// If we are in a distributed system, we need to know our component upfront
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
let endpoint_id: Endpoint = endpoint_path.parse()?;
Some(DynInput {
endpoint_id,
distributed_runtime,
})
}
_ => None,
};
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
let template = if let Some(path) = flags.request_template.as_ref() { let template = if let Some(path) = flags.request_template.as_ref() {
...@@ -183,13 +124,17 @@ pub async fn run( ...@@ -183,13 +124,17 @@ pub async fn run(
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?, engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
model: Box::new(local_model), model: Box::new(local_model),
}, },
Output::SgLang => { Output::SgLang => {
if !local_model.path().is_dir() { if !local_model.path().is_dir() {
// TODO Does sglang support GGUF? Can we make it work? // TODO Does sglang support GGUF? Can we make it work?
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
let (py_script, mut child) = match subprocess::start( let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let (py_script, child) = match subprocess::start(
subprocess::sglang::PY, subprocess::sglang::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -198,6 +143,11 @@ pub async fn run( ...@@ -198,6 +143,11 @@ pub async fn run(
} else { } else {
Some(flags.base_gpu_id) Some(flags.base_gpu_id)
}, },
if flags.num_nodes <= 1 {
None
} else {
Some(multi_node_conf)
},
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -216,151 +166,16 @@ pub async fn run( ...@@ -216,151 +166,16 @@ pub async fn run(
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?; let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
#[cfg(feature = "sglang")]
Output::SgLangLegacy => {
if !local_model.path().is_dir() {
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see 'gloo' errors from sglang try setting these environment variables:");
tracing::info!("export GLOO_SOCKET_IFNAME={if_name}");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Follower nodes take input from leader node over pytorch distributed, not
// from user.
in_opt = Input::None;
}
}
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.base_gpu_id,
flags.extra_engine_args.clone(),
)
.await?;
extra = Some(Box::pin(async move {
let _ = sglang_process.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_7 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see network errors from vllm try setting this environment variable:");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Only node 0 runs vllm, the others communicate over ray
in_opt = Input::None;
}
}
if node_conf.node_rank == 0 {
let kv_metrics_publisher = if let Some(dyn_input) = &dyn_input {
let kvp_component = dyn_input
.distributed_runtime
.namespace(dyn_input.endpoint_id.namespace.clone())?
.component(KV_PUBLISHER_COMPONENT)?;
let kvp = Arc::new(KvMetricsPublisher::new()?);
let kvp_inner = kvp.clone();
tokio::spawn(
async move { kvp_inner.create_endpoint(kvp_component, None).await },
);
Some(kvp)
} else {
None
};
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
kv_metrics_publisher,
)
.await?;
extra = Some(Box::pin(async move {
let _ = vllm_future.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future =
dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_8 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let engine = dynamo_engine_vllm0_8::make_engine(
cancel_token.clone(),
local_model.path(),
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
)
.await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
// No feature flag because it uses a sub-process, it's very cheap to include
Output::Vllm => { Output::Vllm => {
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let (py_script, mut child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::vllm::PY, subprocess::vllm::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
None, // multi-node config. vllm uses `ray`, see guide
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -405,18 +220,6 @@ pub async fn run( ...@@ -405,18 +220,6 @@ pub async fn run(
model: Box::new(local_model), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
let card = local_model.card();
let py_args = flags.as_vec(&path_str, &card.service_name);
let p = std::path::PathBuf::from(path_str);
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
}; };
match in_opt { match in_opt {
...@@ -443,16 +246,8 @@ pub async fn run( ...@@ -443,16 +246,8 @@ pub async fn run(
.await?; .await?;
} }
Input::Endpoint(path) => { Input::Endpoint(path) => {
let Some(dyn_input) = dyn_input else { let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
unreachable!("We set dyn_input earlier"); crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
};
crate::input::endpoint::run(dyn_input.distributed_runtime, path, engine_config).await?;
}
Input::None => {
// Multi-node setup. The engine sub-process has been started and is talking
// to it's node_rank 0 controller. We do nothing.
// TODO: Acquire an etcd lease, we are running
cancel_token.cancelled().await;
} }
} }
......
...@@ -24,15 +24,13 @@ const HELP: &str = r#" ...@@ -24,15 +24,13 @@ const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally. dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
Example: Example:
- cargo build --release --features mistralrs,cuda - cargo build --features cuda -p dynamo-run
- cd target/release - cd target/debug
- ./dynamo-run hf_checkouts/Llama-3.2-3B-Instruct/ - ./dynamo-run Qwen/Qwen2.5-3B-Instruct
- OR: ./dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf - OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#; "#;
const ZMQ_SOCKET_PREFIX: &str = "dyn"; const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=ENGINE_LIST [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> { ...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> {
logging::init(); logging::init();
// Call sub-processes before starting the Runtime machinery
// For anything except sub-process starting try_parse_from will error.
if let Ok(flags) = dynamo_run::Flags::try_parse_from(env::args()) {
#[allow(unused_variables)]
if let Some(sglang_flags) = flags.internal_sglang_process {
let Some(model_path) = flags.model_path_flag.as_ref() else {
anyhow::bail!("sglang subprocess requires --model-path");
};
if !model_path.is_dir() {
anyhow::bail!("sglang subprocess requires model path to be a directory containing the safetensors files");
}
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
};
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
node_config,
gpu_config,
flags.extra_engine_args,
);
}
} else {
panic!("Rebuild with --features=sglang");
}
}
#[allow(unused_variables)]
if flags.internal_vllm_process {
let Some(model_path) = flags.model_path_flag else {
anyhow::bail!("vllm subprocess requires --model-path flag");
};
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_vllm0_7::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
flags.tensor_parallel_size,
flags.extra_engine_args,
flags.router_mode.is_kv_routing(),
);
}
} else {
panic!("Rebuild with --features=vllm");
}
}
}
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?; let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
.chain(env::args().skip(non_flag_params)), .chain(env::args().skip(non_flag_params)),
)?; )?;
dynamo_run::run( dynamo_run::run(runtime, in_opt, out_opt, flags).await
runtime,
in_opt,
out_opt,
flags,
Some(ZMQ_SOCKET_PREFIX.to_string()),
)
.await
} }
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it. /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Mac build uses none of this
#![allow(dead_code)]
/// Best-effort detection of the host's primary network interface name.
///
/// On Linux this queries the kernel over netlink (see `unix` module below).
/// Returns `Ok(None)` when no suitable interface can be identified.
#[cfg(target_os = "linux")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    unix::get_primary_interface().await
}
/// macOS stub: interface discovery here is netlink-based and Linux-only,
/// so on macOS we simply report "no interface found" (see "Mac build uses
/// none of this" note above).
#[cfg(target_os = "macos")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
    Ok(None)
}
/// Error produced while fetching interface link data from the kernel.
#[derive(Debug)]
pub struct LinkDataError {
    // Which phase failed: socket setup vs. the netlink exchange itself.
    kind: LinkDataErrorKind,
    // Interface the failure relates to, when known; surfaced by `Display`.
    // NOTE(review): nothing in this file ever sets it to `Some` — confirm
    // whether callers elsewhere populate it.
    interface: Option<String>,
}
impl LinkDataError {
    /// Wrap an I/O error from establishing the netlink socket.
    /// The failure is not tied to any particular interface.
    fn connection(connection_error: std::io::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Connection(connection_error),
            interface: None,
        }
    }

    /// Wrap an rtnetlink protocol error from an established connection.
    /// The failure is not tied to any particular interface.
    #[cfg(target_os = "linux")]
    fn communication(communication_error: rtnetlink::Error) -> Self {
        Self {
            kind: LinkDataErrorKind::Communication(communication_error),
            interface: None,
        }
    }
}
impl std::fmt::Display for LinkDataError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Mention the interface only when we know which one the failure
        // concerns; otherwise emit the bare message.
        match self.interface.as_deref() {
            Some(interface) => {
                write!(f, "could not get interface link data for {interface}")
            }
            None => write!(f, "could not get interface link data"),
        }
    }
}
impl std::error::Error for LinkDataError {
    /// Expose the wrapped OS / netlink error as the cause chain.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match &self.kind {
            LinkDataErrorKind::Connection(e) => Some(e),
            #[cfg(target_os = "linux")]
            LinkDataErrorKind::Communication(e) => Some(e),
        }
    }
}
/// The two failure classes when talking to the kernel over netlink.
#[derive(Debug)]
pub enum LinkDataErrorKind {
    /// Failed to open/establish the netlink socket.
    Connection(std::io::Error),
    /// Failed during a netlink request/response exchange (Linux only,
    /// since rtnetlink is a Linux interface).
    #[cfg(target_os = "linux")]
    Communication(rtnetlink::Error),
}
//! Linux implementation: query interface state via rtnetlink (the same
//! kernel interface `ip link` / `ip addr` use).
#[cfg(target_os = "linux")]
mod unix {
    use futures_util::TryStreamExt;
    use netlink_packet_route::address::AddressAttribute;
    use netlink_packet_route::link::LinkLayerType;
    use netlink_packet_route::link::State as LinkState;
    use netlink_packet_route::link::{LinkAttribute, LinkMessage};
    use netlink_packet_route::AddressFamily;
    use std::collections::HashMap;
    use std::collections::HashSet;
    use std::collections::VecDeque;

    /// Pick the "primary" interface: the first IPv4-addressed interface that
    /// is Ethernet, administratively up, has a physical carrier, and whose
    /// name starts with "e" (the systemd predictable-naming prefix for
    /// Ethernet devices, e.g. `eth0`, `enp3s0`).
    ///
    /// Returns `Ok(None)` when nothing matches. Note: iteration order comes
    /// from a `HashMap`, so if several interfaces qualify the choice among
    /// them is arbitrary.
    pub async fn get_primary_interface() -> Result<Option<String>, super::LinkDataError> {
        let mut candidates: VecDeque<String> = get_ipv4_interface_links()
            .await?
            .into_iter()
            .filter(|(k, v)| {
                v.is_ethernet() && v.link_is_up() && v.has_carrier() && k.starts_with("e")
            })
            .map(|(k, _)| k)
            .collect();
        Ok(candidates.pop_front())
    }

    #[derive(Clone, Debug)]
    // `state` is Option<T> because the netlink protocol allows the attribute
    // to be absent (even though we have no reason to believe it'd ever
    // actually be missing).
    struct InterfaceLinkData {
        // Layer-2 type of the link (Ethernet, loopback, ...).
        link_type: LinkLayerType,
        // Operational state as reported by the kernel (IF_OPER_*).
        state: Option<LinkState>,
        // Whether the kernel reports a physical carrier (cable plugged in).
        has_carrier: bool,
    }

    impl InterfaceLinkData {
        /// True only when the kernel reports the link operationally Up;
        /// a missing state attribute counts as "not up".
        pub fn link_is_up(&self) -> bool {
            self.state
                .map(|state| matches!(state, LinkState::Up))
                .unwrap_or(false)
        }

        /// True for Ethernet-type links.
        pub fn is_ethernet(&self) -> bool {
            matches!(self.link_type, LinkLayerType::Ether)
        }

        /// True when a physical carrier was reported.
        pub fn has_carrier(&self) -> bool {
            self.has_carrier
        }
    }

    impl From<LinkMessage> for InterfaceLinkData {
        /// Extract the fields we care about from a raw netlink link message.
        fn from(link_message: LinkMessage) -> Self {
            let link_type = link_message.header.link_layer_type;
            // First OperState attribute, if any.
            let state = link_message
                .attributes
                .iter()
                .find_map(|attribute| match attribute {
                    LinkAttribute::OperState(state) => Some(*state),
                    _ => None,
                });
            // Carrier(1) means "carrier present"; absent or any other value
            // is treated as no carrier.
            let has_carrier = link_message
                .attributes
                .iter()
                .find_map(|attribute| match attribute {
                    LinkAttribute::Carrier(1) => Some(true),
                    _ => None,
                })
                .unwrap_or(false);
            InterfaceLinkData {
                link_type,
                state,
                has_carrier,
            }
        }
    }

    // Retrieve the link data (state, MTU, etc.) for all interfaces, and return
    // them as a HashMap keyed by interface name. This is roughly equivalent to `ip
    // link show` since we're using the same netlink interface under the hood as
    // that command. Only interfaces that also carry an IPv4 address (matched by
    // the address message's Label attribute) are included.
    async fn get_ipv4_interface_links(
    ) -> Result<HashMap<String, InterfaceLinkData>, super::LinkDataError> {
        let (netlink_connection, rtnetlink_handle, _receiver) =
            rtnetlink::new_connection().map_err(super::LinkDataError::connection)?;
        // We have to spawn off the netlink connection because of the architecture
        // of `netlink_proto::Connection`, which runs in the background and owns
        // the socket. We communicate with it via channel messages, and it will exit
        // when both `rtnetlink_handle` and `_receiver` go out of scope.
        tokio::spawn(netlink_connection);
        // Pass 1: collect the names (labels) of interfaces with an IPv4 address
        // (roughly `ip -4 addr show`).
        let address_handle = rtnetlink_handle.address().get().execute();
        let ipv4s: HashSet<String> = address_handle
            .try_filter_map(|addr_message| async move {
                if matches!(addr_message.header.family, AddressFamily::Inet) {
                    Ok(addr_message
                        .attributes
                        .into_iter()
                        .find(|attr| matches!(attr, AddressAttribute::Label(_)))
                        .and_then(|x| match x {
                            AddressAttribute::Label(label) => Some(label),
                            _ => None,
                        }))
                } else {
                    Ok(None)
                }
            })
            .try_collect()
            .await
            .map_err(super::LinkDataError::communication)?;
        // Pass 2: fetch link data for every interface, keeping only those whose
        // name appeared in the IPv4 set above.
        let link_handle = rtnetlink_handle.link().get().execute();
        link_handle
            .try_filter_map(|link_message| async {
                let maybe_interface_data = match extract_interface_name(&link_message) {
                    Some(interface_name) => {
                        if ipv4s.contains(&interface_name) {
                            Some((interface_name, InterfaceLinkData::from(link_message)))
                        } else {
                            None
                        }
                    }
                    None => {
                        // Unnamed interfaces shouldn't happen; log and skip.
                        let idx = link_message.header.index;
                        eprintln!(
                            "Network interface with index {idx} doesn't have a name (no IfName attribute)"
                        );
                        None
                    }
                };
                Ok(maybe_interface_data)
            })
            .try_collect()
            .await
            .map_err(super::LinkDataError::communication)
    }

    /// Pull the interface name (IfName attribute) out of a link message, if present.
    fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
        link_message
            .attributes
            .iter()
            .find_map(|attribute| match attribute {
                LinkAttribute::IfName(name) => Some(name.clone()),
                _ => None,
            })
    }
}
...@@ -35,11 +35,6 @@ pub enum Input { ...@@ -35,11 +35,6 @@ pub enum Input {
/// Batch mode. Run all the prompts, write the outputs, exit. /// Batch mode. Run all the prompts, write the outputs, exit.
Batch(PathBuf), Batch(PathBuf),
/// Start the engine but don't provide any way to talk to it.
/// For multi-node sglang, where the engine connects directly
/// to the co-ordinator via torch distributed / nccl.
None,
} }
impl TryFrom<&str> for Input { impl TryFrom<&str> for Input {
...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input { ...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input {
"http" => Ok(Input::Http), "http" => Ok(Input::Http),
"text" => Ok(Input::Text), "text" => Ok(Input::Text),
"stdin" => Ok(Input::Stdin), "stdin" => Ok(Input::Stdin),
"none" => Ok(Input::None),
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => { endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
Ok(Input::Endpoint(endpoint_path.to_string())) Ok(Input::Endpoint(endpoint_path.to_string()))
} }
...@@ -71,7 +65,6 @@ impl fmt::Display for Input { ...@@ -71,7 +65,6 @@ impl fmt::Display for Input {
Input::Stdin => "stdin", Input::Stdin => "stdin",
Input::Endpoint(path) => path, Input::Endpoint(path) => path,
Input::Batch(path) => &path.display().to_string(), Input::Batch(path) => &path.display().to_string(),
Input::None => "none",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -101,39 +94,21 @@ pub enum Output { ...@@ -101,39 +94,21 @@ pub enum Output {
/// Run inference on a model in a GGUF file using mistralrs w/ candle /// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs, MistralRs,
#[cfg(feature = "sglang")]
/// Deprecated
SgLangLegacy,
/// Run inference using sglang
SgLang,
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
/// Run inference using llama.cpp /// Run inference using llama.cpp
LlamaCpp, LlamaCpp,
/// Run inference using sglang
SgLang,
// Start vllm in a sub-process connecting via nats // Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>` // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm, Vllm,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.8.X+
Vllm0_8,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.7.X
Vllm0_7,
/// Run inference using a user supplied python file that accepts and returns /// Run inference using a user supplied python file that accepts and returns
/// strings. It does it's own pre-processing. /// strings. It does it's own pre-processing.
#[cfg(feature = "python")] #[cfg(feature = "python")]
PythonStr(String), PythonStr(String),
/// Run inference using a user supplied python file that accepts and returns
/// tokens. We do the pre-processing.
#[cfg(feature = "python")]
PythonTok(String),
//
// DEVELOPER NOTE // DEVELOPER NOTE
// If you add an engine add it to `available_engines` below, and to Default if it makes sense // If you add an engine add it to `available_engines` below, and to Default if it makes sense
} }
...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output { ...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
"mistralrs" => Ok(Output::MistralRs), "mistralrs" => Ok(Output::MistralRs),
#[cfg(feature = "sglang")]
"sglang_legacy" => Ok(Output::SgLangLegacy),
"sglang" => Ok(Output::SgLang),
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp), "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
"sglang" => Ok(Output::SgLang),
"vllm" => Ok(Output::Vllm), "vllm" => Ok(Output::Vllm),
#[cfg(feature = "vllm")]
"vllm0_8" => Ok(Output::Vllm0_8),
#[cfg(feature = "vllm")]
"vllm0_7" => Ok(Output::Vllm0_7),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output { ...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output {
Ok(Output::PythonStr(path.to_string())) Ok(Output::PythonStr(path.to_string()))
} }
#[cfg(feature = "python")]
python_tok_gen if python_tok_gen.starts_with(crate::PYTHON_TOK_SCHEME) => {
let path = python_tok_gen
.strip_prefix(crate::PYTHON_TOK_SCHEME)
.unwrap();
Ok(Output::PythonTok(path.to_string()))
}
e => Err(anyhow::anyhow!("Invalid out= option '{e}'")), e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
} }
} }
...@@ -196,21 +154,12 @@ impl fmt::Display for Output { ...@@ -196,21 +154,12 @@ impl fmt::Display for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => "mistralrs", Output::MistralRs => "mistralrs",
#[cfg(feature = "sglang")]
Output::SgLangLegacy => "sglang_legacy",
Output::SgLang => "sglang",
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => "llamacpp", Output::LlamaCpp => "llamacpp",
Output::SgLang => "sglang",
Output::Vllm => "vllm", Output::Vllm => "vllm",
#[cfg(feature = "vllm")]
Output::Vllm0_8 => "vllm0_8",
#[cfg(feature = "vllm")]
Output::Vllm0_7 => "vllm0_7",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
...@@ -218,9 +167,6 @@ impl fmt::Display for Output { ...@@ -218,9 +167,6 @@ impl fmt::Display for Output {
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(_) => "pystr", Output::PythonStr(_) => "pystr",
#[cfg(feature = "python")]
Output::PythonTok(_) => "pytok",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -258,22 +204,11 @@ impl Output { ...@@ -258,22 +204,11 @@ impl Output {
} }
out.push(Output::SgLang.to_string()); out.push(Output::SgLang.to_string());
#[cfg(feature = "sglang")]
{
out.push(Output::SgLangLegacy.to_string());
}
out.push(Output::Vllm.to_string()); out.push(Output::Vllm.to_string());
#[cfg(feature = "vllm")]
{
out.push(Output::Vllm0_7.to_string());
out.push(Output::Vllm0_8.to_string());
}
#[cfg(feature = "python")] #[cfg(feature = "python")]
{ {
out.push(Output::PythonStr("file.py".to_string()).to_string()); out.push(Output::PythonStr("file.py".to_string()).to_string());
out.push(Output::PythonTok("file.py".to_string()).to_string());
} }
out out
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow; use std::borrow::Cow;
use std::io::Write; use std::io::Write;
...@@ -23,6 +11,8 @@ use anyhow::Context; ...@@ -23,6 +11,8 @@ use anyhow::Context;
use regex::Regex; use regex::Regex;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use dynamo_llm::engines::MultiNodeConfig;
pub mod sglang; pub mod sglang;
pub mod vllm; pub mod vllm;
...@@ -39,6 +29,8 @@ pub async fn start( ...@@ -39,6 +29,8 @@ pub async fn start(
// sglang which GPU to start from, on a multi-GPU system // sglang which GPU to start from, on a multi-GPU system
// vllm uses CUDA_VISIBLE_DEVICES // vllm uses CUDA_VISIBLE_DEVICES
base_gpu_id: Option<u32>, base_gpu_id: Option<u32>,
// sglang multi-node config. vllm uses `ray` externally
multi_node_config: Option<MultiNodeConfig>,
// Path to a JSON file containing extra arguments to the backend engine // Path to a JSON file containing extra arguments to the backend engine
extra_engine_args: Option<&Path>, extra_engine_args: Option<&Path>,
) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> { ) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
...@@ -61,6 +53,15 @@ pub async fn start( ...@@ -61,6 +53,15 @@ pub async fn start(
args.push("--base-gpu-id".to_string()); args.push("--base-gpu-id".to_string());
args.push(base_gpu_id.to_string()); args.push(base_gpu_id.to_string());
} }
// sglang only
if let Some(multi_node_config) = multi_node_config {
args.push("--nnodes".to_string());
args.push(multi_node_config.num_nodes.to_string());
args.push("--node-rank".to_string());
args.push(multi_node_config.node_rank.to_string());
args.push("--dist-init-addr".to_string());
args.push(multi_node_config.leader_addr);
}
if let Some(extra_engine_args) = extra_engine_args { if let Some(extra_engine_args) = extra_engine_args {
args.push("--extra-engine-args".to_string()); args.push("--extra-engine-args".to_string());
args.push(extra_engine_args.to_string_lossy().to_string()); args.push(extra_engine_args.to_string_lossy().to_string());
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# #
# A very basic example of sglang worker handling pre-processed requests. # A very basic example of sglang worker handling pre-processed requests.
...@@ -52,6 +39,9 @@ class Config: ...@@ -52,6 +39,9 @@ class Config:
model: str model: str
base_gpu_id: int base_gpu_id: int
tensor_parallel_size: int tensor_parallel_size: int
nnodes: int
node_rank: int
dist_init_addr: str
extra_engine_args: str extra_engine_args: str
...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config):
"tp_size": config.tensor_parallel_size, "tp_size": config.tensor_parallel_size,
"base_gpu_id": config.base_gpu_id, "base_gpu_id": config.base_gpu_id,
} }
if config.dist_init_addr != "":
arg_map["trust_remote_code"] = True
arg_map["nnodes"] = config.nnodes
arg_map["dist_init_addr"] = config.dist_init_addr
# In practice this is always 0 because Dynamo only manages the leader
arg_map["node_rank"] = config.node_rank
if config.extra_engine_args != "": if config.extra_engine_args != "":
json_map = {} json_map = {}
# extra_engine_args is a filename # extra_engine_args is a filename
...@@ -157,6 +154,21 @@ def cmd_line_args(): ...@@ -157,6 +154,21 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--nnodes", type=int, default=1, help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
parser.add_argument( parser.add_argument(
"--extra-engine-args", "--extra-engine-args",
type=str, type=str,
...@@ -183,6 +195,9 @@ def cmd_line_args(): ...@@ -183,6 +195,9 @@ def cmd_line_args():
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.base_gpu_id = args.base_gpu_id config.base_gpu_id = args.base_gpu_id
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.nnodes = args.nnodes
config.node_rank = args.node_rank
config.dist_init_addr = args.dist_init_addr
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
return config return config
......
...@@ -66,19 +66,20 @@ class RequestHandler: ...@@ -66,19 +66,20 @@ class RequestHandler:
Request handler for the generate endpoint Request handler for the generate endpoint
""" """
def __init__(self, engine): def __init__(self, engine, default_sampling_params):
self.engine_client = engine self.engine_client = engine
self.default_sampling_params = default_sampling_params
async def generate(self, request): async def generate(self, request):
request_id = "1" # hello_world example only request_id = "1" # hello_world example only
logging.debug(f"Received request: {request}") logging.debug(f"Received request: {request}")
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams(
temperature=request["sampling_options"]["temperature"], sampling_params = SamplingParams(**self.default_sampling_params)
# vllm defaults this to 16 sampling_params.temperature = request["sampling_options"]["temperature"]
max_tokens=request["stop_conditions"]["max_tokens"], sampling_params.max_tokens = request["stop_conditions"]["max_tokens"]
)
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
gen = self.engine_client.generate(prompt, sampling_params, request_id) gen = self.engine_client.generate(prompt, sampling_params, request_id)
async for res in gen: async for res in gen:
...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config):
arg_map = {**arg_map, **json_map} # json_map gets precedence arg_map = {**arg_map, **json_map} # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map) engine_args = AsyncEngineArgs(**arg_map)
model_config = engine_args.create_model_config()
# Load default sampling params from `generation_config.json`
default_sampling_params = model_config.get_diff_sampling_param()
engine_context = build_async_engine_client_from_engine_args(engine_args) engine_context = build_async_engine_client_from_engine_args(engine_args)
engine_client = await engine_context.__aenter__() engine_client = await engine_context.__aenter__()
# the server will gracefully shutdown (i.e., keep opened TCP streams finishes) # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
# after the lease is revoked # after the lease is revoked
await endpoint.serve_endpoint(RequestHandler(engine_client).generate, None) await endpoint.serve_endpoint(
RequestHandler(engine_client, default_sampling_params).generate, None
)
def cmd_line_args(): def cmd_line_args():
......
...@@ -36,7 +36,6 @@ use tokio::sync::mpsc; ...@@ -36,7 +36,6 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender; use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt}; use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::{EngineDispatcher, StreamingEngine}; use dynamo_llm::engines::{EngineDispatcher, StreamingEngine};
/// Python snippet to import a file as a module /// Python snippet to import a file as a module
...@@ -89,26 +88,6 @@ pub async fn make_string_engine( ...@@ -89,26 +88,6 @@ pub async fn make_string_engine(
Ok(engine) Ok(engine)
} }
/// An engine that takes and returns tokens.
pub async fn make_token_engine(
cancel_token: CancellationToken,
py_file: &Path,
py_args: Vec<String>,
) -> pipeline_error::Result<ExecutionContext> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
}
let engine = new_engine(cancel_token, py_file, py_args).await?;
let engine: ExecutionContext = Arc::new(engine);
Ok(engine)
}
#[derive(Clone)] #[derive(Clone)]
pub struct PythonServerStreamingEngine { pub struct PythonServerStreamingEngine {
_cancel_token: CancellationToken, _cancel_token: CancellationToken,
...@@ -128,17 +107,6 @@ async fn new_engine( ...@@ -128,17 +107,6 @@ async fn new_engine(
let user_module = let user_module =
python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?; python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?;
let generator = Python::with_gil(|py| { let generator = Python::with_gil(|py| {
/* Leave commented, `initialize` may be needed to match Triton
if let Ok(initialize) = user_module.getattr(py, "initialize") {
initialize
.call1(py, (py_args,))
.inspect_err(|err| {
println!();
err.display(py);
})
.with_context(|| "Failed calling python engine's initialize(args)")?;
};
*/
user_module user_module
.getattr(py, "generate") .getattr(py, "generate")
.with_context(|| "generate") .with_context(|| "generate")
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
/// Token-in/token-out streaming engine backed by an sglang worker
/// (see `super::worker`).
pub struct SgLangEngine {
    // Cancelling this token terminates all in-flight response streams.
    cancel_token: CancellationToken,
    // Bridge used to enqueue requests to the sglang worker and to take its
    // join handle.
    worker: super::worker::SgLangWorker,
}
impl SgLangEngine {
    /// Start the sglang worker and wrap it in an engine.
    ///
    /// * `sock_code` - identifier used to name the ZMQ socket pair.
    /// * `model_path` - model weights to load.
    /// * `node_conf` - multi-node (pipeline parallel) settings.
    /// * `tensor_parallel_size` - GPUs per node.
    /// * `base_gpu_id` - first GPU index to use on a multi-GPU system.
    /// * `extra_engine_args` - optional JSON file of extra engine arguments.
    ///
    /// # Errors
    /// Returns any error from starting the worker.
    pub async fn new(
        cancel_token: CancellationToken,
        sock_code: &str,
        model_path: &Path,
        node_conf: MultiNodeConfig,
        tensor_parallel_size: u32,
        base_gpu_id: u32,
        extra_engine_args: Option<PathBuf>,
    ) -> anyhow::Result<Self> {
        let worker = super::worker::start(
            cancel_token.clone(),
            sock_code,
            model_path,
            node_conf,
            tensor_parallel_size,
            base_gpu_id,
            extra_engine_args,
        )
        .await?;
        Ok(Self {
            cancel_token,
            worker,
        })
    }

    /// Take ownership of the worker's background task handle so the caller
    /// can await its completion. Delegates to the worker's `take_sglang_handle`.
    pub fn take_sglang_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
        self.worker.take_sglang_handle()
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for SgLangEngine
{
    /// Enqueue a pre-processed (tokenized) request on the sglang worker and
    /// return a stream of its outputs.
    ///
    /// # Errors
    /// Fails if the request cannot be enqueued on the worker.
    async fn generate(
        &self,
        request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        // Split the payload from its pipeline context; the context is
        // re-attached to the response stream at the end.
        let (request, context) = request.into_parts();
        let ctx = context.context();
        let request_id = ctx.id().to_string();

        // Bounded channel over which the worker pushes response chunks.
        let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);

        let work_req = super::worker::WorkRequest {
            // Same id as `request_id` above (both come from the request context).
            request_id: context.id().to_string(),
            request,
            response_channel: resp_tx,
        };
        self.worker.enqueue_request(work_req).await?;

        let cancel_token = self.cancel_token.clone();
        // Adapt the channel into an async stream. The stream ends either when
        // the engine is cancelled, or when the worker drops its sender
        // (channel closed: generation finished or worker gone).
        let output = stream! {
            loop {
                tokio::select! {
                    _ = cancel_token.cancelled() => {
                        break;
                    }
                    maybe_resp_rx = resp_rx.recv() => {
                        match maybe_resp_rx {
                            Some(out) => {
                                yield out;
                            },
                            None => {
                                tracing::trace!(request_id, "generate: response channel closed");
                                break;
                            }
                        }
                    }
                }
            }
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment