Unverified Commit 19a77ae7 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore(dynamo-run): Remove out=sglang|vllm|trtllm (#1920)

parent 3c500ae7
...@@ -64,46 +64,6 @@ pub struct Flags { ...@@ -64,46 +64,6 @@ pub struct Flags {
#[arg(long)] #[arg(long)]
pub model_config: Option<PathBuf>, pub model_config: Option<PathBuf>,
/// sglang, vllm
///
/// How many GPUs to use at once, total across all nodes.
/// This must divide by num_nodes, and each node must use the same number of GPUs.
#[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
pub tensor_parallel_size: u32,
/// sglang only
/// vllm uses CUDA_VISIBLE_DEVICES env var
///
/// Use GPUs from this ID upwards.
/// If your machine has four GPUs but the first two (0 and 1) are in use,
/// pass --base-gpu-id 2 to use the third GPU (and up, if tensor_parallel_size > 1)
#[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..256))]
pub base_gpu_id: u32,
/// vllm and sglang only
///
/// How many nodes/hosts to use
#[arg(long, default_value = "1", value_parser = clap::value_parser!(u32).range(1..256))]
pub num_nodes: u32,
/// vllm and sglang only
///
/// This node's unique ID, running from 0 to num_nodes.
#[arg(long, default_value = "0", value_parser = clap::value_parser!(u32).range(0..255))]
pub node_rank: u32,
/// For multi-node / pipeline parallel this is the <host>:<port> of the first node.
///
/// - vllm: The address/port of the Ray head node.
///
/// - sglang: The Torch Distributed init method address, in format <host>:<port>.
/// It becomes "tcp://<host>:<port>" when given to torch.distributed.init_process_group.
/// This expects to use the nccl backend (transparently to us here).
/// All nodes must use the same address here, which is node_rank == 0's address.
///
#[arg(long)]
pub leader_addr: Option<String>,
/// If using `out=dyn` with multiple instances, this says how to route the requests. /// If using `out=dyn` with multiple instances, this says how to route the requests.
/// ///
/// Mostly interesting for KV-aware routing. /// Mostly interesting for KV-aware routing.
...@@ -199,22 +159,6 @@ impl Flags { ...@@ -199,22 +159,6 @@ impl Flags {
} }
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => {} Output::MistralRs => {}
Output::SgLang => {
if !local_model.path().is_dir() {
// TODO GGUF support for sglang: https://github.com/ai-dynamo/dynamo/issues/572
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
}
}
Output::Vllm => {
if self.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
}
Output::Trtllm => {
if self.base_gpu_id != 0 {
anyhow::bail!("TRTLLM does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
}
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => { Output::LlamaCpp => {
if !local_model.path().is_file() { if !local_model.path().is_file() {
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::time::Duration;
use std::{future::Future, pin::Pin};
use anyhow::Context as _; use anyhow::Context as _;
use dynamo_llm::entrypoint::input::Input; use dynamo_llm::entrypoint::input::Input;
use dynamo_llm::entrypoint::EngineConfig; use dynamo_llm::entrypoint::EngineConfig;
...@@ -17,9 +14,6 @@ pub use flags::Flags; ...@@ -17,9 +14,6 @@ pub use flags::Flags;
mod opt; mod opt;
pub use dynamo_llm::request_template::RequestTemplate; pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::Output; pub use opt::Output;
mod subprocess;
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
pub async fn run( pub async fn run(
runtime: Runtime, runtime: Runtime,
...@@ -48,6 +42,7 @@ pub async fn run( ...@@ -48,6 +42,7 @@ pub async fn run(
.request_template(flags.request_template.clone()) .request_template(flags.request_template.clone())
.migration_limit(flags.migration_limit); .migration_limit(flags.migration_limit);
// TODO: old, address this later:
// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint. // If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we let LocalModel invent one. // If not, then the endpoint isn't exposed so we let LocalModel invent one.
let mut rt = Either::Left(runtime.clone()); let mut rt = Either::Left(runtime.clone());
...@@ -71,7 +66,7 @@ pub async fn run( ...@@ -71,7 +66,7 @@ pub async fn run(
flags.validate(&local_model, &out_opt)?; flags.validate(&local_model, &out_opt)?;
// Make an engine from the local_model, flags and output. // Make an engine from the local_model, flags and output.
let (engine_config, extra) = engine_for( let engine_config = engine_for(
runtime.primary_token(), runtime.primary_token(),
out_opt, out_opt,
flags.clone(), flags.clone(),
...@@ -85,17 +80,9 @@ pub async fn run( ...@@ -85,17 +80,9 @@ pub async fn run(
// //
dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?; dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;
// Allow engines to ask main thread to wait on an extra future.
// We use this to stop the vllm and sglang sub-process
if let Some(extra) = extra {
extra.await;
}
Ok(()) Ok(())
} }
type ExtraFuture = Pin<Box<dyn Future<Output = ()> + Send>>;
/// Create the engine matching `out_opt` /// Create the engine matching `out_opt`
/// Note validation happens in Flags::validate. In here assume everything is going to work. /// Note validation happens in Flags::validate. In here assume everything is going to work.
async fn engine_for( async fn engine_for(
...@@ -104,71 +91,27 @@ async fn engine_for( ...@@ -104,71 +91,27 @@ async fn engine_for(
flags: Flags, flags: Flags,
local_model: LocalModel, local_model: LocalModel,
rt: Either<Runtime, DistributedRuntime>, rt: Either<Runtime, DistributedRuntime>,
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> { ) -> anyhow::Result<EngineConfig> {
match out_opt { match out_opt {
Output::Dynamic => Ok((EngineConfig::Dynamic(Box::new(local_model)), None)), Output::Dynamic => Ok(EngineConfig::Dynamic(Box::new(local_model))),
Output::EchoFull => Ok(( Output::EchoFull => Ok(EngineConfig::StaticFull {
EngineConfig::StaticFull { model: Box::new(local_model),
model: Box::new(local_model), engine: dynamo_llm::engines::make_engine_full(),
engine: dynamo_llm::engines::make_engine_full(), }),
}, Output::EchoCore => Ok(EngineConfig::StaticCore {
None, engine: dynamo_llm::engines::make_engine_core(),
)), model: Box::new(local_model),
Output::EchoCore => Ok(( }),
EngineConfig::StaticCore {
engine: dynamo_llm::engines::make_engine_core(),
model: Box::new(local_model),
},
None,
)),
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => Ok(( Output::MistralRs => Ok(EngineConfig::StaticFull {
EngineConfig::StaticFull { engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?, model: Box::new(local_model),
model: Box::new(local_model), }),
},
None,
)),
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => Ok(( Output::LlamaCpp => Ok(EngineConfig::StaticCore {
EngineConfig::StaticCore { engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?,
engine: dynamo_engine_llamacpp::make_engine(cancel_token, &local_model).await?, model: Box::new(local_model),
model: Box::new(local_model), }),
},
None,
)),
// For multi-node config. vllm uses `ray`, see guide
Output::Vllm => shell(subprocess::vllm::PY, cancel_token, local_model, flags, None).await,
// For multi-node config. trtllm uses `mpi`, see guide
Output::Trtllm => {
shell(
subprocess::trtllm::PY,
cancel_token,
local_model,
flags,
None,
)
.await
}
Output::SgLang => {
let multi_node_config = if flags.num_nodes > 1 {
Some(dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
})
} else {
None
};
shell(
subprocess::sglang::PY,
cancel_token,
local_model,
flags,
multi_node_config,
)
.await
}
Output::Mocker => { Output::Mocker => {
let Either::Right(drt) = rt else { let Either::Right(drt) = rt else {
panic!("Mocker requires a distributed runtime to run."); panic!("Mocker requires a distributed runtime to run.");
...@@ -180,76 +123,12 @@ async fn engine_for( ...@@ -180,76 +123,12 @@ async fn engine_for(
let engine = let engine =
dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?; dynamo_llm::mocker::engine::make_mocker_engine(drt, endpoint, args).await?;
Ok(( Ok(EngineConfig::StaticCore {
EngineConfig::StaticCore { engine,
engine, model: Box::new(local_model),
model: Box::new(local_model), })
},
None,
))
}
}
}
async fn shell(
py_script: &'static str,
cancel_token: CancellationToken,
local_model: LocalModel,
flags: Flags,
multi_node_config: Option<dynamo_llm::engines::MultiNodeConfig>,
) -> anyhow::Result<(EngineConfig, Option<ExtraFuture>)> {
let (py_script, child) =
match subprocess::start(py_script, &local_model, flags.clone(), multi_node_config).await {
Ok(x) => x,
Err(err) => {
anyhow::bail!("Failed starting engine sub-process: {err}");
}
};
// Sub-process cleanup
let extra: ExtraFuture = Box::pin(async move {
stopper(cancel_token, child, py_script).await;
});
Ok((EngineConfig::Dynamic(Box::new(local_model)), Some(extra)))
}
/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
/// Keeps the TempPath alive until the child is stopped.
async fn stopper(
cancel_token: CancellationToken,
mut child: tokio::process::Child,
py_script: tempfile::TempPath,
) {
cancel_token.cancelled().await;
// Ask subprocess to stop gracefully
if let Some(pid) = child.id() {
unsafe { libc::kill(pid as i32, libc::SIGTERM) };
}
tokio::select! {
exit = child.wait() => {
tracing::trace!("engine sub-process graceful exit");
match exit {
Ok(exit_status) if exit_status.success() => {}
Ok(exit_status) => {
// This is nearly always 15 (SIGTERM)
tracing::trace!("engine sub-process non-0 exit: {exit_status}");
}
Err(err) => {
tracing::warn!("engine sub-process error getting exit status: {err}");
}
}
}
_ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
// It didn't stop in time, kill it
child.kill().await.expect("Failed killing engine subprocess");
let _ = child.wait().await;
} }
} }
// This temporary file contains the python script running the engine. It deletes on drop.
// Keep it alive until the engine has stopped.
drop(py_script);
} }
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it. /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
......
...@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -90,6 +90,11 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
in_opt = Some(val.try_into()?); in_opt = Some(val.try_into()?);
} }
"out" => { "out" => {
if val == "sglang" || val == "trtllm" || val == "vllm" {
tracing::error!("To run the {val} engine please use the Python interface, see root README or look in directory `components/backends/`.");
std::process::exit(1);
}
out_opt = Some(val.try_into()?); out_opt = Some(val.try_into()?);
} }
_ => { _ => {
......
...@@ -22,16 +22,6 @@ pub enum Output { ...@@ -22,16 +22,6 @@ pub enum Output {
/// Run inference using llama.cpp /// Run inference using llama.cpp
LlamaCpp, LlamaCpp,
/// Run inference using sglang
SgLang,
/// Run inference using trtllm
Trtllm,
// Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm,
Mocker, Mocker,
} }
...@@ -46,11 +36,7 @@ impl TryFrom<&str> for Output { ...@@ -46,11 +36,7 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp), "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
"sglang" => Ok(Output::SgLang),
"trtllm" => Ok(Output::Trtllm),
"vllm" => Ok(Output::Vllm),
"mocker" => Ok(Output::Mocker), "mocker" => Ok(Output::Mocker),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
...@@ -79,11 +65,7 @@ impl fmt::Display for Output { ...@@ -79,11 +65,7 @@ impl fmt::Display for Output {
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => "llamacpp", Output::LlamaCpp => "llamacpp",
Output::SgLang => "sglang",
Output::Trtllm => "trtllm",
Output::Vllm => "vllm",
Output::Mocker => "mocker", Output::Mocker => "mocker",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
...@@ -96,7 +78,11 @@ impl fmt::Display for Output { ...@@ -96,7 +78,11 @@ impl fmt::Display for Output {
impl Output { impl Output {
#[allow(unused_mut)] #[allow(unused_mut)]
pub fn available_engines() -> Vec<String> { pub fn available_engines() -> Vec<String> {
let mut out = vec!["echo_core".to_string(), "echo_full".to_string()]; let mut out = vec![
"echo_core".to_string(),
"echo_full".to_string(),
Output::Mocker.to_string(),
];
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
{ {
out.push(Output::MistralRs.to_string()); out.push(Output::MistralRs.to_string());
...@@ -107,11 +93,6 @@ impl Output { ...@@ -107,11 +93,6 @@ impl Output {
out.push(Output::LlamaCpp.to_string()); out.push(Output::LlamaCpp.to_string());
} }
out.push(Output::SgLang.to_string());
out.push(Output::Trtllm.to_string());
out.push(Output::Vllm.to_string());
out.push(Output::Mocker.to_string());
out out
} }
} }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::borrow::Cow;
use std::io::Write;
use std::process::Stdio;
use std::sync::LazyLock;
use anyhow::Context;
use regex::Regex;
use tokio::io::AsyncBufReadExt;
use crate::flags::RouterMode;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::local_model::LocalModel;
pub mod sglang;
pub mod trtllm;
pub mod vllm;
/// Start the Python engine script as a `python3` sub-process.
///
/// The script source is written to a temporary file which is then handed to `python3`
/// together with the command line built from the model and the user's flags. The child's
/// stdout and stderr are piped and forwarded, line by line, to `tracing`.
///
/// # Arguments
/// * `py_script` - The Python code to run.
/// * `local_model` - Model info (endpoint, path, display name, model card).
/// * `flags` - Command line flags for user overrides.
/// * `multi_node_config` - sglang multi-node config. vllm uses `ray` externally.
///
/// # Returns
/// The path of the temporary script and the running child process. The temp path deletes
/// the file on drop, so the caller must keep it alive for as long as the child needs it.
///
/// # Errors
/// Fails if the temporary file cannot be created or written, or if `python3` cannot be
/// spawned.
pub async fn start(
    py_script: &'static str,
    local_model: &LocalModel,
    flags: super::Flags,
    multi_node_config: Option<MultiNodeConfig>,
) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
    /// Append a `--name value` pair to the argument list.
    fn push_opt(argv: &mut Vec<String>, name: &str, value: String) {
        argv.push(name.to_string());
        argv.push(value);
    }

    let mut script_file = tempfile::NamedTempFile::new()?;
    // Writes on Linux don't block
    script_file.write_all(py_script.as_bytes())?;
    let script_path = script_file.into_temp_path();

    let card = local_model.card();

    // The script itself comes first, followed by the options every engine receives.
    let mut argv: Vec<String> = vec![script_path.to_string_lossy().to_string()];
    push_opt(&mut argv, "--endpoint", local_model.endpoint_id().as_url());
    push_opt(
        &mut argv,
        "--model-path",
        local_model.path().to_string_lossy().to_string(),
    );
    push_opt(
        &mut argv,
        "--model-name",
        local_model.display_name().to_string(),
    );
    push_opt(
        &mut argv,
        "--tensor-parallel-size",
        flags.tensor_parallel_size.to_string(),
    );
    push_opt(
        &mut argv,
        "--kv-block-size",
        card.kv_cache_block_size.to_string(),
    );
    push_opt(
        &mut argv,
        "--context-length",
        card.context_length.to_string(),
    );
    push_opt(
        &mut argv,
        "--migration-limit",
        card.migration_limit.to_string(),
    );

    // TRTLLM only
    // The worker node will only publish events and metrics if the router mode is KV
    if flags.router_mode == RouterMode::KV {
        argv.push("--publish-events-and-metrics".to_string());
    }

    // sglang only
    // vllm uses CUDA_VISIBLE_DEVICES
    if flags.base_gpu_id != 0 {
        push_opt(&mut argv, "--base-gpu-id", flags.base_gpu_id.to_string());
    }

    // sglang only
    if let Some(nodes) = multi_node_config {
        push_opt(&mut argv, "--nnodes", nodes.num_nodes.to_string());
        push_opt(&mut argv, "--node-rank", nodes.node_rank.to_string());
        push_opt(&mut argv, "--dist-init-addr", nodes.leader_addr);
    }

    if let Some(extra_engine_args) = flags.extra_engine_args {
        push_opt(
            &mut argv,
            "--extra-engine-args",
            extra_engine_args.to_string_lossy().to_string(),
        );
    }

    let mut cmd = tokio::process::Command::new("python3");
    cmd.kill_on_drop(false);
    cmd.args(argv);
    cmd.stdout(Stdio::piped());
    cmd.stderr(Stdio::piped());

    let mut child = cmd
        .spawn()
        .with_context(|| format!("Failed running: '{}'", pretty_cmd(&cmd)))?;

    // Safety: We set stdout/stderr a few lines above
    let child_stdout = tokio::io::BufReader::new(child.stdout.take().unwrap());
    let child_stderr = tokio::io::BufReader::new(child.stderr.take().unwrap());

    // Forward the child's stdout to our log at INFO.
    tokio::spawn(async move {
        let mut out_lines = child_stdout.lines();
        while let Ok(Some(text)) = out_lines.next_line().await {
            tracing::info!("{}", strip_log_prefix(&text));
        }
    });

    // Forward the child's stderr to our log at WARN.
    tokio::spawn(async move {
        let mut err_lines = child_stderr.lines();
        while let Ok(Some(text)) = err_lines.next_line().await {
            // FIXME: always logging INFO/DEBUG will hide real errors, but
            // some libraries log non-errors to stderr, which confuses users
            // when we log those as ERROR. Using WARN as a middle ground for
            // now, but we can probably be smarter here.
            tracing::warn!("{}", strip_log_prefix(&text));
        }
    });

    // We must keep temp path alive, it deletes on drop
    Ok((script_path, child))
}
/// Render a command as a single `program arg1 arg2 ...` string for log and error messages.
///
/// The program and each argument are converted lossily to UTF-8 and joined with single
/// spaces. No quoting or escaping is applied.
pub fn pretty_cmd(c: &tokio::process::Command) -> String {
    let std_cmd = c.as_std();
    let program = std_cmd.get_program().to_string_lossy();
    let rendered_args: Vec<std::borrow::Cow<'_, str>> = std_cmd
        .get_args()
        .map(|arg| arg.to_string_lossy())
        .collect();
    format!("{} {}", program, rendered_args.join(" "))
}
// Thanks Gemini
/// Matches a log line that starts with one of two known prefixes and captures the
/// remainder of the line (the message) as group 1:
///
/// - `LEVEL MM-DD HH:MM:SS `, e.g. `INFO 05-06 09:38:50 `
/// - `[YYYY-MM-DD HH:MM:SS] `, e.g. `[2025-05-06 11:58:51] `
///
/// Compiled once, on first use.
static LOG_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"^(?:(?:[A-Z]+ \d{2}-\d{2} \d{2}:\d{2}:\d{2})|(?:\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\])) (.*)$"
    ).unwrap()
    // ^ Start of the line.
    // (?: Non-capturing group for the two prefix alternatives.
    // (?: Non-capturing group for the first prefix type.
    // [A-Z]+ One or more uppercase letters (log level).
    // (single space) A literal space.
    // \d{2}-\d{2} Date like MM-DD.
    // (single space) A literal space.
    // \d{2}:\d{2}:\d{2} Time like HH:MM:SS.
    // ) End of first prefix type group.
    // | OR
    // (?: Non-capturing group for the second prefix type.
    // \[ A literal opening square bracket.
    // \d{4}-\d{2}-\d{2} Date like YYYY-MM-DD.
    // (single space) A literal space.
    // \d{2}:\d{2}:\d{2} Time like HH:MM:SS.
    // \] A literal closing square bracket.
    // ) End of second prefix type group.
    // ) End of the alternatives group.
    // (single space) A literal space. This is the space BEFORE the message.
    // (.*) Capture group 1: The rest of the line (the message).
    // $ End of the line.
});
/// Removes a leading log level / date / time prefix from a log line, if one is present.
///
/// Lines that do not start with a recognised prefix are returned unchanged. The result
/// always borrows from `line`; nothing is allocated.
///
/// # Examples
/// let line = "INFO 05-06 09:38:50 [async_llm.py:252] Added request 1";
/// assert_eq!(strip_log_prefix(line), "[async_llm.py:252] Added request 1");
///
/// let line_no_prefix = "This is a normal line.";
/// assert_eq!(strip_log_prefix(line_no_prefix), "This is a normal line.");
fn strip_log_prefix(line: &str) -> Cow<'_, str> {
    // Group 1 is `(.*)`, the message after the prefix. Fall back to the whole line when
    // the regex does not match (or, which should not happen, the group is missing).
    let message = LOG_PREFIX_RE
        .captures(line)
        .and_then(|caps| caps.get(1))
        .map_or(line, |found| found.as_str());
    Cow::Borrowed(message)
}
#[cfg(test)]
mod tests {
    use super::strip_log_prefix;

    /// Checks both supported prefix formats as well as lines that must pass through untouched.
    #[test]
    fn test_strip_log_prefix() {
        // (input line, expected output)
        let cases = [
            (
                "INFO 05-06 09:38:50 [async_llm.py:252] Added request 1",
                "[async_llm.py:252] Added request 1",
            ),
            ("Just a regular line.", "Just a regular line."),
            (
                "INFO this is not a full prefix",
                "INFO this is not a full prefix",
            ),
            (
                "[2025-05-06 11:58:51] Capture cuda graph bs [1, 2, 4, 8]",
                "Capture cuda graph bs [1, 2, 4, 8]",
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(strip_log_prefix(input), expected);
        }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Source code of the SGLang sub-process.
///
/// This is the text of `sglang_inc.py`, embedded via `include_str!`.
pub const PY: &str = include_str!("sglang_inc.py");
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# `dynamo-run out=sglang` runs this script
# Can also be used standalone: `python3 sglang_inc.py` - lots of optional cmd line params
import argparse
import asyncio
import json
import logging
import sys
from typing import Optional
import sglang
import uvloop
from sglang.srt.entrypoints.engine import EmbeddingReqInput
from sglang.srt.server_args import ServerArgs
from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
# Only used if you run it manually from the command line
# (they are the argparse defaults for --endpoint and --model-path).
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
# Runs at import time, before any of the code below logs anything.
configure_dynamo_logging()
class Config:
    """Command line parameters or defaults

    A plain attribute holder: `cmd_line_args` creates an empty instance and fills in
    every field below from the parsed command line.
    """

    # The three parts of the `dyn://namespace.component.endpoint` string
    namespace: str
    component: str
    endpoint: str
    # Path to disk model or HuggingFace model identifier
    model_path: str
    # Name to serve the model under; None when not given on the command line
    model_name: Optional[str]
    # The base GPU ID to start allocating GPUs from
    base_gpu_id: int
    # Number of GPUs to use
    tensor_parallel_size: int
    # Size of a KV cache block
    kv_block_size: int
    # Max model context length; None (the command line default) means use the model's max
    context_length: Optional[int]
    # Multi-node settings: machine count, this node's rank, and rank 0's `host:port`
    nnodes: int
    node_rank: int
    dist_init_addr: str
    # Maximum number of times a request may be migrated to a different engine worker
    migration_limit: int
    # Path to a JSON file with extra SGLang engine keyword arguments; "" when unset
    extra_engine_args: str
class RequestHandler:
    """
    Request handler for the generate endpoint
    """

    def __init__(self, engine):
        # The SGLang engine used to run generation and embedding requests
        self.engine_client = engine

    async def generate(self, request):
        """
        Stream generated tokens for a single or batch request.

        `request` is a dict. It must contain `sampling_options.temperature` (may be
        None), `stop_conditions.max_tokens`, and either `token_ids` or a non-empty
        `batch_token_ids`.

        Yields dicts with `token_ids` (only the tokens produced since the previous
        chunk), plus `index` for batch requests. The chunk that carries a finish
        reason has an empty `token_ids` and a `finish_reason` string.
        """
        sampling_params = {
            # sglang defaults this to 128
            "max_new_tokens": request["stop_conditions"]["max_tokens"],
        }
        # Add the temperature after building the dict. Previously the dict was
        # re-created after the temperature had been stored, silently dropping it.
        temperature = request["sampling_options"]["temperature"]
        if temperature is not None:
            sampling_params["temperature"] = temperature

        # Check if this is a batch request
        is_batch = "batch_token_ids" in request and request["batch_token_ids"]

        if is_batch:
            # Track tokens separately for each batch item
            num_output_tokens_so_far = {}
            logging.debug("received batch token ids")
            gen = await self.engine_client.async_generate(
                input_ids=request["batch_token_ids"],
                sampling_params=sampling_params,
                stream=True,
            )
        else:
            num_output_tokens_so_far = 0
            logging.debug("received token ids")
            gen = await self.engine_client.async_generate(
                input_ids=request["token_ids"],
                sampling_params=sampling_params,
                stream=True,
            )

        async for res in gen:
            # res is a dict
            logging.debug(f"res: {res}")
            finish_reason = res["meta_info"]["finish_reason"]

            if is_batch:
                # Handle batch response - get index from SGLang response
                index = res.get("index", 0)
                if index not in num_output_tokens_so_far:
                    num_output_tokens_so_far[index] = 0

                if finish_reason:
                    logging.warning(f"finish_reason: {finish_reason}")
                    # Final response for this batch item
                    out = {
                        "token_ids": [],
                        "finish_reason": finish_reason["type"],
                        "index": index,
                    }
                else:
                    # Streaming response for this batch item.
                    # `output_ids` is sliced from the count already sent, so only
                    # the new tokens are forwarded.
                    next_total_toks = len(res["output_ids"])
                    new_tokens = res["output_ids"][num_output_tokens_so_far[index] :]
                    out = {
                        "token_ids": new_tokens,
                        "index": index,
                    }
                    num_output_tokens_so_far[index] = next_total_toks
            else:
                if finish_reason:
                    out = {
                        "token_ids": [],
                        "finish_reason": finish_reason["type"],
                    }
                else:
                    next_total_toks = len(res["output_ids"])
                    new_tokens = res["output_ids"][num_output_tokens_so_far:]
                    out = {
                        "token_ids": new_tokens,
                    }
                    num_output_tokens_so_far = next_total_toks
            yield out

    async def encode(self, request):
        """
        Compute embeddings for the token ids in `request["token_ids"]`.

        Yields exactly one dict with `embeddings` (one entry per engine result),
        `prompt_tokens` and `total_tokens` (both the summed prompt token count).
        """
        obj = EmbeddingReqInput(input_ids=request["token_ids"])
        generator = self.engine_client.tokenizer_manager.generate_request(obj, None)
        # Only the first item produced by the generator is consumed
        engine_results = await anext(generator)

        tokens = 0
        embeddings = []
        for result in engine_results:
            embeddings.append(result["embedding"])
            tokens += result["meta_info"]["prompt_tokens"]

        out = {
            "embeddings": embeddings,
            "prompt_tokens": tokens,
            "total_tokens": tokens,
        }
        yield out
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Parse the command line into a config, then start the engine and serve."""
    config = cmd_line_args()
    await init(runtime, config)
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate and serve

    Builds the SGLang server arguments from `config` (optionally merged with a JSON
    file of extra arguments), creates the engine, registers the model with Dynamo and
    serves either the embedding or the generate handler on the configured endpoint.
    """
    engine_kwargs = {
        "model_path": config.model_path,
        "skip_tokenizer_init": True,
        "tp_size": config.tensor_parallel_size,
        "base_gpu_id": config.base_gpu_id,
    }
    if config.kv_block_size:
        engine_kwargs["page_size"] = config.kv_block_size
    if config.context_length:
        engine_kwargs["context_length"] = config.context_length
    if config.dist_init_addr != "":
        # Multi-node settings are only passed when a leader address was given
        engine_kwargs.update(
            {
                "trust_remote_code": True,
                "nnodes": config.nnodes,
                "dist_init_addr": config.dist_init_addr,
                # In practice this is always 0 because Dynamo only manages the leader
                "node_rank": config.node_rank,
            }
        )

    # extra_engine_args is a filename
    extra_args_path = config.extra_engine_args
    if extra_args_path != "":
        overrides = {}
        try:
            with open(extra_args_path) as fh:
                overrides = json.load(fh)
        except FileNotFoundError:
            logging.error(f"File {extra_args_path} not found.")
        except json.JSONDecodeError as err:
            logging.error(f"Invalid JSON in {extra_args_path}: {err}")
        logging.debug(f"Adding extra engine arguments: {overrides}")
        # Values from the JSON file take precedence over the ones built above
        engine_kwargs = {**engine_kwargs, **overrides}

    # TODO fetch default SamplingParams from generation_config.json
    engine_args = ServerArgs(**engine_kwargs)
    engine_client = sglang.Engine(server_args=engine_args)

    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()
    endpoint = component.endpoint(config.endpoint)

    if engine_args.is_embedding:
        model_type = ModelType.Embedding
    else:
        model_type = ModelType.Backend
    await register_llm(
        model_type,
        endpoint,
        config.model_path,
        config.model_name,
        migration_limit=config.migration_limit,
    )

    # The server shuts down gracefully (i.e., lets open TCP streams finish)
    # after the lease is revoked
    handler = RequestHandler(engine_client)
    serve_fn = handler.encode if engine_args.is_embedding else handler.generate
    await endpoint.serve_endpoint(serve_fn)
def cmd_line_args():
    """
    Parse the command line into a `Config`.

    The endpoint may be given as `dyn://namespace.component.endpoint` or
    `namespace.component.endpoint`. Anything that does not split into exactly three
    dot-separated parts is logged as an error and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="SGLang server integrated with Dynamo LLM."
    )

    # (flag, keyword arguments for add_argument), registered in this order
    argument_specs = [
        (
            "--endpoint",
            dict(
                type=str,
                default=DEFAULT_ENDPOINT,
                help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
            ),
        ),
        (
            "--model-path",
            dict(
                type=str,
                default=DEFAULT_MODEL,
                help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
            ),
        ),
        (
            "--model-name",
            dict(
                type=str,
                default="",
                help="Name to serve the model under. Defaults to deriving it from model path.",
            ),
        ),
        (
            "--base-gpu-id",
            dict(
                type=int,
                default=0,
                help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
            ),
        ),
        (
            "--tensor-parallel-size",
            dict(type=int, default=1, help="Number of GPUs to use."),
        ),
        (
            "--kv-block-size",
            dict(type=int, default=16, help="Size of a KV cache block."),
        ),
        (
            "--context-length",
            dict(
                type=int,
                default=None,
                help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
            ),
        ),
        (
            "--nnodes",
            dict(
                type=int, default=1, help="The number of machines SGLang will use"
            ),
        ),
        (
            "--node-rank",
            dict(
                type=int,
                default=0,
                help="Unique number for each node. 0 for the leader.",
            ),
        ),
        (
            "--dist-init-addr",
            dict(
                type=str,
                default="",
                help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
            ),
        ),
        (
            "--migration-limit",
            dict(
                type=int,
                default=0,
                help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
            ),
        ),
        (
            "--extra-engine-args",
            dict(
                type=str,
                default="",
                help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
            ),
        ),
    ]
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Split the endpoint into namespace / component / endpoint name
    endpoint_parts = args.endpoint.replace("dyn://", "", 1).split(".")
    if len(endpoint_parts) != 3:
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)
    namespace_part, component_part, endpoint_part = endpoint_parts

    config = Config()
    config.model_path = args.model_path
    # An empty name becomes None, which becomes an `Option` on the Rust side
    config.model_name = args.model_name if args.model_name else None
    config.namespace = namespace_part
    config.component = component_part
    config.endpoint = endpoint_part

    # These settings are copied from the parsed arguments unchanged
    for field in (
        "base_gpu_id",
        "tensor_parallel_size",
        "kv_block_size",
        "context_length",
        "nnodes",
        "node_rank",
        "dist_init_addr",
        "migration_limit",
        "extra_engine_args",
    ):
        setattr(config, field, getattr(args, field))

    return config
# Script entry point: only runs when the file is executed directly, not when imported.
if __name__ == "__main__":
    # uvloop is installed before asyncio.run() starts the worker
    uvloop.install()
    asyncio.run(worker())
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Source code of the TRTLLM sub-process.
///
/// This is the text of `trtllm_inc.py`, embedded via `include_str!`.
pub const PY: &str = include_str!("trtllm_inc.py");
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a sample config for TensorRT-LLM engine.
# The config sets a smaller free_gpu_memory_fraction so that the engine does not
# use all of the GPU memory, and both the prefill and decode workers can fit in
# GPU memory when running in disaggregated mode.
# You might have to tweak this config based on your model size and GPU memory.
backend: pytorch
disable_overlap_scheduler: true
kv_cache_config:
free_gpu_memory_fraction: 0.40
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Source code of the VLLM sub-process.
///
/// The contents of `vllm_inc.py` are embedded into the binary at compile time
/// by `include_str!`, which resolves the path relative to this source file.
pub const PY: &str = include_str!("vllm_inc.py");
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# `dynamo-run out=vllm` runs this script
# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
# Setup checklist:
# - We are in a virtualenv with vllm installed - and patched if using kv routing.
# - `libdynamo_llm_capi.so` is in the system lib path or its containing folder is in LD_LIBRARY_PATH
# It builds in target/debug/ by default.
import argparse
import asyncio
import json
import logging
import os
import sys
import uuid
from typing import Optional
import uvloop
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt
from dynamo.llm import (
ForwardPassMetrics,
KvStats,
ModelType,
WorkerMetricsPublisher,
WorkerStats,
register_llm,
)
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sdk.lib.utils import get_capi_library_path
# Only used if you run it manually from the command line
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
configure_dynamo_logging()
class Config:
    """Command line parameters or defaults"""

    # The three dot-separated parts of the Dynamo endpoint string
    # (namespace.component.endpoint).
    namespace: str
    component: str
    endpoint: str
    # Disk path or HuggingFace identifier of the model to load.
    model_path: str
    # Name to serve the model under; None when not given on the command line.
    model_name: Optional[str]
    tensor_parallel_size: int
    kv_block_size: int
    # None when --context-length is not given.
    context_length: Optional[int]
    migration_limit: int
    # Path to a JSON file with extra engine keyword arguments; "" when unused.
    extra_engine_args: str
class RequestHandler:
    """
    Request handler for the generate endpoint.

    Translates an incoming request dict into vLLM sampling parameters, runs the
    engine, and streams back only the token IDs produced since the last chunk.
    """

    def __init__(self, component, engine, default_sampling_params):
        """
        Args:
            component: Dynamo component the metrics endpoint is created on.
            engine: vLLM engine client used to run generation.
            default_sampling_params: Keyword arguments used to build the
                baseline `SamplingParams` of every request.
        """
        self.component = component
        self.engine_client = engine
        self.default_sampling_params = default_sampling_params
        self.metrics_publisher = WorkerMetricsPublisher()
        # Keeps the background endpoint-creation task alive (see setup_kv_metrics).
        self._metrics_task = None

    def setup_kv_metrics(self):
        """
        Attach the metrics publisher to the engine and start its endpoint.

        Does nothing except log when the engine client has no
        `set_metrics_publisher` attribute. Schedules an asyncio task, so it
        has to be called while an event loop is running.
        """
        if not hasattr(self.engine_client, "set_metrics_publisher"):
            logging.debug("VLLM version does not support KV metrics")
            return

        self.engine_client.set_metrics_publisher(self.metrics_publisher)

        # Initially send dummy metrics to kick start,
        # vLLM will not update stat until forward pass is triggered
        worker_stats = WorkerStats(
            request_active_slots=0,
            request_total_slots=1024,
            num_requests_waiting=0,
            data_parallel_rank=None,
        )
        kv_stats = KvStats(
            kv_active_blocks=0,
            kv_total_blocks=1024,
            gpu_cache_usage_perc=0.0,
            gpu_prefix_cache_hit_rate=0.0,
        )
        metrics = ForwardPassMetrics(
            worker_stats=worker_stats, kv_stats=kv_stats, spec_decode_stats=None
        )
        # Publish the metrics as a single object
        self.metrics_publisher.publish(metrics)

        # Hold a reference on the instance: the event loop only keeps weak
        # references to tasks, so an unreferenced task may be garbage collected
        # before it finishes.
        self._metrics_task = asyncio.create_task(
            self.create_metrics_publisher_endpoint()
        )
        self._metrics_task.add_done_callback(
            lambda _: logging.debug("metrics publisher endpoint created")
        )

    async def create_metrics_publisher_endpoint(self):
        """Create the metrics publisher endpoint on this handler's component."""
        logging.debug("Creating metrics publisher endpoint")
        await self.metrics_publisher.create_endpoint(self.component)

    async def generate(self, request):
        """
        Run one generation request and yield incremental output chunks.

        Args:
            request: Dict with "token_ids", "sampling_options" (a dict) and
                "stop_conditions" (a dict holding "max_tokens").

        Yields:
            Dicts with "token_ids" (only the tokens new since the previous
            chunk) and, when set, "finish_reason" / "stop_reason".
        """
        request_id = str(uuid.uuid4().hex)

        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

        sampling_params = SamplingParams(**self.default_sampling_params)
        for key, value in request["sampling_options"].items():
            # Only skip options that were not provided. A truthiness check would
            # also drop explicit falsy values such as temperature=0.0.
            if value is None:
                continue
            if hasattr(sampling_params, key):
                setattr(sampling_params, key, value)

        max_tokens = request["stop_conditions"]["max_tokens"]
        if max_tokens:
            sampling_params.max_tokens = max_tokens

        num_output_tokens_so_far = 0
        gen = self.engine_client.generate(prompt, sampling_params, request_id)
        async for res in gen:
            # res is vllm's RequestOutput

            # This is the expected way for a request to end.
            # The new token ID will be eos, don't forward it.
            if res.finished:
                yield {"finish_reason": "stop", "token_ids": []}
                break

            if not res.outputs:
                yield {"finish_reason": "error", "token_ids": []}
                break

            output = res.outputs[0]
            next_total_toks = len(output.token_ids)
            # output.token_ids is cumulative; forward only the unseen tail.
            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
            if output.finish_reason:
                out["finish_reason"] = output.finish_reason
            if output.stop_reason:
                out["stop_reason"] = output.stop_reason
            yield out
            num_output_tokens_so_far = next_total_toks
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Entry point: parse the command line, then start serving on `runtime`."""
    config = cmd_line_args()
    await init(runtime, config)
def _check_and_set_env_value(key, expected, allow_override=False):
if not allow_override and key in os.environ and os.environ[key] != expected:
raise ValueError(
f"{key} is set and doesn't equal expected {expected}. Please unset variable before launch."
)
os.environ.setdefault(key, expected)
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate and serve

    Builds the vLLM engine arguments from `config` (optionally overlaid with the
    JSON file named by `config.extra_engine_args`), exports the VLLM_*
    environment variables, starts the engine, registers the model and serves
    the generate endpoint until `serve_endpoint` returns.

    Raises:
        AssertionError: if `config.kv_block_size` is not greater than zero.
        ValueError: if one of the checked VLLM_* environment variables is
            already set to a different value.
    """

    arg_map = {
        "model": config.model_path,
        "task": "generate",
        "tensor_parallel_size": config.tensor_parallel_size,
        "skip_tokenizer_init": True,
        "disable_log_requests": True,
        "enable_prefix_caching": True,
        # KV routing relies on logging KV metrics
        "disable_log_stats": False,
    }

    # The check rejects zero as well, so the message says "positive".
    assert config.kv_block_size > 0, "Must use positive integer for KV Block Size"
    arg_map["block_size"] = config.kv_block_size

    if config.context_length:
        # Usually we want it to default to the max (from tokenizer_config.json)
        arg_map["max_model_len"] = config.context_length

    if config.extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename
        try:
            with open(config.extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {config.extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence

    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()

    endpoint = component.endpoint(config.endpoint)

    # Patch won't start KVCacheEventManager unless these four are set
    _check_and_set_env_value("VLLM_WORKER_ID", str(endpoint.lease_id()))
    _check_and_set_env_value(
        "VLLM_KV_CAPI_PATH", get_capi_library_path(), allow_override=True
    )
    _check_and_set_env_value("VLLM_KV_NAMESPACE", config.namespace)
    _check_and_set_env_value("VLLM_KV_COMPONENT", config.component)
    _check_and_set_env_value(
        "VLLM_NO_USAGE_STATS", "1", allow_override=True
    )  # Avoid internal HTTP requests

    engine_args = AsyncEngineArgs(**arg_map)
    model_config = engine_args.create_model_config()
    # Load default sampling params from `generation_config.json`
    default_sampling_params = model_config.get_diff_sampling_param()

    # Use the engine context as a context manager so its exit handler runs when
    # serving stops or fails, instead of entering it by hand and never exiting.
    async with build_async_engine_client_from_engine_args(
        engine_args
    ) as engine_client:
        await register_llm(
            ModelType.Backend,
            endpoint,
            config.model_path,
            config.model_name,
            context_length=arg_map.get(
                "max_model_len", None
            ),  # if None, takes length from tokenizer
            kv_cache_block_size=arg_map["block_size"],
            migration_limit=config.migration_limit,
        )
        handler = RequestHandler(component, engine_client, default_sampling_params)
        handler.setup_kv_metrics()

        # The server shuts down gracefully (i.e., lets open TCP streams finish)
        # after the lease is revoked
        await endpoint.serve_endpoint(handler.generate)
def cmd_line_args():
    """
    Parse the command line into a `Config`.

    The endpoint is accepted as 'dyn://namespace.component.endpoint' or
    'namespace.component.endpoint'. Logs an error and exits with status 1 if it
    does not consist of exactly three non-empty dot-separated parts.

    Returns:
        Config: populated from the parsed arguments and their defaults.
    """
    parser = argparse.ArgumentParser(
        description="vLLM server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
    )
    parser.add_argument(
        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    parser.add_argument(
        "--migration-limit",
        type=int,
        default=0,
        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
    )
    parser.add_argument(
        "--extra-engine-args",
        type=str,
        default="",
        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
    )
    args = parser.parse_args()

    # Strip the scheme only when it really is a prefix; str.replace would also
    # remove a "dyn://" found in the middle of the string.
    scheme = "dyn://"
    endpoint_str = args.endpoint
    if endpoint_str.startswith(scheme):
        endpoint_str = endpoint_str[len(scheme) :]
    endpoint_parts = endpoint_str.split(".")
    # Reject empty segments too ("a..b" splits into three parts, one of them "").
    if len(endpoint_parts) != 3 or not all(endpoint_parts):
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)
    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts

    config = Config()
    config.model_path = args.model_path
    # An empty name becomes None, which is an `Option` on the Rust side
    config.model_name = args.model_name if args.model_name else None
    config.namespace = parsed_namespace
    config.component = parsed_component_name
    config.endpoint = parsed_endpoint_name
    config.tensor_parallel_size = args.tensor_parallel_size
    config.kv_block_size = args.kv_block_size
    config.context_length = args.context_length
    config.migration_limit = args.migration_limit
    config.extra_engine_args = args.extra_engine_args

    return config
if __name__ == "__main__":
    # Switch asyncio over to uvloop before the event loop is created.
    uvloop.install()
    # worker() is called without arguments; presumably the dynamo_worker
    # decorator supplies the runtime - TODO confirm.
    asyncio.run(worker())
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# `dynamo-run out=vllm` runs this script
# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
# Setup checklist:
# - We are in a virtualenv with vllm installed. V1 is compatible with v0.9.0
# Steps:
# git clone https://github.com/vllm-project/vllm.git
# cd vllm && git checkout v0.9.0
# uv pip uninstall ai-dynamo-vllm
# VLLM_USE_PRECOMPILED=1 uv pip install --editable .
import argparse
import asyncio
import json
import logging
import os
import sys
import uuid
from typing import Optional
import uvloop
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVEventsConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import TokensPrompt
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
from dynamo.llm import (
ForwardPassMetrics,
KvStats,
ModelType,
SpecDecodeStats,
WorkerMetricsPublisher,
WorkerStats,
ZmqKvEventPublisher,
ZmqKvEventPublisherConfig,
register_llm,
)
from dynamo.runtime import Component, DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
# Only used if you run it manually from the command line
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
configure_dynamo_logging()
logger = logging.getLogger(__name__)
class Config:
    """Command line parameters or defaults"""

    # The three dot-separated parts of the Dynamo endpoint string
    # (namespace.component.endpoint).
    namespace: str
    component: str
    endpoint: str
    # Disk path or HuggingFace identifier of the model to load.
    model_path: str
    # Name to serve the model under; None when not given on the command line.
    model_name: Optional[str]
    tensor_parallel_size: int
    kv_block_size: int
    # None when --context-length is not given.
    context_length: Optional[int]
    migration_limit: int
    # Path to a JSON file with extra engine keyword arguments; "" when unused.
    extra_engine_args: str
class DynamoStatLoggerPublisher(StatLoggerBase):
    """Adapter that lets vLLM report stats through a WorkerMetricsPublisher.

    Implements the StatLoggerBase interface and forwards every recorded update
    as a ForwardPassMetrics object.
    """

    def __init__(self, component: Component, dp_rank: int) -> None:
        self.inner = WorkerMetricsPublisher()
        # NOTE(review): the result of create_endpoint is not awaited here -
        # confirm that the call takes effect without being awaited.
        self.inner.create_endpoint(component)
        # NOTE(review): stored but not read by record(), which publishes
        # data_parallel_rank=None - confirm that is intended.
        self.dp_rank = dp_rank

    def record(
        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
    ):
        """Convert one scheduler stats update into metrics and publish it.

        `iteration_stats` is accepted to satisfy the interface but is not used.
        """
        # request_total_slots and kv_total_blocks are properties of model + gpu
        # we should only publish them once, not every metric update
        # they should be part of some runtime metadata tied to MDC or put in etcd ?
        prefix_stats = scheduler_stats.prefix_cache_stats
        # Guard against dividing by zero before any prefix cache query was made.
        hit_rate = prefix_stats.hits / prefix_stats.queries if prefix_stats.queries > 0 else 0

        worker_stats = WorkerStats(
            request_active_slots=scheduler_stats.num_running_reqs,
            request_total_slots=0,  # TODO - remove from metrics
            num_requests_waiting=scheduler_stats.num_waiting_reqs,
            data_parallel_rank=None,
        )

        kv_stats = KvStats(
            kv_active_blocks=0,  # TODO - need to calculate this
            kv_total_blocks=0,  # TODO - remove from metrics
            gpu_cache_usage_perc=scheduler_stats.gpu_cache_usage,  # used in current cost function
            gpu_prefix_cache_hit_rate=hit_rate,
        )

        # Re-wrap vLLM's speculative decoding stats in the Dynamo type; a falsy
        # value (no stats) is passed through unchanged.
        raw_spec_stats = scheduler_stats.spec_decoding_stats
        if raw_spec_stats:
            spec_decode_stats = SpecDecodeStats(
                num_spec_tokens=raw_spec_stats.num_spec_tokens,
                num_drafts=raw_spec_stats.num_drafts,
                num_draft_tokens=raw_spec_stats.num_draft_tokens,
                num_accepted_tokens=raw_spec_stats.num_accepted_tokens,
                num_accepted_tokens_per_pos=raw_spec_stats.num_accepted_tokens_per_pos,
            )
        else:
            spec_decode_stats = raw_spec_stats

        self.inner.publish(
            ForwardPassMetrics(
                worker_stats=worker_stats,
                kv_stats=kv_stats,
                spec_decode_stats=spec_decode_stats,
            )
        )

    def log_engine_initialized(self) -> None:
        """No-op."""
        pass
class StatLoggerFactory:
    """Callable that builds one DynamoStatLoggerPublisher per data-parallel rank. Required by vLLM."""

    def __init__(self, component: Component) -> None:
        # Every publisher created by this factory uses the same component.
        self.component = component

    def create_stat_logger(self, dp_rank: int) -> StatLoggerBase:
        """Return a new publisher for `dp_rank`."""
        publisher = DynamoStatLoggerPublisher(self.component, dp_rank)
        return publisher

    def __call__(self, vllm_config: VllmConfig, dp_rank: int) -> StatLoggerBase:
        """Factory call signature; `vllm_config` is accepted but not used."""
        return self.create_stat_logger(dp_rank=dp_rank)
class RequestHandler:
    """
    Request handler for the generate and clear_kv_blocks endpoints.
    """

    def __init__(self, component, engine, default_sampling_params):
        """
        Args:
            component: Dynamo component this handler belongs to.
            engine: vLLM engine client used to run generation.
            default_sampling_params: Keyword arguments used to build the
                baseline `SamplingParams` of every request.
        """
        self.component = component
        self.engine_client = engine
        self.default_sampling_params = default_sampling_params

    async def clear_kv_blocks(self, request=None):
        """
        Reset the engine's prefix cache and yield a single status dict.

        Any exception raised by the engine is reported as
        {"status": "error", "message": ...} rather than propagated.
        """
        try:
            await self.engine_client.reset_prefix_cache()
            yield {"status": "success", "message": "KV cache cleared"}
        except Exception as e:
            yield {"status": "error", "message": str(e)}

    async def generate(self, request):
        """
        Run one generation request and yield incremental output chunks.

        Args:
            request: Dict with "token_ids", "sampling_options" (a dict) and
                "stop_conditions" (a dict holding "max_tokens").

        Yields:
            Dicts with "token_ids" (only the tokens new since the previous
            chunk) and, when set, "finish_reason" / "stop_reason".
        """
        request_id = str(uuid.uuid4().hex)

        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

        sampling_params = SamplingParams(**self.default_sampling_params)
        for key, value in request["sampling_options"].items():
            # Only skip options that were not provided. A truthiness check would
            # also drop explicit falsy values such as temperature=0.0.
            if value is None:
                continue
            if hasattr(sampling_params, key):
                setattr(sampling_params, key, value)

        max_tokens = request["stop_conditions"]["max_tokens"]
        if max_tokens:
            sampling_params.max_tokens = max_tokens

        num_output_tokens_so_far = 0
        gen = self.engine_client.generate(prompt, sampling_params, request_id)
        async for res in gen:
            # res is vllm's RequestOutput

            # This is the expected way for a request to end.
            # The new token ID will be eos, don't forward it.
            if res.finished:
                yield {"finish_reason": "stop", "token_ids": []}
                break

            if not res.outputs:
                yield {"finish_reason": "error", "token_ids": []}
                break

            output = res.outputs[0]
            next_total_toks = len(output.token_ids)
            # output.token_ids is cumulative; forward only the unseen tail.
            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
            if output.finish_reason:
                out["finish_reason"] = output.finish_reason
            if output.stop_reason:
                out["stop_reason"] = output.stop_reason
            yield out
            num_output_tokens_so_far = next_total_toks
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Entry point: parse the command line, then start serving on `runtime`."""
    config = cmd_line_args()
    await init(runtime, config)
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate and serve

    Registers the model, builds the vLLM engine from `config` (optionally
    overlaid with the JSON file named by `config.extra_engine_args`), creates
    the ZMQ KV event publisher and then serves the generate and clear_kv_blocks
    endpoints concurrently.
    """
    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()

    generate_endpoint = component.endpoint(config.endpoint)
    clear_endpoint = component.endpoint("clear_kv_blocks")

    await register_llm(
        ModelType.Backend,
        generate_endpoint,
        config.model_path,
        config.model_name,
        kv_cache_block_size=config.kv_block_size,
        migration_limit=config.migration_limit,
    )

    kv_events_config = KVEventsConfig(enable_kv_cache_events=True, publisher="zmq")
    arg_map = dict(
        model=config.model_path,
        task="generate",
        tensor_parallel_size=config.tensor_parallel_size,
        skip_tokenizer_init=True,
        disable_log_requests=True,
        enable_prefix_caching=True,
        # KV routing relies on logging KV metrics
        disable_log_stats=False,
        kv_events_config=kv_events_config,
    )

    # Usually we want max_model_len to default to the max (from tokenizer_config.json)
    if config.context_length:
        arg_map["max_model_len"] = config.context_length

    if config.kv_block_size > 0:
        arg_map["block_size"] = config.kv_block_size

    if config.extra_engine_args != "":
        # extra_engine_args is a filename; an unreadable file is logged and
        # treated as "no extra arguments"
        json_map = {}
        try:
            with open(config.extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {config.extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence
    logger.info(f"VLLM config: {arg_map}")

    os.environ.update(
        {
            # Avoid internal HTTP requests
            "VLLM_NO_USAGE_STATS": "1",
            # Ensure our publisher makes it to the new process
            "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        }
    )

    engine_args = AsyncEngineArgs(**arg_map)
    # Load default sampling params from `generation_config.json`
    default_sampling_params = (
        engine_args.create_model_config().get_diff_sampling_param()
    )

    # Taken from build_async_engine_client_from_engine_args()
    usage_context = UsageContext.OPENAI_API_SERVER
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)

    # Explicitly pass our custom stat logger for metrics
    stat_logger_factory = StatLoggerFactory(component)
    engine_client = AsyncLLM.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=usage_context,
        stat_loggers=[stat_logger_factory],
        disable_log_requests=engine_args.disable_log_requests,
        disable_log_stats=engine_args.disable_log_stats,
    )

    logger.info("VllmWorker has been initialized")

    # Bound to a local so the publisher stays referenced while we serve.
    kv_event_publisher = ZmqKvEventPublisher(
        component=component,
        config=ZmqKvEventPublisherConfig(
            worker_id=generate_endpoint.lease_id(),
            kv_block_size=engine_args.block_size,
        ),
    )

    handler = RequestHandler(component, engine_client, default_sampling_params)

    serving = (
        generate_endpoint.serve_endpoint(handler.generate),
        clear_endpoint.serve_endpoint(handler.clear_kv_blocks),
    )
    try:
        await asyncio.gather(*serving)
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
def cmd_line_args():
    """
    Parse the command line into a `Config`.

    The endpoint is accepted as 'dyn://namespace.component.endpoint' or
    'namespace.component.endpoint'. Logs an error and exits with status 1 if it
    does not consist of exactly three non-empty dot-separated parts.

    Returns:
        Config: populated from the parsed arguments and their defaults.
    """
    parser = argparse.ArgumentParser(
        description="vLLM server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
    )
    parser.add_argument(
        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    parser.add_argument(
        "--migration-limit",
        type=int,
        default=0,
        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
    )
    parser.add_argument(
        "--extra-engine-args",
        type=str,
        default="",
        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
    )
    args = parser.parse_args()

    # Strip the scheme only when it really is a prefix; str.replace would also
    # remove a "dyn://" found in the middle of the string.
    scheme = "dyn://"
    endpoint_str = args.endpoint
    if endpoint_str.startswith(scheme):
        endpoint_str = endpoint_str[len(scheme) :]
    endpoint_parts = endpoint_str.split(".")
    # Reject empty segments too ("a..b" splits into three parts, one of them "").
    if len(endpoint_parts) != 3 or not all(endpoint_parts):
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)
    parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts

    config = Config()
    config.model_path = args.model_path
    # An empty name becomes None, which is an `Option` on the Rust side
    config.model_name = args.model_name if args.model_name else None
    config.namespace = parsed_namespace
    config.component = parsed_component_name
    config.endpoint = parsed_endpoint_name
    config.tensor_parallel_size = args.tensor_parallel_size
    config.kv_block_size = args.kv_block_size
    config.context_length = args.context_length
    config.migration_limit = args.migration_limit
    config.extra_engine_args = args.extra_engine_args

    return config
if __name__ == "__main__":
    # Switch asyncio over to uvloop before the event loop is created.
    uvloop.install()
    # worker() is called without arguments; presumably the dynamo_worker
    # decorator supplies the runtime - TODO confirm.
    asyncio.run(worker())
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
import argparse import argparse
import asyncio import asyncio
import signal
import sys import sys
from pathlib import Path from pathlib import Path
...@@ -31,9 +30,6 @@ import uvloop ...@@ -31,9 +30,6 @@ import uvloop
from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
subprocess_ref = None # Global process reference for cleanup
subprocess_task = None # Global async task reference for cleanup
def parse_args(): def parse_args():
in_mode = "text" in_mode = "text"
...@@ -90,13 +86,6 @@ def parse_args(): ...@@ -90,13 +86,6 @@ def parse_args():
# http_port: Option<u16> # http_port: Option<u16>
parser.add_argument("--http-port", type=int, help="HTTP port for the engine (u16).") parser.add_argument("--http-port", type=int, help="HTTP port for the engine (u16).")
# TODO: Not yet used here
parser.add_argument(
"--tensor-parallel-size",
type=int,
help="Tensor parallel size for the model (e.g., 4).",
)
# Add the positional model argument. # Add the positional model argument.
# It's made optional (nargs='?') because its requirement depends on 'out_mode', # It's made optional (nargs='?') because its requirement depends on 'out_mode',
# which is handled in post-parsing validation. # which is handled in post-parsing validation.
...@@ -131,39 +120,8 @@ def parse_args(): ...@@ -131,39 +120,8 @@ def parse_args():
return parsed_args return parsed_args
async def cleanup_subprocess_async():
"""Clean up the sglang/vllm/trtllm subprocess if it exists."""
global subprocess_ref
if subprocess_ref and subprocess_ref.returncode is None:
subprocess_ref.terminate()
try:
await asyncio.wait_for(subprocess_ref.wait(), timeout=2)
except asyncio.TimeoutError:
subprocess_ref.kill()
await subprocess_ref.wait()
# Only cleanup once
subprocess_ref = None
def signal_handler():
"""Handle signals in async context by cleaning up subprocess and exiting."""
asyncio.create_task(cleanup_subprocess_async())
sys.exit(0)
async def run(): async def run():
global subprocess_ref
global subprocess_task
# Register signal handlers
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGINT, signal_handler) # Ctrl-C
loop.add_signal_handler(signal.SIGTERM, signal_handler) # kill
# If we find cases where subprocess does not stop we may need this. Seem OK so far.
# atexit.register(cleanup_subprocess)
runtime = DistributedRuntime(loop, False) runtime = DistributedRuntime(loop, False)
args = parse_args() args = parse_args()
...@@ -174,57 +132,6 @@ async def run(): ...@@ -174,57 +132,6 @@ async def run():
} }
out_mode = args["out_mode"] out_mode = args["out_mode"]
# Handle subprocess execution for sglang and vllm
if out_mode in ["sglang", "vllm", "trtllm"]:
# Determine which script to run
script_name = f"{out_mode}_inc.py"
script_path = Path(__file__).parent / script_name
if not script_path.exists():
print(f"Error: Script '{script_path}' not found")
sys.exit(1)
# Build command with all relevant arguments
cmd = [sys.executable, str(script_path)]
# Add arguments if they exist
if args["model_path"]:
cmd.extend(["--model-path", args["model_path"]])
flags = args["flags"]
if flags.model_name:
cmd.extend(["--model-name", flags.model_name])
if flags.context_length:
cmd.extend(["--context-length", str(flags.context_length)])
if flags.kv_cache_block_size:
cmd.extend(["--kv-cache-block-size", str(flags.kv_cache_block_size)])
# Start subprocess in background and stream output
print(f"Starting {out_mode} subprocess: {' '.join(cmd)}")
async def stream_subprocess_output():
global subprocess_ref
subprocess_ref = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
)
try:
if subprocess_ref.stdout is not None:
async for line in subprocess_ref.stdout:
print(f"Engine: {line.decode().rstrip()}")
await subprocess_ref.wait()
except asyncio.CancelledError:
# Task was cancelled, terminate the subprocess
await cleanup_subprocess_async()
raise
task = asyncio.create_task(stream_subprocess_output())
# Store the task reference for potential cleanup
subprocess_task = task
# Set out_mode to "dyn" because we talk to the subprocess over NATS
out_mode = "dyn"
engine_type = engine_type_map.get(out_mode) engine_type = engine_type_map.get(out_mode)
if engine_type is None: if engine_type is None:
print(f"Unsupported output type: {out_mode}") print(f"Unsupported output type: {out_mode}")
...@@ -249,19 +156,7 @@ async def run(): ...@@ -249,19 +156,7 @@ async def run():
e = EntrypointArgs(engine_type, **entrypoint_kwargs) e = EntrypointArgs(engine_type, **entrypoint_kwargs)
engine = await make_engine(runtime, e) engine = await make_engine(runtime, e)
try: await run_input(runtime, args["in_mode"], engine)
await run_input(runtime, args["in_mode"], engine)
finally:
# Clean up subprocess when main execution finishes
await cleanup_subprocess_async()
# Cancel the subprocess task if it exists
if subprocess_task:
subprocess_task.cancel()
try:
await subprocess_task
except asyncio.CancelledError:
pass
if __name__ == "__main__": if __name__ == "__main__":
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# `dynamo-run out=sglang` runs this script
# Can also be used standalone: `python3 sglang_inc.py` - lots of optional cmd line params
import argparse
import asyncio
import json
import logging
import sys
from typing import Optional
import sglang
import uvloop
from sglang.srt.entrypoints.engine import EmbeddingReqInput
from sglang.srt.server_args import ServerArgs
from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
# Only used if you run it manually from the command line
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
configure_dynamo_logging()
class Config:
    """Command line parameters or defaults"""

    # The three dot-separated parts of the Dynamo endpoint string
    # (namespace.component.endpoint) - presumably split the same way as in the
    # sibling worker scripts; the parsing code is not fully visible here.
    namespace: str
    component: str
    endpoint: str
    # Disk path or HuggingFace identifier of the model to load.
    model_path: str
    # Name to serve the model under; None when not given on the command line.
    model_name: Optional[str]
    # First GPU ID to allocate from.
    base_gpu_id: int
    tensor_parallel_size: int
    kv_block_size: int
    # None when --context-length is not given.
    context_length: Optional[int]
    # Number of machines SGLang will use, and this node's rank (0 = leader).
    nnodes: int
    node_rank: int
    # host:port of the rank-0 node; "" when not given.
    dist_init_addr: str
    # Path to a JSON file with extra engine keyword arguments; "" when unused.
    extra_engine_args: str
class RequestHandler:
    """
    Serves generate and encode requests on top of an SGLang engine.
    """

    def __init__(self, engine):
        self.engine_client = engine

    async def generate(self, request):
        """
        Stream newly generated token IDs for a single or a batch request.

        A request carrying a non-empty "batch_token_ids" is treated as a batch;
        each yielded dict then also carries the "index" of the batch item.
        """
        sampling_params = {}
        temperature = request["sampling_options"]["temperature"]
        if temperature is not None:
            sampling_params["temperature"] = temperature
        # sglang defaults this to 128
        sampling_params["max_new_tokens"] = request["stop_conditions"]["max_tokens"]

        # Check if this is a batch request
        is_batch = "batch_token_ids" in request and request["batch_token_ids"]
        input_ids = request["batch_token_ids"] if is_batch else request["token_ids"]

        gen = await self.engine_client.async_generate(
            input_ids=input_ids,
            sampling_params=sampling_params,
            stream=True,
        )

        # Tokens already forwarded, keyed by batch index. A non-batch request
        # uses the single key None.
        tokens_sent = {}

        async for res in gen:
            # res is a dict
            finish_reason = res["meta_info"]["finish_reason"]
            # Batch responses carry their item index in the SGLang response
            index = res.get("index", 0) if is_batch else None

            if finish_reason:
                if is_batch:
                    logging.warning(f"finish_reason: {finish_reason}")
                # Final response: no tokens, only the reason
                out = {"token_ids": [], "finish_reason": finish_reason["type"]}
            else:
                # Streaming response: output_ids is cumulative, send the new tail
                output_ids = res["output_ids"]
                out = {"token_ids": output_ids[tokens_sent.get(index, 0) :]}
                tokens_sent[index] = len(output_ids)

            if is_batch:
                out["index"] = index
            yield out

    async def encode(self, request):
        """
        Compute embeddings for request["token_ids"] and yield a single result
        dict with the embeddings and the prompt token count.
        """
        embedding_request = EmbeddingReqInput(input_ids=request["token_ids"])
        results_stream = self.engine_client.tokenizer_manager.generate_request(
            embedding_request, None
        )
        # Only the first item produced by the stream is consumed.
        engine_results = await anext(results_stream)

        prompt_tokens = 0
        embeddings = []
        for item in engine_results:
            embeddings.append(item["embedding"])
            prompt_tokens += item["meta_info"]["prompt_tokens"]

        yield {
            "embeddings": embeddings,
            "prompt_tokens": prompt_tokens,
            "total_tokens": prompt_tokens,
        }
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Entry point: parse the command line, then start serving on `runtime`."""
    config = cmd_line_args()
    await init(runtime, config)
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate and serve

    Builds the SGLang engine from `config` (optionally overlaid with the JSON
    file named by `config.extra_engine_args`), registers the model, and serves
    either the encode or the generate handler depending on whether the engine
    arguments describe an embedding model.
    """
    arg_map = dict(
        model_path=config.model_path,
        skip_tokenizer_init=True,
        tp_size=config.tensor_parallel_size,
        base_gpu_id=config.base_gpu_id,
    )

    if config.kv_block_size:
        arg_map["page_size"] = config.kv_block_size

    if config.context_length:
        arg_map["context_length"] = config.context_length

    # Multi-node settings are only passed on when a rank-0 address was given
    if config.dist_init_addr != "":
        arg_map["trust_remote_code"] = True
        arg_map["nnodes"] = config.nnodes
        arg_map["dist_init_addr"] = config.dist_init_addr
        # In practice this is always 0 because Dynamo only manages the leader
        arg_map["node_rank"] = config.node_rank

    if config.extra_engine_args != "":
        # extra_engine_args is a filename; an unreadable file is logged and
        # treated as "no extra arguments"
        json_map = {}
        try:
            with open(config.extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {config.extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence

    # TODO fetch default SamplingParams from generation_config.json

    engine_args = ServerArgs(**arg_map)
    engine_client = sglang.Engine(server_args=engine_args)

    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()

    endpoint = component.endpoint(config.endpoint)

    if engine_args.is_embedding:
        model_type = ModelType.Embedding
    else:
        model_type = ModelType.Backend
    await register_llm(model_type, endpoint, config.model_path, config.model_name)

    # The server shuts down gracefully (i.e., lets open TCP streams finish)
    # after the lease is revoked
    handler = RequestHandler(engine_client)
    serve_fn = handler.encode if engine_args.is_embedding else handler.generate
    await endpoint.serve_endpoint(serve_fn)
def cmd_line_args():
    """
    Parse the process command line into a `Config`.

    The `--endpoint` value may be given as `dyn://namespace.component.endpoint`
    or `namespace.component.endpoint`; it is split into the three `Config`
    fields `namespace`, `component` and `endpoint`.

    Returns:
        Config: populated with every command line value (or its default).
        `model_name` is None when `--model-name` is not given.

    Exits the process with status 1 (after logging an error) when the endpoint
    does not consist of exactly three non-empty, dot-separated names.
    """
    parser = argparse.ArgumentParser(
        description="SGLang server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--base-gpu-id",
        type=int,
        default=0,
        help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
    )
    parser.add_argument(
        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    parser.add_argument(
        "--nnodes", type=int, default=1, help="The number of machines SGLang will use"
    )
    parser.add_argument(
        "--node-rank",
        type=int,
        default=0,
        help="Unique number for each node. 0 for the leader.",
    )
    parser.add_argument(
        "--dist-init-addr",
        type=str,
        default="",
        help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
    )
    parser.add_argument(
        "--extra-engine-args",
        type=str,
        default="",
        help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
    )
    args = parser.parse_args()

    config = Config()
    config.model_path = args.model_path
    # An unset name becomes None, which is an `Option` on the Rust side.
    config.model_name = args.model_name if args.model_name else None

    # Strip the scheme only when it is a prefix; `str.replace` would also
    # remove a "dyn://" that appears in the middle of the string.
    scheme = "dyn://"
    endpoint_str = args.endpoint
    if endpoint_str.startswith(scheme):
        endpoint_str = endpoint_str[len(scheme):]
    endpoint_parts = endpoint_str.split(".")
    # All three names are required, so empty pieces ("ns..ep") are rejected too.
    if len(endpoint_parts) != 3 or not all(endpoint_parts):
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)
    config.namespace, config.component, config.endpoint = endpoint_parts

    config.base_gpu_id = args.base_gpu_id
    config.tensor_parallel_size = args.tensor_parallel_size
    config.kv_block_size = args.kv_block_size
    config.context_length = args.context_length
    config.nnodes = args.nnodes
    config.node_rank = args.node_rank
    config.dist_init_addr = args.dist_init_addr
    config.extra_engine_args = args.extra_engine_args
    return config
# Script entry point: install uvloop, then run the worker coroutine to completion.
if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# `dynamo-run out=vllm` runs this script
# Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
# Setup checklist:
# - We are in a virtualenv with vllm installed - and patched if using kv routing.
# - `libdynamo_llm_capi.so` is in system lib path or its containing folder is in LD_LIBRARY_PATH
# It builds in target/debug/ by default.
import argparse
import asyncio
import json
import logging
import os
import sys
import uuid
from typing import Optional
import uvloop
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt
from dynamo.llm import (
ForwardPassMetrics,
KvStats,
ModelType,
WorkerMetricsPublisher,
WorkerStats,
register_llm,
)
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
# Only used if you run it manually from the command line.
# These are the fallbacks for the --endpoint and --model-path flags.
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
# Called once at import time, before any code below emits log records.
configure_dynamo_logging()
class Config:
    """Command line parameters or defaults"""

    # The three dot-separated pieces of the --endpoint value
    namespace: str
    component: str
    endpoint: str
    # Path to disk model or HuggingFace model identifier to load
    model_path: str
    # Name to serve the model under; None when --model-name is not given
    model_name: Optional[str]
    # Number of GPUs to use
    tensor_parallel_size: int
    # Size of a KV cache block
    kv_block_size: int
    # Max model context length; None (the command line default) leaves it unset
    context_length: Optional[int]
    # Path to a JSON file with extra engine keyword arguments, "" for none
    extra_engine_args: str
class RequestHandler:
    """
    Request handler for the generate endpoint.

    Holds the vLLM engine client, the default sampling parameters and a
    `WorkerMetricsPublisher`. `generate` turns a pre-tokenized request into an
    engine call and streams back the newly produced token IDs.
    """

    def __init__(self, component, engine, default_sampling_params):
        """
        Args:
            component: Dynamo component the metrics endpoint is created on.
            engine: vLLM engine client used to run generation.
            default_sampling_params: keyword arguments for `SamplingParams`
                applied to every request before per-request overrides.
        """
        self.component = component
        self.engine_client = engine
        self.default_sampling_params = default_sampling_params
        self.metrics_publisher = WorkerMetricsPublisher()
        # Strong reference to the background task started by setup_kv_metrics.
        self._metrics_endpoint_task = None

    def setup_kv_metrics(self):
        """
        Attach the metrics publisher to the engine and start its endpoint.

        Does nothing (beyond a debug log) when the engine client has no
        `set_metrics_publisher` method. Must be called while an event loop is
        running because it schedules a background task.
        """
        if not hasattr(self.engine_client, "set_metrics_publisher"):
            logging.debug("VLLM version does not support KV metrics")
            return

        self.engine_client.set_metrics_publisher(self.metrics_publisher)

        # Initially send dummy metrics to kick start,
        # vLLM will not update stat until forward pass is triggered
        worker_stats = WorkerStats(
            request_active_slots=0,
            request_total_slots=1024,
            num_requests_waiting=0,
            data_parallel_rank=None,
        )
        kv_stats = KvStats(
            kv_active_blocks=0,
            kv_total_blocks=1024,
            gpu_cache_usage_perc=0.0,
            gpu_prefix_cache_hit_rate=0.0,
        )
        metrics = ForwardPassMetrics(
            worker_stats=worker_stats, kv_stats=kv_stats, spec_decode_stats=None
        )
        # Publish the metrics as a single object
        self.metrics_publisher.publish(metrics)

        task = asyncio.create_task(self.create_metrics_publisher_endpoint())
        task.add_done_callback(self._on_metrics_endpoint_done)
        # The event loop only keeps weak references to tasks; hold our own so
        # the task cannot be garbage collected while it is still running.
        self._metrics_endpoint_task = task

    @staticmethod
    def _on_metrics_endpoint_done(task):
        """Log how the metrics endpoint task ended instead of always claiming success."""
        if task.cancelled():
            logging.debug("metrics publisher endpoint task cancelled")
            return
        error = task.exception()
        if error is not None:
            logging.error(f"metrics publisher endpoint failed: {error}")
        else:
            logging.debug("metrics publisher endpoint created")

    async def create_metrics_publisher_endpoint(self):
        """Create the metrics publisher's endpoint on this handler's component."""
        logging.debug("Creating metrics publisher endpoint")
        await self.metrics_publisher.create_endpoint(self.component)

    async def generate(self, request):
        """
        Run one generation request and stream the result.

        Args:
            request: mapping with `token_ids` (the prompt), `sampling_options`
                (per-request overrides for `SamplingParams` attributes) and
                `stop_conditions` (its `max_tokens` entry is read).

        Yields:
            dict: `token_ids` holds only the tokens produced since the previous
            chunk; `finish_reason` / `stop_reason` are included when set.
        """
        request_id = uuid.uuid4().hex

        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

        sampling_params = SamplingParams(**self.default_sampling_params)
        for key, value in request["sampling_options"].items():
            # Only None means "not provided". Falsy values such as
            # temperature=0.0 or seed=0 are legitimate and must be applied.
            if value is None:
                continue
            if hasattr(sampling_params, key):
                setattr(sampling_params, key, value)

        max_tokens = request["stop_conditions"]["max_tokens"]
        if max_tokens:
            sampling_params.max_tokens = max_tokens

        num_output_tokens_so_far = 0
        gen = self.engine_client.generate(prompt, sampling_params, request_id)
        async for res in gen:
            # res is vllm's RequestOutput

            # This is the expected way for a request to end.
            # The new token ID will be eos, don't forward it.
            if res.finished:
                yield {"finish_reason": "stop", "token_ids": []}
                break

            if not res.outputs:
                yield {"finish_reason": "error", "token_ids": []}
                break

            output = res.outputs[0]
            next_total_toks = len(output.token_ids)
            # output.token_ids is cumulative; forward only the new suffix.
            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
            if output.finish_reason:
                out["finish_reason"] = output.finish_reason
            if output.stop_reason:
                out["stop_reason"] = output.stop_reason
            yield out
            num_output_tokens_so_far = next_total_toks
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    """Worker coroutine wrapped by `dynamo_worker`: parse the command line, then hand off to `init`."""
    config = cmd_line_args()
    await init(runtime, config)
def _check_and_set_env_value(key, expected, allow_override=False):
if not allow_override and key in os.environ and os.environ[key] != expected:
raise ValueError(
f"{key} is set and doesn't equal expected {expected}. Please unset variable before launch."
)
os.environ.setdefault(key, expected)
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate and serve

    Builds the vLLM engine arguments from `config` (entries from the optional
    extra-engine-args JSON file take precedence), sets the `VLLM_*`
    environment variables, starts the engine client, registers the model on
    the endpoint and then serves `RequestHandler.generate`.

    Raises:
        AssertionError: `config.kv_block_size` is not greater than zero.
        ValueError: from `_check_and_set_env_value`, when a `VLLM_*` variable
            is already set to a conflicting value.
    """
    arg_map = {
        "model": config.model_path,
        "task": "generate",
        "tensor_parallel_size": config.tensor_parallel_size,
        "skip_tokenizer_init": True,
        "disable_log_requests": True,
        "enable_prefix_caching": True,
        # KV routing relies on logging KV metrics
        "disable_log_stats": False,
    }

    # The check rejects zero as well, so the requirement is "positive".
    assert config.kv_block_size > 0, "Must use positive integer for KV Block Size"
    arg_map["block_size"] = config.kv_block_size

    if config.context_length:
        # Usually we want it to default to the max (from tokenizer_config.json)
        arg_map["max_model_len"] = config.context_length

    if config.extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename; a missing or malformed file is
        # logged and treated as "no extra arguments".
        try:
            with open(config.extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {config.extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence

    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()
    endpoint = component.endpoint(config.endpoint)

    # Patch won't start KVCacheEventManager unless these four are set
    _check_and_set_env_value("VLLM_WORKER_ID", str(endpoint.lease_id()))
    _check_and_set_env_value(
        "VLLM_KV_CAPI_PATH", "libdynamo_llm_capi.so", allow_override=True
    )
    _check_and_set_env_value("VLLM_KV_NAMESPACE", config.namespace)
    _check_and_set_env_value("VLLM_KV_COMPONENT", config.component)
    _check_and_set_env_value(
        "VLLM_NO_USAGE_STATS", "1", allow_override=True
    )  # Avoid internal HTTP requests

    engine_args = AsyncEngineArgs(**arg_map)
    model_config = engine_args.create_model_config()
    # Load default sampling params from `generation_config.json`
    default_sampling_params = model_config.get_diff_sampling_param()

    # Use the context manager with `async with` rather than a bare
    # `__aenter__()`, so the engine client is always exited when serving ends
    # or when registration fails.
    async with build_async_engine_client_from_engine_args(
        engine_args
    ) as engine_client:
        await register_llm(
            ModelType.Backend,
            endpoint,
            config.model_path,
            config.model_name,
            # if None, takes length from tokenizer
            context_length=arg_map.get("max_model_len", None),
            kv_cache_block_size=arg_map["block_size"],
        )

        handler = RequestHandler(component, engine_client, default_sampling_params)
        handler.setup_kv_metrics()

        # The server shuts down gracefully (open TCP streams are allowed to
        # finish) after the lease is revoked.
        await endpoint.serve_endpoint(handler.generate)
def cmd_line_args():
    """
    Parse the process command line into a `Config`.

    The `--endpoint` value may be given as `dyn://namespace.component.endpoint`
    or `namespace.component.endpoint`; it is split into the three `Config`
    fields `namespace`, `component` and `endpoint`.

    Returns:
        Config: populated with every command line value (or its default).
        `model_name` is None when `--model-name` is not given.

    Exits the process with status 1 (after logging an error) when the endpoint
    does not consist of exactly three non-empty, dot-separated names.
    """
    parser = argparse.ArgumentParser(
        description="vLLM server integrated with Dynamo LLM."
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="",
        help="Name to serve the model under. Defaults to deriving it from model path.",
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
    )
    parser.add_argument(
        "--kv-block-size", type=int, default=16, help="Size of a KV cache block."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=None,
        help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
    )
    parser.add_argument(
        "--extra-engine-args",
        type=str,
        default="",
        help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
    )
    args = parser.parse_args()

    config = Config()
    config.model_path = args.model_path
    # An unset name becomes None, which is an `Option` on the Rust side.
    config.model_name = args.model_name if args.model_name else None

    # Strip the scheme only when it is a prefix; `str.replace` would also
    # remove a "dyn://" that appears in the middle of the string.
    scheme = "dyn://"
    endpoint_str = args.endpoint
    if endpoint_str.startswith(scheme):
        endpoint_str = endpoint_str[len(scheme):]
    endpoint_parts = endpoint_str.split(".")
    # All three names are required, so empty pieces ("ns..ep") are rejected too.
    if len(endpoint_parts) != 3 or not all(endpoint_parts):
        logging.error(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)
    config.namespace, config.component, config.endpoint = endpoint_parts

    config.tensor_parallel_size = args.tensor_parallel_size
    config.kv_block_size = args.kv_block_size
    config.context_length = args.context_length
    config.extra_engine_args = args.extra_engine_args
    return config
# Script entry point: install uvloop, then run the worker coroutine to completion.
if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
...@@ -80,10 +80,7 @@ pub async fn run( ...@@ -80,10 +80,7 @@ pub async fn run(
(Box::pin(fut), Some(model.card().clone())) (Box::pin(fut), Some(model.card().clone()))
} }
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
// We can only get here for in=dyn out=vllm|sglang`, because vllm and sglang are a unreachable!("An endpoint input will never have a Dynamic engine");
// subprocess that we talk to like a remote endpoint.
// That means the vllm/sglang subprocess is doing all the work, we are idle.
(never_ready(), None)
} }
}; };
...@@ -107,7 +104,3 @@ pub async fn run( ...@@ -107,7 +104,3 @@ pub async fn run(
Ok(()) Ok(())
} }
fn never_ready() -> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send + 'static>> {
Box::pin(std::future::pending::<anyhow::Result<()>>())
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment