Unverified commit a745a980, authored by Graham King, committed by GitHub

feat(dynamo-engine-vllm): vllm 0.8.X support (#728)

vllm 0.8 is different enough that I made a new engine, vllm0_8, and renamed the previous engine to vllm0_7.

`dynamo-run out=vllm` now expects 0.8. This matches the container change in #690.

For older vllm versions use `dynamo-run out=vllm0_7`.
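For example (the model path is illustrative, matching the README below):

```
# vllm 0.8.x (the default)
dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/
# same engine, named explicitly
dynamo-run in=http out=vllm0_8 ~/llm_models/Llama-3.2-3B-Instruct/
# previous engine, for vllm 0.7.x installs
dynamo-run in=http out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/
```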
parent 9b05a5b7
@@ -1595,7 +1595,7 @@ dependencies = [
 ]
 
 [[package]]
-name = "dynamo-engine-vllm"
+name = "dynamo-engine-vllm0_7"
 version = "0.1.1"
 dependencies = [
  "anyhow",
@@ -1614,6 +1614,29 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "dynamo-engine-vllm0_8"
+version = "0.1.1"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "dynamo-llm",
+ "dynamo-runtime",
+ "pyo3",
+ "pyo3-async-runtimes",
+ "pythonize",
+ "regex",
+ "serde",
+ "serde-pickle",
+ "serde_json",
+ "thiserror 2.0.12",
+ "tokio",
+ "tokio-stream",
+ "tracing",
+]
+
 [[package]]
 name = "dynamo-llm"
 version = "0.1.1"
@@ -1689,7 +1712,8 @@ dependencies = [
  "dynamo-engine-python",
  "dynamo-engine-sglang",
  "dynamo-engine-trtllm",
- "dynamo-engine-vllm",
+ "dynamo-engine-vllm0_7",
+ "dynamo-engine-vllm0_8",
  "dynamo-llm",
  "dynamo-runtime",
  "futures",
......
@@ -214,14 +214,14 @@ We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work.
 uv venv
 source .venv/bin/activate
 uv pip install pip
-uv pip install vllm==0.7.3 setuptools
+uv pip install vllm==0.8.4 setuptools
 ```
 
 **Note: If you're on Ubuntu 22.04 or earlier, you will need to add `--python=python3.10` to your `uv venv` command**
 
 2. Build:
 ```
-cargo build --features vllm
+cargo build
 cd target/debug
 ```
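Per the note above, on Ubuntu 22.04 or earlier the venv command becomes:

```
uv venv --python=python3.10
```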
@@ -230,7 +230,7 @@ Inside that virtualenv:
 **HF repo:**
 ```
-./dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/
 ```
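`out=vllm` also accepts a single GGUF file, per the `--model-path=<full-path-to-hf-repo-or-model-gguf>` error message in the engine selection code below; the filename here is illustrative:

```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf
```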
......
@@ -30,7 +30,7 @@ description = "Dynamo Run CLI"
 default = ["mistralrs", "vllm", "sglang"]
 mistralrs = ["dep:dynamo-engine-mistralrs"]
 llamacpp = ["dep:dynamo-engine-llamacpp"]
-vllm = ["dep:dynamo-engine-vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
+vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
 sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
 trtllm = ["dep:dynamo-engine-trtllm"]
 python = ["dep:dynamo-engine-python"]
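Since the single `vllm` feature now pulls in both engine crates, and is in `default`, a plain `cargo build` compiles both. To build dynamo-run with only the vllm engines, the standard cargo feature flags should work (an untested sketch):

```
cargo build --no-default-features --features vllm
```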
@@ -46,7 +46,8 @@ dynamo-runtime = { workspace = true }
 dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
 dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
 dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
-dynamo-engine-vllm = { path = "../../lib/engines/vllm", optional = true }
+dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
+dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
 dynamo-engine-trtllm = { path = "../../lib/engines/trtllm", optional = true }
 dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
......
@@ -291,7 +291,7 @@ pub async fn run(
             }
         }
         #[cfg(feature = "vllm")]
-        Output::Vllm => {
+        Output::Vllm0_7 => {
             if flags.base_gpu_id != 0 {
                 anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
             }
@@ -338,7 +338,7 @@ pub async fn run(
                 };
 
                 // vllm multi-node only the leader runs vllm
-                let (engine, vllm_future) = dynamo_engine_vllm::make_leader_engine(
+                let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
                     cancel_token.clone(),
                     &model_path,
                     &sock_prefix,
@@ -359,11 +359,47 @@ pub async fn run(
             } else {
                 // Nodes rank > 0 only run 'ray'
                 let stop_future =
-                    dynamo_engine_vllm::start_follower(cancel_token.clone(), node_conf).await?;
+                    dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
                 extra = Some(Box::pin(stop_future));
                 EngineConfig::None
             }
         }
+        #[cfg(feature = "vllm")]
+        Output::Vllm | Output::Vllm0_8 => {
+            if flags.base_gpu_id != 0 {
+                anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
+            }
+            let Some(model_path) = model_path else {
+                anyhow::bail!(
+                    "out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
+                );
+            };
+            let Some(card) = maybe_card.clone() else {
+                anyhow::bail!(
+                    "Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
+                );
+            };
+            let node_conf = dynamo_llm::engines::MultiNodeConfig {
+                num_nodes: flags.num_nodes,
+                node_rank: flags.node_rank,
+                leader_addr: flags.leader_addr.clone().unwrap_or_default(),
+            };
+            let engine = dynamo_engine_vllm0_8::make_engine(
+                cancel_token.clone(),
+                &model_path,
+                node_conf,
+                flags.tensor_parallel_size,
+                flags.extra_engine_args.clone(),
+            )
+            .await?;
+            EngineConfig::StaticCore {
+                service_name: card.service_name.clone(),
+                engine,
+                card: Box::new(card),
+            }
+        }
         #[cfg(feature = "llamacpp")]
         Output::LlamaCpp => {
             let Some(model_path) = model_path else {
......
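For reference, a hypothetical two-node vllm0_7 launch matching the leader/follower split above. The flag spellings are inferred from the `flags.num_nodes`, `flags.node_rank` and `flags.leader_addr` fields and may not match the real CLI:

```
# Node 0 (the leader) runs vllm itself
dynamo-run in=http out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --node-rank 0
# Nodes with rank > 0 only run 'ray', pointing at the leader
dynamo-run out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --node-rank 1 --leader-addr <leader-ip>
```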
@@ -88,7 +88,7 @@ fn main() -> anyhow::Result<()> {
             node_rank: flags.node_rank,
             leader_addr: flags.leader_addr.unwrap_or_default(),
         };
-        return dynamo_engine_vllm::run_subprocess(
+        return dynamo_engine_vllm0_7::run_subprocess(
             ZMQ_SOCKET_PREFIX,
             &model_path,
             node_config,
......
@@ -111,9 +111,17 @@ pub enum Output {
     LlamaCpp,
 
     #[cfg(feature = "vllm")]
-    /// Run inference using vllm's engine
+    /// Alias for vllm0_8
     Vllm,
 
+    #[cfg(feature = "vllm")]
+    /// Run inference using vllm 0.8.X+
+    Vllm0_8,
+
+    #[cfg(feature = "vllm")]
+    /// Run inference using vllm 0.7.X
+    Vllm0_7,
+
     #[cfg(feature = "trtllm")]
     /// Run inference using trtllm
     TrtLLM,
@@ -148,6 +156,10 @@ impl TryFrom<&str> for Output {
             #[cfg(feature = "vllm")]
             "vllm" => Ok(Output::Vllm),
+            #[cfg(feature = "vllm")]
+            "vllm0_8" => Ok(Output::Vllm0_8),
+            #[cfg(feature = "vllm")]
+            "vllm0_7" => Ok(Output::Vllm0_7),
             #[cfg(feature = "trtllm")]
             "trtllm" => Ok(Output::TrtLLM),
@@ -195,6 +207,10 @@ impl fmt::Display for Output {
             #[cfg(feature = "vllm")]
             Output::Vllm => "vllm",
+            #[cfg(feature = "vllm")]
+            Output::Vllm0_8 => "vllm0_8",
+            #[cfg(feature = "vllm")]
+            Output::Vllm0_7 => "vllm0_7",
             #[cfg(feature = "trtllm")]
             Output::TrtLLM => "trtllm",
@@ -269,6 +285,8 @@ impl Output {
         #[cfg(feature = "vllm")]
         {
             out.push(Output::Vllm.to_string());
+            out.push(Output::Vllm0_7.to_string());
+            out.push(Output::Vllm0_8.to_string());
         }
         #[cfg(feature = "python")]
......
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 [package]
-name = "dynamo-engine-vllm"
+name = "dynamo-engine-vllm0_7"
 version.workspace = true
 edition.workspace = true
 description.workspace = true
......
@@ -56,11 +56,4 @@ ipc_path = f"ipc:///tmp/{socket_id}"
 engine_alive = multiprocessing.Value("b", True, lock=False)
 
 # 0.7.3
 run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive)
-
-# 0.8.1
-# TODO: In 0.8+ first argument is VllmConfig, not AsyncEngineArgs
-# disable_log_stats = False
-# disable_log_requests = True
-# run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, disable_log_stats, disable_log_requests, engine_alive)
-
@@ -42,6 +42,9 @@ use dynamo_llm::{engines::MultiNodeConfig, kv_router::publisher::KvMetricsPublisher};
 /// Wait this long for the vllm sub-process to stop after we send it a KILL
 const VLLM_STOP_TIMEOUT: Duration = Duration::from_millis(1500);
 
+// The minor revision version of vllm that this engine supports. 0.8+ is in a different engine.
+const VLLM_VERSION: &str = "0.7";
+
 type RequestID = String;
 
 pub struct VllmWorker {
@@ -255,6 +258,16 @@ fn python_imports() -> Imports {
         }
     };
 
+    // While we're here check vllm version
+    let version = vllm_module
+        .getattr(py, "__version__")
+        .expect("vllm missing __version__ field")
+        .extract::<String>(py)
+        .expect("vllm.__version__ is not a string");
+    if !version.starts_with(VLLM_VERSION) {
+        panic!("Expected vllm version {VLLM_VERSION}, found {version}");
+    }
+
     let tokens_prompt_type: PyObject = vllm_module.getattr(py, "TokensPrompt").unwrap();
     let sample_params_type: PyObject = vllm_module.getattr(py, "SamplingParams").unwrap();
......
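The check above panics at startup when the installed vllm version doesn't start with "0.7". To see which vllm your virtualenv provides before choosing between `out=vllm0_7` and `out=vllm0_8`:

```
python -c "import vllm; print(vllm.__version__)"
```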
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-vllm0_8"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }

anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }

async-openai = "0.27.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
    "attributes",
    "testing",
    "tokio-runtime",
    "unstable-streams",
] }
pythonize = { version = "0.23" }
regex = "1"
serde-pickle = "1.2.0"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in lib.rs. Most work should be done in the Rust caller.
#
import json
import logging

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt

# TODO this should match DYN_LOG level
logging.basicConfig(level=logging.INFO)


async def main(request_queue, ready_event, extra_engine_args, **kwargs):
    arg_map = kwargs
    if extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename
        try:
            with open(extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence

    engine_args = AsyncEngineArgs(**arg_map)

    # Main loop
    try:
        async with build_async_engine_client_from_engine_args(
            engine_args
        ) as engine_client:
            ready_event.set()
            while True:
                req = await request_queue.get()
                if req is None:  # Stop sentinel
                    break
                (request_id, request, sampling_params, response_queue) = req
                prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
                gen = engine_client.generate(prompt, sampling_params, request_id)
                async for res in gen:
                    await response_queue.put(res)
                await response_queue.put(None)
                request_queue.task_done()
    except Exception as e:
        logging.error(f"vllm init failed: {e}")
    finally:
        logging.debug("vllm worker stopped")


async def run_response(response_queue):
    try:
        while True:
            item = await response_queue.get()
            yield item
            response_queue.task_done()
            if item is None:
                return
    except Exception as e:
        logging.error(f"failed reading response from vllm: {e}")