Unverified commit a745a980, authored by Graham King, committed by GitHub

feat(dynamo-engine-vllm): vllm 0.8.X support (#728)

vllm 0.8 is different enough that I made a new engine, vllm0_8, and renamed the previous engine to vllm0_7.

`dynamo-run out=vllm` now expects 0.8. This matches the container change in #690.

For older vllm versions use `dynamo-run out=vllm0_7`.
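For example (the model path is illustrative, matching the README below):

```
# vllm 0.8.x (the default)
dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/
# same engine, named explicitly
dynamo-run in=http out=vllm0_8 ~/llm_models/Llama-3.2-3B-Instruct/
# previous engine, for vllm 0.7.x installs
dynamo-run in=http out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/
```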
parent 9b05a5b7
@@ -1595,7 +1595,7 @@ dependencies = [
 ]
 
 [[package]]
-name = "dynamo-engine-vllm"
+name = "dynamo-engine-vllm0_7"
 version = "0.1.1"
 dependencies = [
  "anyhow",
@@ -1614,6 +1614,29 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "dynamo-engine-vllm0_8"
+version = "0.1.1"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "dynamo-llm",
+ "dynamo-runtime",
+ "pyo3",
+ "pyo3-async-runtimes",
+ "pythonize",
+ "regex",
+ "serde",
+ "serde-pickle",
+ "serde_json",
+ "thiserror 2.0.12",
+ "tokio",
+ "tokio-stream",
+ "tracing",
+]
+
 [[package]]
 name = "dynamo-llm"
 version = "0.1.1"
@@ -1689,7 +1712,8 @@ dependencies = [
  "dynamo-engine-python",
  "dynamo-engine-sglang",
  "dynamo-engine-trtllm",
- "dynamo-engine-vllm",
+ "dynamo-engine-vllm0_7",
+ "dynamo-engine-vllm0_8",
  "dynamo-llm",
  "dynamo-runtime",
  "futures",
......
@@ -214,14 +214,14 @@ We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work.
 uv venv
 source .venv/bin/activate
 uv pip install pip
-uv pip install vllm==0.7.3 setuptools
+uv pip install vllm==0.8.4 setuptools
 ```
 
 **Note: If you're on Ubuntu 22.04 or earlier, you will need to add `--python=python3.10` to your `uv venv` command**
 
 2. Build:
 ```
-cargo build --features vllm
+cargo build
 cd target/debug
 ```
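Per the note above, on Ubuntu 22.04 or earlier the venv command becomes:

```
uv venv --python=python3.10
```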
@@ -230,7 +230,7 @@ Inside that virtualenv:
 **HF repo:**
 ```
-./dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/
 ```
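`out=vllm` also accepts a single GGUF file, per the `--model-path=<full-path-to-hf-repo-or-model-gguf>` error message in the engine selection code below; the filename here is illustrative:

```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf
```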
......
@@ -30,7 +30,7 @@ description = "Dynamo Run CLI"
 default = ["mistralrs", "vllm", "sglang"]
 mistralrs = ["dep:dynamo-engine-mistralrs"]
 llamacpp = ["dep:dynamo-engine-llamacpp"]
-vllm = ["dep:dynamo-engine-vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
+vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
 sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
 trtllm = ["dep:dynamo-engine-trtllm"]
 python = ["dep:dynamo-engine-python"]
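Since the single `vllm` feature now pulls in both engine crates, and is in `default`, a plain `cargo build` compiles both. To build dynamo-run with only the vllm engines, the standard cargo feature flags should work (an untested sketch):

```
cargo build --no-default-features --features vllm
```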
@@ -46,7 +46,8 @@ dynamo-runtime = { workspace = true }
 dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
 dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
 dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
-dynamo-engine-vllm = { path = "../../lib/engines/vllm", optional = true }
+dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
+dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
 dynamo-engine-trtllm = { path = "../../lib/engines/trtllm", optional = true }
 dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
......
@@ -291,7 +291,7 @@ pub async fn run(
             }
         }
         #[cfg(feature = "vllm")]
-        Output::Vllm => {
+        Output::Vllm0_7 => {
             if flags.base_gpu_id != 0 {
                 anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
             }
@@ -338,7 +338,7 @@ pub async fn run(
                 };
 
                 // vllm multi-node only the leader runs vllm
-                let (engine, vllm_future) = dynamo_engine_vllm::make_leader_engine(
+                let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
                     cancel_token.clone(),
                     &model_path,
                     &sock_prefix,
@@ -359,11 +359,47 @@ pub async fn run(
             } else {
                 // Nodes rank > 0 only run 'ray'
                 let stop_future =
-                    dynamo_engine_vllm::start_follower(cancel_token.clone(), node_conf).await?;
+                    dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
                 extra = Some(Box::pin(stop_future));
                 EngineConfig::None
             }
         }
+        #[cfg(feature = "vllm")]
+        Output::Vllm | Output::Vllm0_8 => {
+            if flags.base_gpu_id != 0 {
+                anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
+            }
+            let Some(model_path) = model_path else {
+                anyhow::bail!(
+                    "out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
+                );
+            };
+            let Some(card) = maybe_card.clone() else {
+                anyhow::bail!(
+                    "Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
+                );
+            };
+            let node_conf = dynamo_llm::engines::MultiNodeConfig {
+                num_nodes: flags.num_nodes,
+                node_rank: flags.node_rank,
+                leader_addr: flags.leader_addr.clone().unwrap_or_default(),
+            };
+            let engine = dynamo_engine_vllm0_8::make_engine(
+                cancel_token.clone(),
+                &model_path,
+                node_conf,
+                flags.tensor_parallel_size,
+                flags.extra_engine_args.clone(),
+            )
+            .await?;
+            EngineConfig::StaticCore {
+                service_name: card.service_name.clone(),
+                engine,
+                card: Box::new(card),
+            }
+        }
         #[cfg(feature = "llamacpp")]
         Output::LlamaCpp => {
             let Some(model_path) = model_path else {
......
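For reference, a hypothetical two-node vllm0_7 launch matching the leader/follower split above. The flag spellings are inferred from the `flags.num_nodes`, `flags.node_rank` and `flags.leader_addr` fields and may not match the real CLI:

```
# Node 0 (the leader) runs vllm itself
dynamo-run in=http out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --node-rank 0
# Nodes with rank > 0 only run 'ray', pointing at the leader
dynamo-run out=vllm0_7 ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --node-rank 1 --leader-addr <leader-ip>
```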
@@ -88,7 +88,7 @@ fn main() -> anyhow::Result<()> {
             node_rank: flags.node_rank,
             leader_addr: flags.leader_addr.unwrap_or_default(),
         };
-        return dynamo_engine_vllm::run_subprocess(
+        return dynamo_engine_vllm0_7::run_subprocess(
             ZMQ_SOCKET_PREFIX,
             &model_path,
             node_config,
......
@@ -111,9 +111,17 @@ pub enum Output {
     LlamaCpp,
 
     #[cfg(feature = "vllm")]
-    /// Run inference using vllm's engine
+    /// Alias for vllm0_8
     Vllm,
 
+    #[cfg(feature = "vllm")]
+    /// Run inference using vllm 0.8.X+
+    Vllm0_8,
+
+    #[cfg(feature = "vllm")]
+    /// Run inference using vllm 0.7.X
+    Vllm0_7,
+
     #[cfg(feature = "trtllm")]
     /// Run inference using trtllm
     TrtLLM,
@@ -148,6 +156,10 @@ impl TryFrom<&str> for Output {
             #[cfg(feature = "vllm")]
             "vllm" => Ok(Output::Vllm),
+            #[cfg(feature = "vllm")]
+            "vllm0_8" => Ok(Output::Vllm0_8),
+            #[cfg(feature = "vllm")]
+            "vllm0_7" => Ok(Output::Vllm0_7),
             #[cfg(feature = "trtllm")]
             "trtllm" => Ok(Output::TrtLLM),
@@ -195,6 +207,10 @@ impl fmt::Display for Output {
             #[cfg(feature = "vllm")]
             Output::Vllm => "vllm",
+            #[cfg(feature = "vllm")]
+            Output::Vllm0_8 => "vllm0_8",
+            #[cfg(feature = "vllm")]
+            Output::Vllm0_7 => "vllm0_7",
             #[cfg(feature = "trtllm")]
             Output::TrtLLM => "trtllm",
@@ -269,6 +285,8 @@ impl Output {
         #[cfg(feature = "vllm")]
         {
             out.push(Output::Vllm.to_string());
+            out.push(Output::Vllm0_7.to_string());
+            out.push(Output::Vllm0_8.to_string());
         }
         #[cfg(feature = "python")]
......
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 [package]
-name = "dynamo-engine-vllm"
+name = "dynamo-engine-vllm0_7"
 version.workspace = true
 edition.workspace = true
 description.workspace = true
......
@@ -56,11 +56,4 @@ ipc_path = f"ipc:///tmp/{socket_id}"
 engine_alive = multiprocessing.Value("b", True, lock=False)
 
 # 0.7.3
 run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive)
-
-# 0.8.1
-# TODO: In 0.8+ first argument is VllmConfig, not AsyncEngineArgs
-# disable_log_stats = False
-# disable_log_requests = True
-# run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, disable_log_stats, disable_log_requests, engine_alive)
-
@@ -42,6 +42,9 @@ use dynamo_llm::{engines::MultiNodeConfig, kv_router::publisher::KvMetricsPublisher};
 /// Wait this long for the vllm sub-process to stop after we send it a KILL
 const VLLM_STOP_TIMEOUT: Duration = Duration::from_millis(1500);
 
+// The minor revision version of vllm that this engine supports. 0.8+ is in a different engine.
+const VLLM_VERSION: &str = "0.7";
+
 type RequestID = String;
 
 pub struct VllmWorker {
@@ -255,6 +258,16 @@ fn python_imports() -> Imports {
         }
     };
 
+    // While we're here check vllm version
+    let version = vllm_module
+        .getattr(py, "__version__")
+        .expect("vllm missing __version__ field")
+        .extract::<String>(py)
+        .expect("vllm.__version__ is not a string");
+    if !version.starts_with(VLLM_VERSION) {
+        panic!("Expected vllm version {VLLM_VERSION}, found {version}");
+    }
+
     let tokens_prompt_type: PyObject = vllm_module.getattr(py, "TokensPrompt").unwrap();
     let sample_params_type: PyObject = vllm_module.getattr(py, "SamplingParams").unwrap();
......
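The check above panics at startup when the installed vllm version doesn't start with "0.7". To see which vllm your virtualenv provides before choosing between `out=vllm0_7` and `out=vllm0_8`:

```
python -c "import vllm; print(vllm.__version__)"
```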
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-vllm0_8"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }

anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }

async-openai = "0.27.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
    "attributes",
    "testing",
    "tokio-runtime",
    "unstable-streams",
] }
pythonize = { version = "0.23" }
regex = "1"
serde-pickle = "1.2.0"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This file is included as a string in lib.rs. Most work should be done in the Rust caller.
#
import json
import logging

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt

# TODO this should match DYN_LOG level
logging.basicConfig(level=logging.INFO)


async def main(request_queue, ready_event, extra_engine_args, **kwargs):
    arg_map = kwargs
    if extra_engine_args != "":
        json_map = {}
        # extra_engine_args is a filename
        try:
            with open(extra_engine_args) as f:
                json_map = json.load(f)
        except FileNotFoundError:
            logging.error(f"File {extra_engine_args} not found.")
        except json.JSONDecodeError as e:
            logging.error(f"Invalid JSON in {extra_engine_args}: {e}")
        logging.debug(f"Adding extra engine arguments: {json_map}")
        arg_map = {**arg_map, **json_map}  # json_map gets precedence

    engine_args = AsyncEngineArgs(**arg_map)

    # Main loop
    try:
        async with build_async_engine_client_from_engine_args(
            engine_args
        ) as engine_client:
            ready_event.set()
            while True:
                req = await request_queue.get()
                if req is None:  # Stop sentinel
                    break
                (request_id, request, sampling_params, response_queue) = req
                prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
                gen = engine_client.generate(prompt, sampling_params, request_id)
                async for res in gen:
                    await response_queue.put(res)
                await response_queue.put(None)
                request_queue.task_done()
    except Exception as e:
        logging.error(f"vllm init failed: {e}")
    finally:
        logging.debug("vllm worker stopped")


async def run_response(response_queue):
    try:
        while True:
            item = await response_queue.get()
            yield item
            response_queue.task_done()
            if item is None:
                return
    except Exception as e:
        logging.error(f"failed reading response from vllm: {e}")