Unverified commit 42969800 authored by Graham King, committed by GitHub

chore: Remove embedded Python vllm and sglang engines (#966)

vllm and sglang are now the sub-process engines from #954

Also updated the docs on running vllm and sglang multi-GPU (tensor parallel) and multi-node (pipeline parallel).
parent 5d89a0c8
...@@ -1533,68 +1533,6 @@ dependencies = [ ...@@ -1533,68 +1533,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "dynamo-engine-sglang"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_7"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_8"
version = "0.2.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"regex",
"serde",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.2.0" version = "0.2.0"
...@@ -1671,18 +1609,13 @@ dependencies = [ ...@@ -1671,18 +1609,13 @@ dependencies = [
"dynamo-engine-llamacpp", "dynamo-engine-llamacpp",
"dynamo-engine-mistralrs", "dynamo-engine-mistralrs",
"dynamo-engine-python", "dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-vllm0_7",
"dynamo-engine-vllm0_8",
"dynamo-llm", "dynamo-llm",
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
"futures-util", "futures-util",
"humantime", "humantime",
"libc", "libc",
"netlink-packet-route",
"regex", "regex",
"rtnetlink",
"serde", "serde",
"serde_json", "serde_json",
"tempfile", "tempfile",
...@@ -1718,7 +1651,7 @@ dependencies = [ ...@@ -1718,7 +1651,7 @@ dependencies = [
"local-ip-address", "local-ip-address",
"log", "log",
"nid", "nid",
"nix 0.29.0", "nix",
"nuid", "nuid",
"once_cell", "once_cell",
"prometheus", "prometheus",
...@@ -3165,12 +3098,6 @@ version = "1.70.1" ...@@ -3165,12 +3098,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.5" version = "0.10.5"
...@@ -3936,70 +3863,6 @@ dependencies = [ ...@@ -3936,70 +3863,6 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "netlink-packet-core"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4"
dependencies = [
"anyhow",
"byteorder",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-route"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74c171cd77b4ee8c7708da746ce392440cb7bcf618d122ec9ecc607b12938bf4"
dependencies = [
"anyhow",
"byteorder",
"libc",
"log",
"netlink-packet-core",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-utils"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34"
dependencies = [
"anyhow",
"byteorder",
"paste",
"thiserror 1.0.69",
]
[[package]]
name = "netlink-proto"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60"
dependencies = [
"bytes",
"futures",
"log",
"netlink-packet-core",
"netlink-sys",
"thiserror 2.0.12",
]
[[package]]
name = "netlink-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23"
dependencies = [
"bytes",
"futures",
"libc",
"log",
"tokio",
]
[[package]] [[package]]
name = "nibble_vec" name = "nibble_vec"
version = "0.1.0" version = "0.1.0"
...@@ -4020,17 +3883,6 @@ dependencies = [ ...@@ -4020,17 +3883,6 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
] ]
[[package]]
name = "nix"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
"bitflags 2.9.0",
"cfg-if 1.0.0",
"libc",
]
[[package]] [[package]]
name = "nix" name = "nix"
version = "0.29.0" version = "0.29.0"
...@@ -5357,24 +5209,6 @@ dependencies = [ ...@@ -5357,24 +5209,6 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "rtnetlink"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b684475344d8df1859ddb2d395dd3dac4f8f3422a1aa0725993cb375fc5caba5"
dependencies = [
"futures",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-packet-utils",
"netlink-proto",
"netlink-sys",
"nix 0.27.1",
"thiserror 1.0.69",
"tokio",
]
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
version = "0.1.24" version = "0.1.24"
...@@ -5697,19 +5531,6 @@ dependencies = [ ...@@ -5697,19 +5531,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.219" version = "1.0.219"
......
...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS ...@@ -46,7 +46,7 @@ ARG CARGO_BUILD_JOBS
ENV CARGO_TARGET_DIR=/workspace/target ENV CARGO_TARGET_DIR=/workspace/target
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python && \ RUN cargo build --release --locked --features mistralrs,python && \
cargo doc --no-deps && \ cargo doc --no-deps && \
cp target/release/dynamo-run /usr/local/bin && \ cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \ cp target/release/http /usr/local/bin && \
......
...@@ -173,7 +173,7 @@ COPY launch /workspace/launch ...@@ -173,7 +173,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -59,7 +59,9 @@ RUN apt-get update -y && \ ...@@ -59,7 +59,9 @@ RUN apt-get update -y && \
ninja-build \ ninja-build \
pybind11-dev \ pybind11-dev \
# Rust build dependencies # Rust build dependencies
clang \
libclang-dev \ libclang-dev \
git \
# Install utilities # Install utilities
nvtop \ nvtop \
tmux \ tmux \
...@@ -305,7 +307,7 @@ COPY launch /workspace/launch ...@@ -305,7 +307,7 @@ COPY launch /workspace/launch
COPY deploy/sdk /workspace/deploy/sdk COPY deploy/sdk /workspace/deploy/sdk
# Build Rust crate binaries packaged with the wheel # Build Rust crate binaries packaged with the wheel
RUN cargo build --release --locked --features mistralrs,sglang,vllm,python \ RUN cargo build --release --locked --features mistralrs,python \
-p dynamo-run \ -p dynamo-run \
-p llmctl \ -p llmctl \
# Multiple http named crates are present in dependencies, need to specify the path # Multiple http named crates are present in dependencies, need to specify the path
......
...@@ -3,25 +3,39 @@ ...@@ -3,25 +3,39 @@
* [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm) * [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
* [Automatically download a model from Hugging Face](#use-model-from-hugging-face) * [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
* [Run a model from local file](#run-a-model-from-local-file) * [Run a model from local file](#run-a-model-from-local-file)
* [Multi-node](#multi-node) * [Distributed system](#distributed-system)
* [Compiling from Source](#compiling-from-source) * [Full usage details](#full-usage-details)
* [Setup](#setup) * [Setup](#setup)
* [mistral.rs](#mistralrs)
* [llama.cpp](#llamacpp)
* [Sglang](#sglang) * [Sglang](#sglang)
* [lama.cpp](#llama_cpp)
* [Vllm](#vllm) * [Vllm](#vllm)
* [Python bring-your-own-engine](#python-bring-your-own-engine)
* [TensorRT-LLM](#tensorrt-llm-engine) * [TensorRT-LLM](#tensorrt-llm-engine)
* [Echo Engines](#echo-engines) * [Echo Engines](#echo-engines)
* [Write your own engine in Python](#write-your-own-engine-in-python)
* [Batch mode](#batch-mode) * [Batch mode](#batch-mode)
* [Defaults](#defaults) * [Defaults](#defaults)
* [Extra engine arguments](#extra-engine-arguments) * [Extra engine arguments](#extra-engine-arguments)
`dynamo-run` is a CLI tool for exploring the Dynamo components, and an example of how to use them from Rust. It is also available as `dynamo run` if using the Python wheel. `dynamo-run` is a CLI tool for exploring the Dynamo components, and an example of how to use them from Rust. It is also available as `dynamo run` if using the Python wheel.
It supports the following engines: mistralrs, llamacpp, sglang, vllm and tensorrt-llm. `mistralrs` is the default.
Usage:
```
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]
```
Example: `dynamo run Qwen/Qwen2.5-3B-Instruct`.
Set environment variable `DYN_LOG` to adjust the logging level, e.g. `export DYN_LOG=debug`. It uses the same syntax as `RUST_LOG`.
## Quickstart with pip and vllm ## Quickstart with pip and vllm
If you used `pip` to install `dynamo` you should have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual env with vllm installed to use this. To compile from source, see "Full documentation" below. If you used `pip` to install `dynamo` you should have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual env with vllm installed to use this. To compile from source, see "Full documentation" below.
The vllm and sglang engines require [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`). Mistralrs and llamacpp do not.
### Use model from Hugging Face ### Use model from Hugging Face
This will automatically download Qwen2.5 3B from Hugging Face (6 GiB download) and start it in interactive text mode: This will automatically download Qwen2.5 3B from Hugging Face (6 GiB download) and start it in interactive text mode:
...@@ -69,18 +83,26 @@ curl localhost:8080/v1/models ...@@ -69,18 +83,26 @@ curl localhost:8080/v1/models
curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions
``` ```
### Multi-node ### Distributed System
You can run the ingress side (HTTP server and pre-processing) on one machine, for example a CPU node, and the worker on a different machine (a GPU node).
You will need [etcd](https://etcd.io/) and [nats](https://nats.io) installed and accessible from both nodes. You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstream installed and accessible from both nodes.
**Node 1:** **Node 1:**
OpenAI compliant HTTP server, optional pre-processing, worker discovery.
``` ```
dynamo run in=http out=dyn://llama3B_pool dynamo run in=http out=dyn://llama3B_pool
``` ```
**Node 2:** **Node 2:**
Vllm engine. Receives and returns requests over the network.
``` ```
dynamo run in=dyn://llama3B_pool out=vllm ~/llm_models/Llama-3.2-3B-Instruct dynamo-run in=dyn://llama3B_pool out=vllm ~/llms/Llama-3.2-3B-Instruct
``` ```
This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time. This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time.
...@@ -89,7 +111,7 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches ...@@ -89,7 +111,7 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches
Run `dynamo run --help` for more options. Run `dynamo run --help` for more options.
## Compiling from Source ## Full usage details
`dynamo-run` is what `dynamo run` executes. It is an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime`. The following guide demonstrates how you can build from source with all the features. `dynamo-run` is what `dynamo run` executes. It is an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime`. The following guide demonstrates how you can build from source with all the features.
...@@ -125,15 +147,6 @@ source $HOME/.cargo/env ...@@ -125,15 +147,6 @@ source $HOME/.cargo/env
#### Step 3: Build #### Step 3: Build
Run `cargo build` to install the `dynamo-run` binary in `target/debug`.
> **Optionally**, you can run `cargo build` from any location with arguments:
> ```
> --target-dir /path/to/target_directory` specify target_directory with write privileges
> --manifest-path /path/to/project/Cargo.toml` if cargo build is run outside of `launch/` directory
> ```
- Linux with GPU and CUDA (tested on Ubuntu): - Linux with GPU and CUDA (tested on Ubuntu):
``` ```
cargo build --features cuda cargo build --features cuda
...@@ -149,17 +162,50 @@ cargo build --features metal ...@@ -149,17 +162,50 @@ cargo build --features metal
cargo build cargo build
``` ```
Optionally you can run `cargo build` from any location with arguments:
```
--target-dir /path/to/target_directory        # specify a target_directory with write privileges
--manifest-path /path/to/project/Cargo.toml   # if cargo build is run outside of the launch/ directory
```
The binary will be called `dynamo-run` in `target/debug` The binary will be called `dynamo-run` in `target/debug`
``` ```
cd target/debug cd target/debug
``` ```
> Note: Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`.
To build for other engines, see the following sections. Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`.
### mistralrs
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run, fast to load, supports GGUF as well as safetensors, and runs well on CPU as well as GPU. For those reasons it is the default engine.
```
dynamo-run Qwen/Qwen2.5-3B-Instruct
```
is equivalent to
```
dynamo-run in=text out=mistralrs Qwen/Qwen2.5-3B-Instruct
```
### llamacpp
Currently [llama.cpp](https://github.com/ggml-org/llama.cpp) is not included by default. Build it like this:
```
cargo build --features llamacpp[,cuda|metal|vulkan] -p dynamo-run
```
```
dynamo-run out=llamacpp ~/llms/Llama-3.2-3B-Instruct-Q6_K.gguf
```
### sglang ### sglang
The [SGLang](https://docs.sglang.ai/index.html) engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`) to be running.
1. Setup the python virtual env: 1. Setup the python virtual env:
``` ```
...@@ -170,42 +216,49 @@ uv pip install sgl-kernel --force-reinstall --no-deps ...@@ -170,42 +216,49 @@ uv pip install sgl-kernel --force-reinstall --no-deps
uv pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ uv pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
``` ```
2. Build 2. Run
Any example above using `out=sglang` will work, but our sglang backend is also multi-gpu.
``` ```
cargo build --features sglang cd target/debug
./dynamo-run in=http out=sglang --model-path ~/llms/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8
``` ```
3. Run To pass extra arguments to the sglang engine see *Extra engine arguments* below.
Any example above using `out=sglang` will work, but our sglang backend is also multi-gpu and multi-node. **Multi-GPU**
**Node 1:** Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`. To specify which GPU to start from pass `--base-gpu-id <num>`.
```
cd target/debug
./dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --leader-addr 10.217.98.122:9876
```
**Node 2:** For example on a shared eight GPU machine where GPUs 0-3 are already in use:
``` ```
cd target/debug dynamo-run out=sglang <model> --tensor-parallel-size 4 --base-gpu-id 4
./dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --leader-addr 10.217.98.122:9876
``` ```
To pass extra arguments to the sglang engine see *Extra engine arguments* below. **Multi-node:**
### llama_cpp Dynamo only manages the leader node (node rank 0). The follower nodes are started in the [normal sglang way](https://docs.sglang.ai/references/deepseek.html#running-examples-on-multi-node).
Leader node:
``` ```
cargo build --features llamacpp,cuda dynamo-run out=sglang /data/models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 16 --node-rank 0 --num-nodes 2 --leader-addr 10.217.98.122:5000
cd target/debug ```
dynamo-run out=llamacpp ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
All follower nodes. Increment `node-rank` each time:
```
python3 -m sglang.launch_server --model-path /data/models/DeepSeek-R1-Distill-Llama-70B --tp 16 --dist-init-addr 10.217.98.122:5000 --nnodes 2 --node-rank 1 --trust-remote-code
``` ```
If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
- Parameters `--leader-addr` and `--dist-init-addr` must match and be the IP address of the leader node. All followers must be able to connect. SGLang uses [PyTorch Distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) for networking.
- Parameters `--tensor-parallel-size` and `--tp` must match and be the total number of GPUs across the cluster.
- `--node-rank` values must be unique consecutive integers starting at 1. The leader, managed by Dynamo, is rank 0.
### vllm ### vllm
Using the [vllm](https://github.com/vllm-project/vllm) Python library. We only use the back half of vllm, talking to it over `zmq`. Slow startup, fast inference. Supports both safetensors from HF and GGUF files. Using the [vllm](https://github.com/vllm-project/vllm) Python library. Slow startup, fast inference. Supports both safetensors from HF and GGUF files, but is very slow for GGUF - prefer llamacpp.
The vllm engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`) to be running.
We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work. We use [uv](https://docs.astral.sh/uv/) but any virtualenv manager should work.
...@@ -230,100 +283,32 @@ Inside that virtualenv: ...@@ -230,100 +283,32 @@ Inside that virtualenv:
**HF repo:** **HF repo:**
``` ```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ ./dynamo-run in=http out=vllm ~/llms/Llama-3.2-3B-Instruct/
```
**GGUF:**
``` ```
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
```
Note that vllm GGUF handling is very slow. Prefer llamacpp.
**Multi-node:**
vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
Head node (the one running `dynamo-run`): `ray start --head --port=6379 --dashboard-host 0.0.0.0`
Each worker node: `ray start --address='<HEAD_NODE_IP>:6379`
Remember to pass dynamo-run `--tensor-parallel-size <total-gpus-across-cluster>`, which is often constrained by a model dimension such as being a divisor of the number of attention heads.
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below. To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below.
### Python bring-your-own-engine **Multi-GPU**
You can provide your own engine in a Python file. The file must provide a generator with this signature:
```
async def generate(request):
```
Build: `cargo build --features python`
#### Python does the pre-processing
If the Python engine wants to receive and returns strings - it will do the prompt templating and tokenization itself - run it like this:
```
dynamo-run out=pystr:/home/user/my_python_engine.py
```
- The `request` parameter is a map, an OpenAI compatible create chat completion request: https://platform.openai.com/docs/api-reference/chat/create
- The function must `yield` a series of maps conforming to create chat completion stream response (example below).
- If using an HTTP front-end add the `--model-name` flag. This is the name we serve the model under.
The file is loaded once at startup and kept in memory.
**Example engine:** Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`.
```
import asyncio
async def generate(request):
yield {"id":"1","choices":[{"index":0,"delta":{"content":"The","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" capital","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" of","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" France","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" is","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":" Paris","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":".","role":"assistant"}}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
await asyncio.sleep(0.1)
yield {"id":"1","choices":[{"index":0,"delta":{"content":"","role":"assistant"},"finish_reason":"stop"}],"created":1841762283,"model":"Llama-3.2-3B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
```
Command line arguments are passed to the python engine like this: To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`.
```
dynamo-run out=pystr:my_python_engine.py -- -n 42 --custom-arg Orange --yes
```
The python engine receives the arguments in `sys.argv`. The argument list will include some standard ones as well as anything after the `--`. **Multi-node:**
This input: vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
```
dynamo-run out=pystr:my_engine.py /opt/models/Llama-3.2-3B-Instruct/ --model-name llama_3.2 --tensor-parallel-size 4 -- -n 1
```
is read like this: Here is an example on two 8-GPU nodes:
``` - Leader node: `ray start --head --port=6379`
async def generate(request): - Each follower node: `ray start --address='<HEAD_NODE_IP>:6379'`
.. as before .. - Leader node: `dynamo-run out=vllm ~/llms/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 16`
if __name__ == "__main__": The `--tensor-parallel-size` parameter is the total number of GPUs in the cluster. This is often constrained by a model dimension such as being a divisor of the number of attention heads.
print(f"MAIN: {sys.argv}")
```
and produces this output: Startup can be slow so you may want to `export DYN_LOG=debug` to see progress.
```
MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
```
This allows quick iteration on the engine setup. Note how the `-n` `1` is included. Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`. Shutdown: `ray stop`
#### TensorRT-LLM engine #### TensorRT-LLM engine
...@@ -345,47 +330,6 @@ Execute the following to load the TensorRT-LLM model specified in the configurat ...@@ -345,47 +330,6 @@ Execute the following to load the TensorRT-LLM model specified in the configurat
dynamo run out=pystr:/workspace/examples/tensorrt_llm/engines/trtllm_engine.py -- --engine_args /workspace/examples/tensorrt_llm/configs/llm_api_config.yaml dynamo run out=pystr:/workspace/examples/tensorrt_llm/engines/trtllm_engine.py -- --engine_args /workspace/examples/tensorrt_llm/configs/llm_api_config.yaml
``` ```
#### Dynamo does the pre-processing
If the Python engine wants to receive and return tokens - the prompt templating and tokenization is already done - run it like this:
```
dynamo-run out=pytok:/home/user/my_python_engine.py --model-path <hf-repo-checkout>
```
- The request parameter is a map that looks like this:
```
{'token_ids': [128000, 128006, 9125, 128007, ... lots more ... ], 'stop_conditions': {'max_tokens': 8192, 'stop': None, 'stop_token_ids_hidden': [128001, 128008, 128009], 'min_tokens': None, 'ignore_eos': None}, 'sampling_options': {'n': None, 'best_of': None, 'presence_penalty': None, 'frequency_penalty': None, 'repetition_penalty': None, 'temperature': None, 'top_p': None, 'top_k': None, 'min_p': None, 'use_beam_search': None, 'length_penalty': None, 'seed': None}, 'eos_token_ids': [128001, 128008, 128009], 'mdc_sum': 'f1cd44546fdcbd664189863b7daece0f139a962b89778469e4cffc9be58ccc88', 'annotations': []}
```
- The `generate` function must `yield` a series of maps that look like this:
```
{"token_ids":[791],"tokens":None,"text":None,"cum_log_probs":None,"log_probs":None,"finish_reason":None}
```
- Command like flag `--model-path` which must point to a Hugging Face repo checkout containing the `tokenizer.json`. The `--model-name` flag is optional. If not provided we use the HF repo name (directory name) as the model name.
**Example engine:**
```
import asyncio
async def generate(request):
yield {"token_ids":[791]}
await asyncio.sleep(0.1)
yield {"token_ids":[6864]}
await asyncio.sleep(0.1)
yield {"token_ids":[315]}
await asyncio.sleep(0.1)
yield {"token_ids":[9822]}
await asyncio.sleep(0.1)
yield {"token_ids":[374]}
await asyncio.sleep(0.1)
yield {"token_ids":[12366]}
await asyncio.sleep(0.1)
yield {"token_ids":[13]}
```
`pytok` supports the same ways of passing command line arguments as `pystr` - `initialize` or `main` with `sys.argv`.
### Echo Engines ### Echo Engines
Dynamo includes two echo engines for testing and debugging purposes: Dynamo includes two echo engines for testing and debugging purposes:
...@@ -445,9 +389,78 @@ The output looks like this: ...@@ -445,9 +389,78 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855} {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
``` ```
### Write your own engine in Python
Note: This section replaces "bring-your-own-engine".
The [dynamo](https://pypi.org/project/ai-dynamo/) Python library allows you to build your own engine and attach it to Dynamo.
The Python file must do three things:
1. Decorate a function to get the runtime
2. Register on the network
3. Attach a request handler
```
import asyncio

import uvloop

from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker

# 1. Decorate a function to get the runtime
#
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    # 2. Register ourselves on the network
    #
    component = runtime.namespace("namespace").component("component")
    await component.create_service()
    model_path = "Qwen/Qwen2.5-0.5B-Instruct"  # or "/data/models/Qwen2.5-0.5B-Instruct"
    model_type = ModelType.Backend
    endpoint = component.endpoint("endpoint")
    await register_llm(endpoint, model_path, model_type)

    # Initialize your engine here
    # engine = ...

    # 3. Attach request handler
    #
    await endpoint.serve_endpoint(RequestHandler(engine).generate, None)

class RequestHandler:
    def __init__(self, engine):
        ...

    async def generate(self, request):
        # Call the engine
        # yield result dict
        ...

if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
```
The `model_path` can be:
- A HuggingFace repo ID. It will be downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that.
The `model_type` can be:
- ModelType.Backend. Dynamo handles pre-processing. Your `generate` method receives a `request` dict containing a `token_ids` array of ints. It must yield dicts that also contain a `token_ids` array and an optional `finish_reason` string; see the sketch after this list.
- ModelType.Chat. Your `generate` method receives a `request` and must yield response dicts of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). Your engine handles pre-processing.
- ModelType.Completion. Your `generate` method receives a `request` and must yield response dicts of the older [Completions](https://platform.openai.com/docs/api-reference/completions) type. Your engine handles pre-processing.
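The sketch below is a minimal, hypothetical `ModelType.Backend` handler that streams the prompt tokens back in small chunks instead of calling a real engine. The class name and chunk size are illustrative only; a real handler would forward `request["token_ids"]` to your engine and yield the generated tokens.

```
import asyncio

class EchoBackendHandler:
    """Hypothetical ModelType.Backend handler, usable in place of RequestHandler above."""

    async def generate(self, request):
        # Dynamo has already done pre-processing: the prompt arrives as token IDs.
        prompt_tokens = request["token_ids"]

        # Stream the prompt back in chunks of four tokens to mimic incremental generation.
        for i in range(0, len(prompt_tokens), 4):
            yield {"token_ids": prompt_tokens[i : i + 4]}
            await asyncio.sleep(0.01)  # simulate per-chunk generation latency

        # Final chunk carries no new tokens, only the stop reason.
        yield {"token_ids": [], "finish_reason": "stop"}
```

You would pass `EchoBackendHandler().generate` to `endpoint.serve_endpoint` in the worker above instead of `RequestHandler(engine).generate`.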
Here are some example engines:
- [vllm simple](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_vllm.py)
- [sglang simple](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang.py)
- [vllm](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/vllm_inc.py)
- [sglang](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/sglang_inc.py)
### Defaults ### Defaults
The input defaults to `in=text`. The output will default to `mistralrs` engine. If not available whatever engine you have compiled in (so depending on `--features`). The input defaults to `in=text`. The output will default to `out=mistralrs` engine, unless it is disabled with `--no-default-features` in which case vllm is used.
### Extra engine arguments ### Extra engine arguments
...@@ -463,5 +476,5 @@ Put the arguments in a JSON file: ...@@ -463,5 +476,5 @@ Put the arguments in a JSON file:
Pass it like this: Pass it like this:
``` ```
dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json dynamo-run out=sglang ~/llms/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
``` ```
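As a concrete illustration of the JSON file's shape, the snippet below writes a hypothetical `sglang_extra.json`. The argument names (`context_length`, `mem_fraction_static`) are assumptions; verify them against sglang's server arguments before use.

```
import json

# Hypothetical extra arguments; check sglang's server arguments for the real names.
extra_args = {
    "context_length": 16384,
    "mem_fraction_static": 0.8,
}

with open("sglang_extra.json", "w") as f:
    json.dump(extra_args, f, indent=2)
```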
...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI" ...@@ -26,12 +26,9 @@ description = "Dynamo Run CLI"
[features] [features]
# Build with `--no-default-features` to disable these defaults # Build with `--no-default-features` to disable these defaults
# We don't include llamacpp by default until we figure out when it needs external libraries default = ["mistralrs"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"] mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"] llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
python = ["dep:dynamo-engine-python"] python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"] cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true } ...@@ -44,9 +41,6 @@ dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true } dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true } dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true } dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true } anyhow = { workspace = true }
...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] } ...@@ -68,15 +62,3 @@ clap = { version = "4.5", features = ["derive", "env"] }
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
futures-util = { version = "0.3" } futures-util = { version = "0.3" }
regex = "1" regex = "1"
[target.x86_64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.x86_64-unknown-linux-musl.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
[target.aarch64-unknown-linux-gnu.dependencies]
netlink-packet-route = { version = "0.19", optional = true }
rtnetlink = { version = "0.14", optional = true }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr;
use clap::ValueEnum; use clap::ValueEnum;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
...@@ -106,21 +105,6 @@ pub struct Flags { ...@@ -106,21 +105,6 @@ pub struct Flags {
#[arg(long, default_value = "round-robin")] #[arg(long, default_value = "round-robin")]
pub router_mode: RouterMode, pub router_mode: RouterMode,
/// Internal use only.
// Start the python vllm engine sub-process.
#[arg(long, hide = true, default_value = "false")]
pub internal_vllm_process: bool,
/// Internal use only.
/// Start the sglang Python sub-process.
/// The params in the tuple are:
/// - the fd of the write end of a pipe where sglang will signal that it's ready.
/// - the node rank (0 for first host, 1 for second host, etc)
/// - the workers' rank (globally unique)
/// - the GPU to use (locally unique)
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
pub internal_sglang_process: Option<SgLangFlags>,
/// Additional engine-specific arguments from a JSON file. /// Additional engine-specific arguments from a JSON file.
/// Contains a mapping of parameter names to values. /// Contains a mapping of parameter names to values.
#[arg(long)] #[arg(long)]
...@@ -200,30 +184,6 @@ impl Flags { ...@@ -200,30 +184,6 @@ impl Flags {
} }
} }
#[derive(Debug, Clone, Copy)]
pub struct SgLangFlags {
pub pipe_fd: u32,
pub tp_rank: u32,
pub gpu_id: u32,
}
fn parse_sglang_flags(s: &str) -> Result<SgLangFlags, String> {
let nums: Vec<u32> = s
.split(',')
.map(u32::from_str)
.collect::<Result<Vec<_>, _>>()
.map_err(|e| e.to_string())?;
if nums.len() != 3 {
return Err("Need exactly 3 numbers".into());
}
Ok(SgLangFlags {
pipe_fd: nums[0],
tp_rank: nums[1],
gpu_id: nums[2],
})
}
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)] #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)]
pub enum RouterMode { pub enum RouterMode {
#[default] #[default]
......
...@@ -183,7 +183,6 @@ pub async fn prepare_engine( ...@@ -183,7 +183,6 @@ pub async fn prepare_engine(
_cache_dir: None, _cache_dir: None,
}) })
} }
EngineConfig::None => unreachable!(),
} }
} }
......
...@@ -91,7 +91,6 @@ pub async fn run( ...@@ -91,7 +91,6 @@ pub async fn run(
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
} }
EngineConfig::None => unreachable!(),
}; };
tokio::select! { tokio::select! {
......
...@@ -97,7 +97,6 @@ pub async fn run( ...@@ -97,7 +97,6 @@ pub async fn run(
.await?; .await?;
manager.add_completions_model(model.service_name(), cmpl_pipeline)?; manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
} }
EngineConfig::None => unreachable!(),
} }
http_service.run(runtime.primary_token()).await?; http_service.run(runtime.primary_token()).await?;
runtime.shutdown(); // Cancel primary token runtime.shutdown(); // Cancel primary token
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use std::{io::Read, sync::Arc, time::Duration}; use std::{io::Read, sync::Arc, time::Duration};
use anyhow::Context; use anyhow::Context;
use dynamo_llm::{ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, LocalModel};
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
LocalModel,
};
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
mod input; mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt; mod opt;
pub use dynamo_llm::request_template::RequestTemplate; pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::{Input, Output}; pub use opt::{Input, Output};
...@@ -38,19 +20,12 @@ mod subprocess; ...@@ -38,19 +20,12 @@ mod subprocess;
/// the command line. Hence it's optional, and defaults to this. /// the command line. Hence it's optional, and defaults to this.
const INVISIBLE_MODEL_NAME: &str = "dynamo-run"; const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
/// The component name for the KV publisher, if used
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2); const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
/// How we identify a python string endpoint /// How we identify a python string endpoint
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_STR_SCHEME: &str = "pystr:"; const PYTHON_STR_SCHEME: &str = "pystr:";
/// How we identify a python token endpoint
#[cfg(feature = "python")]
const PYTHON_TOK_SCHEME: &str = "pytok:";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
...@@ -66,24 +41,13 @@ pub enum EngineConfig { ...@@ -66,24 +41,13 @@ pub enum EngineConfig {
engine: ExecutionContext, engine: ExecutionContext,
model: Box<LocalModel>, model: Box<LocalModel>,
}, },
/// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work.
None,
}
/// Distributed system values
struct DynInput {
endpoint_id: Endpoint,
distributed_runtime: DistributedRuntime,
} }
#[allow(unused_mut)]
pub async fn run( pub async fn run(
runtime: dynamo_runtime::Runtime, runtime: dynamo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it in_opt: Input,
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
#[allow(unused_variables)] zmq_socket_prefix: Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags let maybe_path = flags
...@@ -120,29 +84,6 @@ pub async fn run( ...@@ -120,29 +84,6 @@ pub async fn run(
} }
}; };
let dyn_input = match &in_opt {
Input::Endpoint(endpoint_path) => {
if maybe_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false)
&& flags.model_config.is_none()
{
// TODO We need to convert tokenizer extract from GGUF file into something we can
// publish to NATS. Ideally `tokenizer.json` directly, but otherwise an
// intermediate format.
tracing::error!("Serving GGUF files in a distributed system requires `--model-config <hf-repo-dir>` so that we can find the tokenzier config");
return Ok(());
}
// If we are in a distributed system, we need to know our component upfront
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
let endpoint_id: Endpoint = endpoint_path.parse()?;
Some(DynInput {
endpoint_id,
distributed_runtime,
})
}
_ => None,
};
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
let template = if let Some(path) = flags.request_template.as_ref() { let template = if let Some(path) = flags.request_template.as_ref() {
...@@ -183,13 +124,17 @@ pub async fn run( ...@@ -183,13 +124,17 @@ pub async fn run(
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?, engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
model: Box::new(local_model), model: Box::new(local_model),
}, },
Output::SgLang => { Output::SgLang => {
if !local_model.path().is_dir() { if !local_model.path().is_dir() {
// TODO Does sglang support GGUF? Can we make it work? // TODO Does sglang support GGUF? Can we make it work?
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
let (py_script, mut child) = match subprocess::start( let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let (py_script, child) = match subprocess::start(
subprocess::sglang::PY, subprocess::sglang::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -198,6 +143,11 @@ pub async fn run( ...@@ -198,6 +143,11 @@ pub async fn run(
} else { } else {
Some(flags.base_gpu_id) Some(flags.base_gpu_id)
}, },
if flags.num_nodes <= 1 {
None
} else {
Some(multi_node_conf)
},
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -216,151 +166,16 @@ pub async fn run( ...@@ -216,151 +166,16 @@ pub async fn run(
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?; let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
#[cfg(feature = "sglang")]
Output::SgLangLegacy => {
if !local_model.path().is_dir() {
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see 'gloo' errors from sglang try setting these environment variables:");
tracing::info!("export GLOO_SOCKET_IFNAME={if_name}");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Follower nodes take input from leader node over pytorch distributed, not
// from user.
in_opt = Input::None;
}
}
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.base_gpu_id,
flags.extra_engine_args.clone(),
)
.await?;
extra = Some(Box::pin(async move {
let _ = sglang_process.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_7 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix");
};
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
if node_conf.num_nodes > 1 {
if let Ok(Some(if_name)) = net::get_primary_interface().await {
tracing::info!("If you see network errors from vllm try setting this environment variable:");
tracing::info!("export NCCL_SOCKET_IFNAME={if_name}");
}
if node_conf.node_rank != 0 {
// Only node 0 runs vllm, the others communicate over ray
in_opt = Input::None;
}
}
if node_conf.node_rank == 0 {
let kv_metrics_publisher = if let Some(dyn_input) = &dyn_input {
let kvp_component = dyn_input
.distributed_runtime
.namespace(dyn_input.endpoint_id.namespace.clone())?
.component(KV_PUBLISHER_COMPONENT)?;
let kvp = Arc::new(KvMetricsPublisher::new()?);
let kvp_inner = kvp.clone();
tokio::spawn(
async move { kvp_inner.create_endpoint(kvp_component, None).await },
);
Some(kvp)
} else {
None
};
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
cancel_token.clone(),
local_model.path(),
&sock_prefix,
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
kv_metrics_publisher,
)
.await?;
extra = Some(Box::pin(async move {
let _ = vllm_future.await;
}));
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future =
dynamo_engine_vllm0_7::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "vllm")]
Output::Vllm0_8 => {
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.clone().unwrap_or_default(),
};
let engine = dynamo_engine_vllm0_8::make_engine(
cancel_token.clone(),
local_model.path(),
node_conf,
flags.tensor_parallel_size,
flags.extra_engine_args.clone(),
)
.await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
// No feature flag because it uses a sub-process, it's very cheap to include
Output::Vllm => { Output::Vllm => {
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let (py_script, mut child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::vllm::PY, subprocess::vllm::PY,
local_model.path(), local_model.path(),
flags.tensor_parallel_size, flags.tensor_parallel_size,
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
None, // multi-node config. vllm uses `ray`, see guide
flags.extra_engine_args.as_deref(), flags.extra_engine_args.as_deref(),
) )
.await .await
...@@ -405,18 +220,6 @@ pub async fn run( ...@@ -405,18 +220,6 @@ pub async fn run(
model: Box::new(local_model), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
let card = local_model.card();
let py_args = flags.as_vec(&path_str, &card.service_name);
let p = std::path::PathBuf::from(path_str);
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
}; };
match in_opt { match in_opt {
...@@ -443,16 +246,8 @@ pub async fn run( ...@@ -443,16 +246,8 @@ pub async fn run(
.await?; .await?;
} }
Input::Endpoint(path) => { Input::Endpoint(path) => {
let Some(dyn_input) = dyn_input else { let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
unreachable!("We set dyn_input earlier"); crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
};
crate::input::endpoint::run(dyn_input.distributed_runtime, path, engine_config).await?;
}
Input::None => {
// Multi-node setup. The engine sub-process has been started and is talking
// to it's node_rank 0 controller. We do nothing.
// TODO: Acquire an etcd lease, we are running
cancel_token.cancelled().await;
} }
} }
......
...@@ -24,15 +24,13 @@ const HELP: &str = r#" ...@@ -24,15 +24,13 @@ const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally. dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
Example: Example:
- cargo build --release --features mistralrs,cuda - cargo build --features cuda -p dynamo-run
- cd target/release - cd target/debug
- ./dynamo-run hf_checkouts/Llama-3.2-3B-Instruct/ - ./dynamo-run Qwen/Qwen2.5-3B-Instruct
- OR: ./dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf - OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#; "#;
const ZMQ_SOCKET_PREFIX: &str = "dyn"; const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=ENGINE_LIST [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> { ...@@ -56,72 +54,6 @@ fn main() -> anyhow::Result<()> {
logging::init(); logging::init();
// Call sub-processes before starting the Runtime machinery
// For anything except sub-process starting try_parse_from will error.
if let Ok(flags) = dynamo_run::Flags::try_parse_from(env::args()) {
#[allow(unused_variables)]
if let Some(sglang_flags) = flags.internal_sglang_process {
let Some(model_path) = flags.model_path_flag.as_ref() else {
anyhow::bail!("sglang subprocess requires --model-path");
};
if !model_path.is_dir() {
anyhow::bail!("sglang subprocess requires model path to be a directory containing the safetensors files");
}
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
};
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
node_config,
gpu_config,
flags.extra_engine_args,
);
}
} else {
panic!("Rebuild with --features=sglang");
}
}
#[allow(unused_variables)]
if flags.internal_vllm_process {
let Some(model_path) = flags.model_path_flag else {
anyhow::bail!("vllm subprocess requires --model-path flag");
};
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return dynamo_engine_vllm0_7::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
flags.tensor_parallel_size,
flags.extra_engine_args,
flags.router_mode.is_kv_routing(),
);
}
} else {
panic!("Rebuild with --features=vllm");
}
}
}
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?; let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -195,14 +127,7 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
.chain(env::args().skip(non_flag_params)), .chain(env::args().skip(non_flag_params)),
)?; )?;
dynamo_run::run( dynamo_run::run(runtime, in_opt, out_opt, flags).await
runtime,
in_opt,
out_opt,
flags,
Some(ZMQ_SOCKET_PREFIX.to_string()),
)
.await
} }
/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it. /// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Mac build uses none of this
#![allow(dead_code)]
#[cfg(target_os = "linux")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
unix::get_primary_interface().await
}
#[cfg(target_os = "macos")]
pub async fn get_primary_interface() -> Result<Option<String>, LinkDataError> {
Ok(None)
}
#[derive(Debug)]
pub struct LinkDataError {
kind: LinkDataErrorKind,
interface: Option<String>,
}
impl LinkDataError {
fn connection(connection_error: std::io::Error) -> Self {
let kind = LinkDataErrorKind::Connection(connection_error);
let interface = None;
Self { kind, interface }
}
#[cfg(target_os = "linux")]
fn communication(communication_error: rtnetlink::Error) -> Self {
let kind = LinkDataErrorKind::Communication(communication_error);
let interface = None;
Self { kind, interface }
}
}
impl std::fmt::Display for LinkDataError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let err_message = "could not get interface link data";
if let Some(interface) = self.interface.as_ref() {
write!(f, "{err_message} for {interface}")
} else {
write!(f, "{err_message}")
}
}
}
impl std::error::Error for LinkDataError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind {
LinkDataErrorKind::Connection(ref e) => Some(e),
#[cfg(target_os = "linux")]
LinkDataErrorKind::Communication(ref e) => Some(e),
}
}
}
#[derive(Debug)]
pub enum LinkDataErrorKind {
Connection(std::io::Error),
#[cfg(target_os = "linux")]
Communication(rtnetlink::Error),
}
#[cfg(target_os = "linux")]
mod unix {
use futures_util::TryStreamExt;
use netlink_packet_route::address::AddressAttribute;
use netlink_packet_route::link::LinkLayerType;
use netlink_packet_route::link::State as LinkState;
use netlink_packet_route::link::{LinkAttribute, LinkMessage};
use netlink_packet_route::AddressFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use std::collections::VecDeque;
pub async fn get_primary_interface() -> Result<Option<String>, super::LinkDataError> {
let mut candidates: VecDeque<String> = get_ipv4_interface_links()
.await?
.into_iter()
.filter(|(k, v)| {
v.is_ethernet() && v.link_is_up() && v.has_carrier() && k.starts_with("e")
})
.map(|(k, _)| k)
.collect();
Ok(candidates.pop_front())
}
#[derive(Clone, Debug)]
// Some of the fields are Option<T> because the netlink protocol allows them
// to be absent (even though we have no reason to believe they'd ever actually
// be missing).
struct InterfaceLinkData {
link_type: LinkLayerType,
state: Option<LinkState>,
has_carrier: bool,
}
impl InterfaceLinkData {
pub fn link_is_up(&self) -> bool {
self.state
.map(|state| matches!(state, LinkState::Up))
.unwrap_or(false)
}
pub fn is_ethernet(&self) -> bool {
matches!(self.link_type, LinkLayerType::Ether)
}
pub fn has_carrier(&self) -> bool {
self.has_carrier
}
}
impl From<LinkMessage> for InterfaceLinkData {
fn from(link_message: LinkMessage) -> Self {
let link_type = link_message.header.link_layer_type;
let state = link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::OperState(state) => Some(*state),
_ => None,
});
let has_carrier = link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::Carrier(1) => Some(true),
_ => None,
})
.unwrap_or(false);
InterfaceLinkData {
link_type,
state,
has_carrier,
}
}
}
// Retrieve the link data (state, MTU, etc.) for all interfaces, and return
// them as a HashMap keyed by interface name. This is roughly equivalent to `ip
// link show` since we're using the same netlink interface under the hood as
// that command.
async fn get_ipv4_interface_links(
) -> Result<HashMap<String, InterfaceLinkData>, super::LinkDataError> {
let (netlink_connection, rtnetlink_handle, _receiver) =
rtnetlink::new_connection().map_err(super::LinkDataError::connection)?;
// We have to spawn off the netlink connection because of the architecture
// of `netlink_proto::Connection`, which runs in the background and owns
// the socket. We communicate with it via channel messages, and it will exit
// when both `rtnetlink_handle` and `_receiver` go out of scope.
tokio::spawn(netlink_connection);
let address_handle = rtnetlink_handle.address().get().execute();
let ipv4s: HashSet<String> = address_handle
.try_filter_map(|addr_message| async move {
if matches!(addr_message.header.family, AddressFamily::Inet) {
Ok(addr_message
.attributes
.into_iter()
.find(|attr| matches!(attr, AddressAttribute::Label(_)))
.and_then(|x| match x {
AddressAttribute::Label(label) => Some(label),
_ => None,
}))
} else {
Ok(None)
}
})
.try_collect()
.await
.map_err(super::LinkDataError::communication)?;
let link_handle = rtnetlink_handle.link().get().execute();
link_handle
.try_filter_map(|link_message| async {
let maybe_interface_data = match extract_interface_name(&link_message) {
Some(interface_name) => {
if ipv4s.contains(&interface_name) {
Some((interface_name, InterfaceLinkData::from(link_message)))
} else {
None
}
}
None => {
let idx = link_message.header.index;
eprintln!(
"Network interface with index {idx} doesn't have a name (no IfName attribute)"
);
None
}
};
Ok(maybe_interface_data)
})
.try_collect()
.await
.map_err(super::LinkDataError::communication)
}
fn extract_interface_name(link_message: &LinkMessage) -> Option<String> {
link_message
.attributes
.iter()
.find_map(|attribute| match attribute {
LinkAttribute::IfName(name) => Some(name.clone()),
_ => None,
})
}
}
...@@ -35,11 +35,6 @@ pub enum Input { ...@@ -35,11 +35,6 @@ pub enum Input {
/// Batch mode. Run all the prompts, write the outputs, exit. /// Batch mode. Run all the prompts, write the outputs, exit.
Batch(PathBuf), Batch(PathBuf),
/// Start the engine but don't provide any way to talk to it.
/// For multi-node sglang, where the engine connects directly
/// to the co-ordinator via torch distributed / nccl.
None,
} }
impl TryFrom<&str> for Input { impl TryFrom<&str> for Input {
...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input { ...@@ -50,7 +45,6 @@ impl TryFrom<&str> for Input {
"http" => Ok(Input::Http), "http" => Ok(Input::Http),
"text" => Ok(Input::Text), "text" => Ok(Input::Text),
"stdin" => Ok(Input::Stdin), "stdin" => Ok(Input::Stdin),
"none" => Ok(Input::None),
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => { endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
Ok(Input::Endpoint(endpoint_path.to_string())) Ok(Input::Endpoint(endpoint_path.to_string()))
} }
...@@ -71,7 +65,6 @@ impl fmt::Display for Input { ...@@ -71,7 +65,6 @@ impl fmt::Display for Input {
Input::Stdin => "stdin", Input::Stdin => "stdin",
Input::Endpoint(path) => path, Input::Endpoint(path) => path,
Input::Batch(path) => &path.display().to_string(), Input::Batch(path) => &path.display().to_string(),
Input::None => "none",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -101,39 +94,21 @@ pub enum Output { ...@@ -101,39 +94,21 @@ pub enum Output {
/// Run inference on a model in a GGUF file using mistralrs w/ candle /// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs, MistralRs,
#[cfg(feature = "sglang")]
/// Deprecated
SgLangLegacy,
/// Run inference using sglang
SgLang,
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
/// Run inference using llama.cpp /// Run inference using llama.cpp
LlamaCpp, LlamaCpp,
/// Run inference using sglang
SgLang,
// Start vllm in a sub-process connecting via nats // Start vllm in a sub-process connecting via nats
// Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>` // Sugar for `python vllm_inc.py --endpoint <thing> --model <thing>`
Vllm, Vllm,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.8.X+
Vllm0_8,
#[cfg(feature = "vllm")]
/// Run inference using vllm 0.7.X
Vllm0_7,
/// Run inference using a user supplied python file that accepts and returns /// Run inference using a user supplied python file that accepts and returns
/// strings. It does its own pre-processing. /// strings. It does its own pre-processing.
#[cfg(feature = "python")] #[cfg(feature = "python")]
PythonStr(String), PythonStr(String),
/// Run inference using a user supplied python file that accepts and returns
/// tokens. We do the pre-processing.
#[cfg(feature = "python")]
PythonTok(String),
//
// DEVELOPER NOTE // DEVELOPER NOTE
// If you add an engine, add it to `available_engines` below, and to Default if it makes sense // If you add an engine, add it to `available_engines` below, and to Default if it makes sense
} }
...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output { ...@@ -146,21 +121,12 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
"mistralrs" => Ok(Output::MistralRs), "mistralrs" => Ok(Output::MistralRs),
#[cfg(feature = "sglang")]
"sglang_legacy" => Ok(Output::SgLangLegacy),
"sglang" => Ok(Output::SgLang),
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
"llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp), "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
"sglang" => Ok(Output::SgLang),
"vllm" => Ok(Output::Vllm), "vllm" => Ok(Output::Vllm),
#[cfg(feature = "vllm")]
"vllm0_8" => Ok(Output::Vllm0_8),
#[cfg(feature = "vllm")]
"vllm0_7" => Ok(Output::Vllm0_7),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output { ...@@ -177,14 +143,6 @@ impl TryFrom<&str> for Output {
Ok(Output::PythonStr(path.to_string())) Ok(Output::PythonStr(path.to_string()))
} }
#[cfg(feature = "python")]
python_tok_gen if python_tok_gen.starts_with(crate::PYTHON_TOK_SCHEME) => {
let path = python_tok_gen
.strip_prefix(crate::PYTHON_TOK_SCHEME)
.unwrap();
Ok(Output::PythonTok(path.to_string()))
}
e => Err(anyhow::anyhow!("Invalid out= option '{e}'")), e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
} }
} }
...@@ -196,21 +154,12 @@ impl fmt::Display for Output { ...@@ -196,21 +154,12 @@ impl fmt::Display for Output {
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => "mistralrs", Output::MistralRs => "mistralrs",
#[cfg(feature = "sglang")]
Output::SgLangLegacy => "sglang_legacy",
Output::SgLang => "sglang",
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => "llamacpp", Output::LlamaCpp => "llamacpp",
Output::SgLang => "sglang",
Output::Vllm => "vllm", Output::Vllm => "vllm",
#[cfg(feature = "vllm")]
Output::Vllm0_8 => "vllm0_8",
#[cfg(feature = "vllm")]
Output::Vllm0_7 => "vllm0_7",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
...@@ -218,9 +167,6 @@ impl fmt::Display for Output { ...@@ -218,9 +167,6 @@ impl fmt::Display for Output {
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(_) => "pystr", Output::PythonStr(_) => "pystr",
#[cfg(feature = "python")]
Output::PythonTok(_) => "pytok",
}; };
write!(f, "{s}") write!(f, "{s}")
} }
...@@ -258,22 +204,11 @@ impl Output { ...@@ -258,22 +204,11 @@ impl Output {
} }
out.push(Output::SgLang.to_string()); out.push(Output::SgLang.to_string());
#[cfg(feature = "sglang")]
{
out.push(Output::SgLangLegacy.to_string());
}
out.push(Output::Vllm.to_string()); out.push(Output::Vllm.to_string());
#[cfg(feature = "vllm")]
{
out.push(Output::Vllm0_7.to_string());
out.push(Output::Vllm0_8.to_string());
}
#[cfg(feature = "python")] #[cfg(feature = "python")]
{ {
out.push(Output::PythonStr("file.py".to_string()).to_string()); out.push(Output::PythonStr("file.py".to_string()).to_string());
out.push(Output::PythonTok("file.py".to_string()).to_string());
} }
out out
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow; use std::borrow::Cow;
use std::io::Write; use std::io::Write;
...@@ -23,6 +11,8 @@ use anyhow::Context; ...@@ -23,6 +11,8 @@ use anyhow::Context;
use regex::Regex; use regex::Regex;
use tokio::io::AsyncBufReadExt; use tokio::io::AsyncBufReadExt;
use dynamo_llm::engines::MultiNodeConfig;
pub mod sglang; pub mod sglang;
pub mod vllm; pub mod vllm;
...@@ -39,6 +29,8 @@ pub async fn start( ...@@ -39,6 +29,8 @@ pub async fn start(
// sglang which GPU to start from, on a multi-GPU system // sglang which GPU to start from, on a multi-GPU system
// vllm uses CUDA_VISIBLE_DEVICES // vllm uses CUDA_VISIBLE_DEVICES
base_gpu_id: Option<u32>, base_gpu_id: Option<u32>,
// sglang multi-node config. vllm uses `ray` externally
multi_node_config: Option<MultiNodeConfig>,
// Path to a JSON file containing extra arguments to the backend engine // Path to a JSON file containing extra arguments to the backend engine
extra_engine_args: Option<&Path>, extra_engine_args: Option<&Path>,
) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> { ) -> anyhow::Result<(tempfile::TempPath, tokio::process::Child)> {
...@@ -61,6 +53,15 @@ pub async fn start( ...@@ -61,6 +53,15 @@ pub async fn start(
args.push("--base-gpu-id".to_string()); args.push("--base-gpu-id".to_string());
args.push(base_gpu_id.to_string()); args.push(base_gpu_id.to_string());
} }
// sglang only
if let Some(multi_node_config) = multi_node_config {
args.push("--nnodes".to_string());
args.push(multi_node_config.num_nodes.to_string());
args.push("--node-rank".to_string());
args.push(multi_node_config.node_rank.to_string());
args.push("--dist-init-addr".to_string());
args.push(multi_node_config.leader_addr);
}
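// Illustrative note (values are hypothetical, not part of this change): a
// two-node run would extend the command line with
// `--nnodes 2 --node-rank 0 --dist-init-addr 192.168.0.2:25000`, which the
// Python sglang worker parses via argparse further down in this diff.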
if let Some(extra_engine_args) = extra_engine_args { if let Some(extra_engine_args) = extra_engine_args {
args.push("--extra-engine-args".to_string()); args.push("--extra-engine-args".to_string());
args.push(extra_engine_args.to_string_lossy().to_string()); args.push(extra_engine_args.to_string_lossy().to_string());
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# #
# A very basic example of sglang worker handling pre-processed requests. # A very basic example of sglang worker handling pre-processed requests.
...@@ -52,6 +39,9 @@ class Config: ...@@ -52,6 +39,9 @@ class Config:
model: str model: str
base_gpu_id: int base_gpu_id: int
tensor_parallel_size: int tensor_parallel_size: int
nnodes: int
node_rank: int
dist_init_addr: str
extra_engine_args: str extra_engine_args: str
...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -111,6 +101,13 @@ async def init(runtime: DistributedRuntime, config: Config):
"tp_size": config.tensor_parallel_size, "tp_size": config.tensor_parallel_size,
"base_gpu_id": config.base_gpu_id, "base_gpu_id": config.base_gpu_id,
} }
if config.dist_init_addr != "":
arg_map["trust_remote_code"] = True
arg_map["nnodes"] = config.nnodes
arg_map["dist_init_addr"] = config.dist_init_addr
# In practice this is always 0 because Dynamo only manages the leader
arg_map["node_rank"] = config.node_rank
if config.extra_engine_args != "": if config.extra_engine_args != "":
json_map = {} json_map = {}
# extra_engine_args is a filename # extra_engine_args is a filename
...@@ -157,6 +154,21 @@ def cmd_line_args(): ...@@ -157,6 +154,21 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--nnodes", type=int, default=1, help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
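# Note: with the defaults above (nnodes=1, node_rank=0, dist_init_addr=""),
# the multi-node branch in init() is skipped and the worker behaves as a
# single-node sglang engine.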
parser.add_argument( parser.add_argument(
"--extra-engine-args", "--extra-engine-args",
type=str, type=str,
...@@ -183,6 +195,9 @@ def cmd_line_args(): ...@@ -183,6 +195,9 @@ def cmd_line_args():
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.base_gpu_id = args.base_gpu_id config.base_gpu_id = args.base_gpu_id
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.nnodes = args.nnodes
config.node_rank = args.node_rank
config.dist_init_addr = args.dist_init_addr
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
return config return config
......
...@@ -66,19 +66,20 @@ class RequestHandler: ...@@ -66,19 +66,20 @@ class RequestHandler:
Request handler for the generate endpoint Request handler for the generate endpoint
""" """
def __init__(self, engine): def __init__(self, engine, default_sampling_params):
self.engine_client = engine self.engine_client = engine
self.default_sampling_params = default_sampling_params
async def generate(self, request): async def generate(self, request):
request_id = "1" # hello_world example only request_id = "1" # hello_world example only
logging.debug(f"Received request: {request}") logging.debug(f"Received request: {request}")
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams(
temperature=request["sampling_options"]["temperature"], sampling_params = SamplingParams(**self.default_sampling_params)
# vllm defaults this to 16 sampling_params.temperature = request["sampling_options"]["temperature"]
max_tokens=request["stop_conditions"]["max_tokens"], sampling_params.max_tokens = request["stop_conditions"]["max_tokens"]
)
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
gen = self.engine_client.generate(prompt, sampling_params, request_id) gen = self.engine_client.generate(prompt, sampling_params, request_id)
async for res in gen: async for res in gen:
...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -142,13 +143,18 @@ async def init(runtime: DistributedRuntime, config: Config):
arg_map = {**arg_map, **json_map} # json_map gets precedence arg_map = {**arg_map, **json_map} # json_map gets precedence
engine_args = AsyncEngineArgs(**arg_map) engine_args = AsyncEngineArgs(**arg_map)
model_config = engine_args.create_model_config()
# Load default sampling params from `generation_config.json`
default_sampling_params = model_config.get_diff_sampling_param()
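# get_diff_sampling_param() returns only the values that generation_config.json
# overrides (it may be an empty dict), so SamplingParams(**default_sampling_params)
# in RequestHandler.generate falls back to vLLM's defaults for anything not set there.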
engine_context = build_async_engine_client_from_engine_args(engine_args) engine_context = build_async_engine_client_from_engine_args(engine_args)
engine_client = await engine_context.__aenter__() engine_client = await engine_context.__aenter__()
// the server will gracefully shut down (i.e., let opened TCP streams finish) // the server will gracefully shut down (i.e., let opened TCP streams finish)
# after the lease is revoked # after the lease is revoked
await endpoint.serve_endpoint(RequestHandler(engine_client).generate, None) await endpoint.serve_endpoint(
RequestHandler(engine_client, default_sampling_params).generate, None
)
def cmd_line_args(): def cmd_line_args():
......
...@@ -36,7 +36,6 @@ use tokio::sync::mpsc; ...@@ -36,7 +36,6 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender; use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt}; use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::engines::{EngineDispatcher, StreamingEngine}; use dynamo_llm::engines::{EngineDispatcher, StreamingEngine};
/// Python snippet to import a file as a module /// Python snippet to import a file as a module
...@@ -89,26 +88,6 @@ pub async fn make_string_engine( ...@@ -89,26 +88,6 @@ pub async fn make_string_engine(
Ok(engine) Ok(engine)
} }
/// An engine that takes and returns tokens.
pub async fn make_token_engine(
cancel_token: CancellationToken,
py_file: &Path,
py_args: Vec<String>,
) -> pipeline_error::Result<ExecutionContext> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
}
let engine = new_engine(cancel_token, py_file, py_args).await?;
let engine: ExecutionContext = Arc::new(engine);
Ok(engine)
}
#[derive(Clone)] #[derive(Clone)]
pub struct PythonServerStreamingEngine { pub struct PythonServerStreamingEngine {
_cancel_token: CancellationToken, _cancel_token: CancellationToken,
...@@ -128,17 +107,6 @@ async fn new_engine( ...@@ -128,17 +107,6 @@ async fn new_engine(
let user_module = let user_module =
python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?; python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?;
let generator = Python::with_gil(|py| { let generator = Python::with_gil(|py| {
/* Leave commented, `initialize` may be needed to match Triton
if let Ok(initialize) = user_module.getattr(py, "initialize") {
initialize
.call1(py, (py_args,))
.inspect_err(|err| {
println!();
err.display(py);
})
.with_context(|| "Failed calling python engine's initialize(args)")?;
};
*/
user_module user_module
.getattr(py, "generate") .getattr(py, "generate")
.with_context(|| "generate") .with_context(|| "generate")
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
pub struct SgLangEngine {
cancel_token: CancellationToken,
worker: super::worker::SgLangWorker,
}
impl SgLangEngine {
pub async fn new(
cancel_token: CancellationToken,
sock_code: &str,
model_path: &Path,
node_conf: MultiNodeConfig,
tensor_parallel_size: u32,
base_gpu_id: u32,
extra_engine_args: Option<PathBuf>,
) -> anyhow::Result<Self> {
let w = super::worker::start(
cancel_token.clone(),
sock_code,
model_path,
node_conf,
tensor_parallel_size,
base_gpu_id,
extra_engine_args,
)
.await?;
let engine = SgLangEngine {
cancel_token,
worker: w,
};
Ok(engine)
}
pub fn take_sglang_worker_handle(&mut self) -> tokio::task::JoinHandle<()> {
self.worker.take_sglang_handle()
}
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for SgLangEngine
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
let (request, context) = request.into_parts();
let ctx = context.context();
let request_id = ctx.id().to_string();
let (resp_tx, mut resp_rx) = tokio::sync::mpsc::channel(128);
let work_req = super::worker::WorkRequest {
request_id: context.id().to_string(),
request,
response_channel: resp_tx,
};
self.worker.enqueue_request(work_req).await?;
let cancel_token = self.cancel_token.clone();
let output = stream! {
loop {
tokio::select! {
_ = cancel_token.cancelled() => {
break;
}
maybe_resp_rx = resp_rx.recv() => {
match maybe_resp_rx {
Some(out) => {
yield out;
},
None => {
tracing::trace!(request_id, "generate: response channel closed");
break;
}
}
}
}
}
};
Ok(ResponseStream::new(Box::pin(output), ctx))
}
}