Commit 14ce7e03 authored by Graham King, committed by GitHub

refactor: Rename 'tio' to 'dynemo-run' (#18)

parent 2791b9ea
@@ -40,7 +40,7 @@ jobs:
  pre-merge-rust:
    runs-on: ubuntu-latest
    strategy:
-      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/tio', 'applications/llm/count', 'examples/rust'] }
+      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/dynemo-run', 'applications/llm/count', 'examples/rust'] }
    permissions:
      contents: read
    steps:
......
@@ -92,7 +92,7 @@ RUN cd examples/rust && \
cp target/release/http /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
-# TODO: Build tio
+# TODO: Build dynemo-run
# COPY applications/...
# Generate C bindings for kv cache routing in vLLM
......
@@ -1374,6 +1374,31 @@ dependencies = [
"reborrow",
]
+
+[[package]]
+name = "dynemo-run"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "clap",
+ "dialoguer",
+ "futures",
+ "futures-util",
+ "libc",
+ "netlink-packet-route",
+ "rtnetlink",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "triton-distributed-llm",
+ "triton-distributed-runtime",
+]
[[package]]
name = "ed25519"
version = "2.2.3"
@@ -5165,31 +5190,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
-
-[[package]]
-name = "tio"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-openai",
- "async-stream",
- "async-trait",
- "clap",
- "dialoguer",
- "futures",
- "futures-util",
- "libc",
- "netlink-packet-route",
- "rtnetlink",
- "serde",
- "serde_json",
- "tokio",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "triton-distributed-llm",
- "triton-distributed-runtime",
-]
[[package]]
name = "tokenizers"
version = "0.21.0"
......
@@ -14,11 +14,11 @@
# limitations under the License.
[package]
name = "tio"
name = "dynemo-run"
version = "0.1.0"
edition = "2021"
authors = ["NVIDIA"]
homepage = "https://github.com/triton-inference-server/triton_distributed"
homepage = "https://github.com/dynemo-ai/dynemo"
license = "Apache-2.0"
[features]
......
-# triton-llm service runner
+# Dynemo service runner
-`tio` is a tool for exploring the triton-distributed and triton-llm components.
+`dynemo-run` is a tool for exploring the dynemo components.
+## Quickstart
+- Install Rust
+- `cargo install --features mistralrs,cuda --git https://github.com/dynemo-ai/dynemo.git dynemo-run`
+- `dynemo-run <GGUF or HF-repo-checkout>`
## Install and start pre-requisites
@@ -31,11 +37,11 @@ For example one of these should be fast and good quality on almost any machine:
*Text interface*
-`./target/release/tio Llama-3.2-1B-Instruct-Q4_K_M.gguf` or a path to a Hugging Face repo checkout instead of the GGUF.
+`./target/release/dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf` or a path to a Hugging Face repo checkout instead of the GGUF.
*HTTP interface*
-`./target/release/tio in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
+`./target/release/dynemo-run in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
List the models: `curl localhost:8080/v1/models`
@@ -48,19 +54,19 @@ curl -d '{"model": "Llama-3.2-1B-Instruct-Q4_K_M", "max_tokens": 2049, "messages
Node 1:
```
-tio in=http out=tdr://llama3B_pool
+dynemo-run in=http out=dyn://llama3B_pool
```
Node 2:
```
-tio in=tdr://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
+dynemo-run in=dyn://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
```
This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint, and it will pick one at random each time.
The `ns/backend/mistralrs` parts are purely symbolic; pick anything, as long as it has three parts and matches the other node.
-Run `tio --help` for more options.
+Run `dynemo-run --help` for more options.
## sglang
@@ -86,26 +92,26 @@ Any example above using `out=sglang` will work, but our sglang backend is also m
Node 1:
```
-tio in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
```
Node 2:
```
-tio in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
```
## llama_cpp
- `cargo build --release --features llamacpp,cuda`
-- `tio out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
+- `dynemo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
The extra `--model-config` flag is because:
- llama_cpp only runs GGUF
- We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer (see the sketch after this list)
- We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
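To make the tokenizer point concrete, here is a minimal standalone sketch using the `tokenizers` crate (already in the dependency tree above). The model path is illustrative, and this is not dynemo-run's actual loading code:

```rust
// Minimal sketch, assuming a local HF repo checkout containing tokenizer.json.
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // This is the file a GGUF-only engine cannot provide on its own.
    let tokenizer = Tokenizer::from_file("Llama-3.2-3B-Instruct/tokenizer.json")?;
    // Encode a prompt into the token ids that get sent to the engine.
    let encoding = tokenizer.encode("Hello, world!", false)?;
    println!("{:?}", encoding.get_ids());
    Ok(())
}
```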
-If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `tio` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `tio` binary.
+If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynemo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynemo-run` binary.
## vllm
@@ -130,25 +136,25 @@ cargo build --release --features vllm
Run (still inside that virtualenv) - HF repo:
```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
```
Run (still inside that virtualenv) - GGUF:
```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
```
+ Multi-node:
Node 1:
```
-tio in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
+dynemo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
```
Node 2:
```
-tio in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
+dynemo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
```
@@ -163,7 +169,7 @@ cargo build --release --features trtllm
Run:
```
-tio in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+dynemo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
```
Note that TRT-LLM uses its own `.engine` format for weights. Repo models must be converted like so:
@@ -181,7 +187,7 @@ python convert_checkpoint.py --model_dir /tmp/model/ --output_dir ./converted --
trtllm-build --checkpoint_dir ./converted --output_dir ./final/trtllm_engine --use_paged_context_fmha enable --gemm_plugin auto
```
-The `--model-path` you give to `tio` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
+The `--model-path` you give to `dynemo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
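That requirement is easy to sanity-check before launching. A hypothetical helper (not part of dynemo-run; the path is illustrative) might look like:

```rust
// Sketch: verify a TRT-LLM engine directory has the files described above
// (TRT-LLM's config.json plus at least rank0.engine).
use std::path::Path;

fn looks_like_trtllm_engine(dir: &Path) -> bool {
    dir.join("config.json").is_file() && dir.join("rank0.engine").is_file()
}

fn main() {
    let dir = Path::new("/app/trtllm_engine");
    println!("usable engine dir: {}", looks_like_trtllm_engine(dir));
}
```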
+ Execute
TRT-LLM is a C++ library that must have been previously built and installed. It needs a lot of memory to compile. GitLab builds a container you can try:
......
@@ -41,7 +41,7 @@ pub use opt::{Input, Output};
/// How we identify a namespace/component/endpoint URL.
/// Technically the '://' is not part of the scheme but it eliminates several string
/// concatenations.
const ENDPOINT_SCHEME: &str = "tdr://";
const ENDPOINT_SCHEME: &str = "dyn://";
pub enum EngineConfig {
/// A remote networked engine we don't know about yet
......
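For illustration, here is a standalone sketch (not the crate's actual parser) of how a `dyn://namespace/component/endpoint` address could be split into the three parts the README mentions. `EndpointId` and `parse_endpoint` are hypothetical names:

```rust
const ENDPOINT_SCHEME: &str = "dyn://";

#[derive(Debug)]
struct EndpointId {
    namespace: String,
    component: String,
    endpoint: String,
}

fn parse_endpoint(addr: &str) -> Option<EndpointId> {
    // Keeping "://" inside the scheme constant avoids a string concatenation
    // at every call site, as the doc comment above notes.
    let path = addr.strip_prefix(ENDPOINT_SCHEME)?;
    match path.split('/').collect::<Vec<_>>().as_slice() {
        [ns, comp, ep] if !ns.is_empty() && !comp.is_empty() && !ep.is_empty() => {
            Some(EndpointId {
                namespace: ns.to_string(),
                component: comp.to_string(),
                endpoint: ep.to_string(),
            })
        }
        _ => None,
    }
}

fn main() {
    let id = parse_endpoint("dyn://ns/backend/mistralrs").expect("three parts required");
    println!("{id:?}");
}
```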
@@ -17,17 +17,17 @@ use std::env;
use clap::Parser;
-use tio::{Input, Output};
+use dynemo_run::{Input, Output};
use triton_distributed_runtime::logging;
const HELP: &str = r#"
-tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine) that run the services. It is the simplest way to use triton-distributed locally.
+dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine) that run the services. It is the simplest way to use dynemo locally.
Example:
- cargo build --release --features mistralrs,cuda
- cd target/release
-- ./tio hf_checkouts/Llama-3.2-3B-Instruct/
-- OR: ./tio Llama-3.2-1B-Instruct-Q4_K_M.gguf
+- ./dynemo-run hf_checkouts/Llama-3.2-3B-Instruct/
+- OR: ./dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#;
@@ -39,16 +39,16 @@ const DEFAULT_OUT: Output = Output::MistralRs;
#[cfg(not(feature = "mistralrs"))]
const DEFAULT_OUT: Output = Output::EchoFull;
const ZMQ_SOCKET_PREFIX: &str = "tio";
const ZMQ_SOCKET_PREFIX: &str = "dyn";
const USAGE: &str = "USAGE: tio in=[http|text|tdr://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
const USAGE: &str = "USAGE: dynemo-run in=[http|text|dyn://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
fn main() -> anyhow::Result<()> {
logging::init();
// Call sub-processes before starting the Runtime machinery
// For anything except sub-process starting try_parse_from will error.
-if let Ok(flags) = tio::Flags::try_parse_from(env::args()) {
+if let Ok(flags) = dynemo_run::Flags::try_parse_from(env::args()) {
#[allow(unused_variables)]
if let Some(sglang_flags) = flags.internal_sglang_process {
let Some(model_path) = flags.model_path_flag.as_ref() else {
@@ -124,10 +124,10 @@ fn main() -> anyhow::Result<()> {
// One per process. Wraps a Runtime which holds two tokio runtimes.
let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
-worker.execute(tio_wrapper)
+worker.execute(wrapper)
}
-async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
+async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
let mut in_opt = None;
let mut out_opt = None;
let args: Vec<String> = env::args().skip(1).collect();
@@ -171,13 +171,13 @@ async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Re
// Clap skips the first argument expecting it to be the binary name, so add it back
// Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
-let flags = tio::Flags::try_parse_from(
-["tio".to_string()]
+let flags = dynemo_run::Flags::try_parse_from(
+["dynemo-run".to_string()]
.into_iter()
.chain(env::args().skip(non_flag_params)),
)?;
-tio::run(
+dynemo_run::run(
runtime,
in_opt,
out_opt,
......
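For context on the `try_parse_from` pattern in the hunk above, here is a minimal self-contained sketch with a simplified, hypothetical `Flags` struct (the real one lives in the crate's lib.rs):

```rust
// Sketch of re-adding the binary name before parsing: clap treats the first
// item of the iterator as argv[0], so a dummy name is prepended.
use clap::Parser;

#[derive(Parser, Debug)]
struct Flags {
    #[arg(long)]
    model_path: Option<String>,
}

fn main() -> anyhow::Result<()> {
    // Skip argv[0] plus the two already-handled in=/out= positionals.
    let non_flag_params = 3;
    let flags = Flags::try_parse_from(
        ["dynemo-run".to_string()]
            .into_iter()
            .chain(std::env::args().skip(non_flag_params)),
    )?;
    println!("{flags:?}");
    Ok(())
}
```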