Commit 14ce7e03 authored by Graham King, committed by GitHub

refactor: Rename 'tio' to 'dynemo-run' (#18)

parent 2791b9ea
@@ -40,7 +40,7 @@ jobs:
   pre-merge-rust:
     runs-on: ubuntu-latest
     strategy:
-      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/tio', 'applications/llm/count', 'examples/rust'] }
+      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/dynemo-run', 'applications/llm/count', 'examples/rust'] }
     permissions:
       contents: read
     steps:
...
@@ -92,7 +92,7 @@ RUN cd examples/rust && \
     cp target/release/http /usr/local/bin/ && \
     cp target/release/llmctl /usr/local/bin/
 
-# TODO: Build tio
+# TODO: Build dynemo-run
 # COPY applications/...
 
 # Generate C bindings for kv cache routing in vLLM
...
@@ -1374,6 +1374,31 @@ dependencies = [
  "reborrow",
 ]
 
+[[package]]
+name = "dynemo-run"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "clap",
+ "dialoguer",
+ "futures",
+ "futures-util",
+ "libc",
+ "netlink-packet-route",
+ "rtnetlink",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "triton-distributed-llm",
+ "triton-distributed-runtime",
+]
+
 [[package]]
 name = "ed25519"
 version = "2.2.3"
...
@@ -5165,31 +5190,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
-[[package]]
-name = "tio"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-openai",
- "async-stream",
- "async-trait",
- "clap",
- "dialoguer",
- "futures",
- "futures-util",
- "libc",
- "netlink-packet-route",
- "rtnetlink",
- "serde",
- "serde_json",
- "tokio",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "triton-distributed-llm",
- "triton-distributed-runtime",
-]
-
 [[package]]
 name = "tokenizers"
 version = "0.21.0"
...
@@ -14,11 +14,11 @@
 # limitations under the License.
 
 [package]
-name = "tio"
+name = "dynemo-run"
 version = "0.1.0"
 edition = "2021"
 authors = ["NVIDIA"]
-homepage = "https://github.com/triton-inference-server/triton_distributed"
+homepage = "https://github.com/dynemo-ai/dynemo"
 license = "Apache-2.0"
 
 [features]
...
-# triton-llm service runner
+# Dynemo service runner
 
-`tio` is a tool for exploring the triton-distributed and triton-llm components.
+`dynemo-run` is a tool for exploring the dynemo components.
+
+## Quickstart
+- Install Rust
+- `cargo install --features mistralrs,cuda --git https://github.com/dynemo-ai/dynemo.git dynemo-run`
+- `dynemo-run <GGUF or HF-repo-checkout>`
 
 ## Install and start pre-requisites
 
@@ -31,11 +37,11 @@ For example one of these should be fast and good quality on almost any machine:
 
 *Text interface*
 
-`./target/release/tio Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.
+`./target/release/dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.
 
 *HTTP interface*
 
-`./target/release/tio in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
+`./target/release/dynemo-run in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
 
 List the models: `curl localhost:8080/v1/models`
@@ -48,19 +54,19 @@ curl -d '{"model": "Llama-3.2-1B-Instruct-Q4_K_M", "max_tokens": 2049, "messages
 
 Node 1:
 ```
-tio in=http out=tdr://llama3B_pool
+dynemo-run in=http out=dyn://llama3B_pool
 ```
 
 Node 2:
 ```
-tio in=tdr://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
+dynemo-run in=dyn://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
 ```
 
 This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint, and one will be picked at random for each request.
 
 The `ns/backend/mistralrs` parts are purely symbolic; pick anything, as long as the path has three parts and matches the other node.
 
-Run `tio --help` for more options.
+Run `dynemo-run --help` for more options.
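
As an editorial aside, the "three parts" rule above is easy to make concrete. A minimal sketch of validating an endpoint path; the function name and checks are illustrative assumptions, not code from this commit:

```rust
// Hypothetical validation of the three-segment namespace/component/endpoint
// convention described above (e.g. "ns/backend/mistralrs").
fn is_valid_endpoint_path(path: &str) -> bool {
    let parts: Vec<&str> = path.split('/').collect();
    parts.len() == 3 && parts.iter().all(|p| !p.is_empty())
}

fn main() {
    assert!(is_valid_endpoint_path("ns/backend/mistralrs"));
    assert!(!is_valid_endpoint_path("mistralrs")); // needs all three parts
}
```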
 
 ## sglang
...
@@ -86,26 +92,26 @@ Any example above using `out=sglang` will work, but our sglang backend is also m
 
 Node 1:
 ```
-tio in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
 ```
 
 Node 2:
 ```
-tio in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
 ```
 
 ## llama_cpp
 
 - `cargo build --release --features llamacpp,cuda`
-- `tio out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
+- `dynemo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
 
 The extra `--model-config` flag is needed because:
 - llama_cpp only runs GGUF
 - We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer
 - We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
 
 If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynemo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynemo-run` binary.
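
The tokenizer point above can be made concrete. A minimal sketch using the `tokenizers` crate (which is in this repo's Cargo.lock); the file path and the encode call are illustrative assumptions, not dynemo-run's actual loading code:

```rust
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `--model-config ~/llm_models/Llama-3.2-3B-Instruct/` exists to supply this file:
    let tokenizer = Tokenizer::from_file("Llama-3.2-3B-Instruct/tokenizer.json")?;

    // The engine is sent token IDs rather than raw text, hence the tokenizer:
    let encoding = tokenizer.encode("Hello, world!", false)?;
    println!("token ids: {:?}", encoding.get_ids());
    Ok(())
}
```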
 
 ## vllm
...
@@ -130,25 +136,25 @@ cargo build --release --features vllm
 
 Run (still inside that virtualenv) - HF repo:
 ```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
 ```
 
 Run (still inside that virtualenv) - GGUF:
 ```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
 ```
 
 + Multi-node:
 
 Node 1:
 ```
-tio in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
+dynemo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
 ```
 
 Node 2:
 ```
-tio in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
+dynemo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
 ```
@@ -163,7 +169,7 @@ cargo build --release --features trtllm
 
 Run:
 ```
-tio in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+dynemo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
 ```
 
 Note that TRT-LLM uses its own `.engine` format for weights. Repo models must be converted like so:
 
@@ -181,7 +187,7 @@ python convert_checkpoint.py --model_dir /tmp/model/ --output_dir ./converted --
 trtllm-build --checkpoint_dir ./converted --output_dir ./final/trtllm_engine --use_paged_context_fmha enable --gemm_plugin auto
 ```
 
-The `--model-path` you give to `tio` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
+The `--model-path` you give to `dynemo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
 
 + Execute
 
 TRT-LLM is a C++ library that must have been previously built and installed. It needs a lot of memory to compile. GitLab builds a container you can try:
...
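
A hypothetical pre-flight check for the engine-directory layout described above; `looks_like_trtllm_engine` is an illustrative helper, not part of dynemo-run:

```rust
use std::path::Path;

// Per the paragraph above, the --model-path directory needs TRT-LLM's
// config.json plus rank0.engine (and further rank files for multi-rank builds).
fn looks_like_trtllm_engine(dir: &Path) -> bool {
    dir.join("config.json").is_file() && dir.join("rank0.engine").is_file()
}

fn main() {
    let dir = Path::new("/app/trtllm_engine");
    if !looks_like_trtllm_engine(dir) {
        eprintln!("{}: missing config.json or rank0.engine", dir.display());
    }
}
```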
@@ -41,7 +41,7 @@ pub use opt::{Input, Output};
 
 /// How we identify a namespace/component/endpoint URL.
 /// Technically the '://' is not part of the scheme but it eliminates several string
 /// concatenations.
-const ENDPOINT_SCHEME: &str = "tdr://";
+const ENDPOINT_SCHEME: &str = "dyn://";
 
 pub enum EngineConfig {
     /// A remote networked engine we don't know about yet
...
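
The doc comment's point about keeping `://` inside the constant can be illustrated with a short sketch (assumed usage, not taken from the crate): a single prefix then serves both to test and to build endpoint URLs.

```rust
const ENDPOINT_SCHEME: &str = "dyn://";

// Is this `in=`/`out=` value a network endpoint rather than a local engine?
fn is_endpoint(value: &str) -> bool {
    value.starts_with(ENDPOINT_SCHEME)
}

// Build the full URL from a bare path in one formatting step;
// no separate "://" concatenation is ever needed.
fn to_endpoint_url(path: &str) -> String {
    format!("{ENDPOINT_SCHEME}{path}")
}

fn main() {
    assert!(is_endpoint("dyn://llama3B_pool"));
    assert_eq!(to_endpoint_url("llama3B_pool"), "dyn://llama3B_pool");
}
```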
@@ -17,17 +17,17 @@ use std::env;
 
 use clap::Parser;
-use tio::{Input, Output};
+use dynemo_run::{Input, Output};
 use triton_distributed_runtime::logging;
 
 const HELP: &str = r#"
-tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally.
+dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), and runs the services. It is the simplest way to use dynemo locally.
 
 Example:
 - cargo build --release --features mistralrs,cuda
 - cd target/release
-- ./tio hf_checkouts/Llama-3.2-3B-Instruct/
-- OR: ./tio Llama-3.2-1B-Instruct-Q4_K_M.gguf
+- ./dynemo-run hf_checkouts/Llama-3.2-3B-Instruct/
+- OR: ./dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf
 "#;
 
@@ -39,16 +39,16 @@ const DEFAULT_OUT: Output = Output::MistralRs;
 #[cfg(not(feature = "mistralrs"))]
 const DEFAULT_OUT: Output = Output::EchoFull;
 
-const ZMQ_SOCKET_PREFIX: &str = "tio";
+const ZMQ_SOCKET_PREFIX: &str = "dyn";
 
-const USAGE: &str = "USAGE: tio in=[http|text|tdr://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
+const USAGE: &str = "USAGE: dynemo-run in=[http|text|dyn://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
 
 fn main() -> anyhow::Result<()> {
     logging::init();
 
     // Call sub-processes before starting the Runtime machinery.
     // For anything except sub-process starting, try_parse_from will error.
-    if let Ok(flags) = tio::Flags::try_parse_from(env::args()) {
+    if let Ok(flags) = dynemo_run::Flags::try_parse_from(env::args()) {
         #[allow(unused_variables)]
         if let Some(sglang_flags) = flags.internal_sglang_process {
             let Some(model_path) = flags.model_path_flag.as_ref() else {
...
@@ -124,10 +124,10 @@ fn main() -> anyhow::Result<()> {
     // One per process. Wraps a Runtime which holds two tokio runtimes.
     let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
-    worker.execute(tio_wrapper)
+    worker.execute(wrapper)
 }
 
-async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
+async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
     let mut in_opt = None;
     let mut out_opt = None;
     let args: Vec<String> = env::args().skip(1).collect();
...
@@ -171,13 +171,13 @@ async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
 
     // Clap skips the first argument, expecting it to be the binary name, so add it back.
     // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
-    let flags = tio::Flags::try_parse_from(
-        ["tio".to_string()]
+    let flags = dynemo_run::Flags::try_parse_from(
+        ["dynemo-run".to_string()]
             .into_iter()
             .chain(env::args().skip(non_flag_params)),
     )?;
 
-    tio::run(
+    dynemo_run::run(
         runtime,
         in_opt,
         out_opt,
...
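
The argv[0] trick in the last hunk is standard clap behaviour and easy to demonstrate standalone. A self-contained sketch; `Flags` here is a stand-in for `dynemo_run::Flags`:

```rust
use clap::Parser;

#[derive(Parser, Debug)]
struct Flags {
    #[arg(long, default_value_t = 8080)]
    http_port: u16,
}

fn main() -> anyhow::Result<()> {
    // Suppose the real argv was: dynemo-run in=http --http-port 9090
    // and the positional `in=...` tokens have already been consumed.
    let remaining = ["--http-port".to_string(), "9090".to_string()];

    // try_parse_from treats the first item as the binary name (argv[0]),
    // so prepend a synthetic one before handing over the remaining args.
    let flags = Flags::try_parse_from(["dynemo-run".to_string()].into_iter().chain(remaining))?;
    assert_eq!(flags.http_port, 9090);
    Ok(())
}
```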