refactor: Rename 'tio' to 'dynemo-run' (#18)

14ce7e03 · Graham King · GitHub · 2791b9ea · 14ce7e03 · 14ce7e03
Commit 14ce7e03 authored Mar 04, 2025 by Graham King Committed by GitHub Mar 04, 2025
18 changed files
--- a/.github/workflows/pre-merge-rust.yml
+++ b/.github/workflows/pre-merge-rust.yml
@@ -40,7 +40,7 @@ jobs:
  pre-merge-rust:
    runs-on: ubuntu-latest
    strategy:
-      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/tio', 'applications/llm/count', 'examples/rust'] }
+      matrix: { dir: ['lib/runtime', 'lib/llm', 'lib/bindings/c', 'lib/bindings/python', 'launch/dynemo-run', 'applications/llm/count', 'examples/rust'] }
    permissions:
      contents: read
    steps:

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -92,7 +92,7 @@ RUN cd examples/rust && \
    cp target/release/http /usr/local/bin/ && \
    cp target/release/llmctl /usr/local/bin/

-# TODO: Build tio
+# TODO: Build dynemo-run
 # COPY applications/...

 # Generate C bindings for kv cache routing in vLLM

--- a/launch/tio/Cargo.lock
+++ b/launch/tio/Cargo.lock
@@ -1374,6 +1374,31 @@ dependencies = [
 "reborrow",
 ]

+[[package]]
+name = "dynemo-run"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "clap",
+ "dialoguer",
+ "futures",
+ "futures-util",
+ "libc",
+ "netlink-packet-route",
+ "rtnetlink",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "triton-distributed-llm",
+ "triton-distributed-runtime",
+]
+
 [[package]]
 name = "ed25519"
 version = "2.2.3"
@@ -5165,31 +5190,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

-[[package]]
-name = "tio"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-openai",
- "async-stream",
- "async-trait",
- "clap",
- "dialoguer",
- "futures",
- "futures-util",
- "libc",
- "netlink-packet-route",
- "rtnetlink",
- "serde",
- "serde_json",
- "tokio",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "triton-distributed-llm",
- "triton-distributed-runtime",
-]
-
 [[package]]
 name = "tokenizers"
 version = "0.21.0"

--- a/launch/tio/Cargo.toml
+++ b/launch/tio/Cargo.toml
@@ -14,11 +14,11 @@
 # limitations under the License.

 [package]
-name = "tio"
+name = "dynemo-run"
 version = "0.1.0"
 edition = "2021"
 authors = ["NVIDIA"]
-homepage = "https://github.com/triton-inference-server/triton_distributed"
+homepage = "https://github.com/dynemo-ai/dynemo"
 license = "Apache-2.0"

 [features]

--- a/launch/tio/README.md
+++ b/launch/tio/README.md
-# triton-llm service runner
+# Dynemo service runner

-`tio` is a tool for exploring the triton-distributed and triton-llm components.
+`dynemo-run` is a tool for exploring the dynemo components.
+
+## Quickstart
+
+- Install Rust
+- `cargo install --features mistralrs,cuda --git https://github.com/dynemo-ai/dynemo.git dynemo-run`
+- `dynemo-run <GGUF or HF-repo-checkout>`

 ## Install and start pre-requisites

@@ -31,11 +37,11 @@ For example one of these should be fast and good quality on almost any machine:

 *Text interface*

-`./target/release/tio Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.
+`./target/release/dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.

 *HTTP interface*

-`./target/release/tio in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
+`./target/release/dynemo-run in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`

 List the models: `curl localhost:8080/v1/models`

@@ -48,19 +54,19 @@ curl -d '{"model": "Llama-3.2-1B-Instruct-Q4_K_M", "max_tokens": 2049, "messages

 Node 1:
 ```
-tio in=http out=tdr://llama3B_pool
+dynemo-run in=http out=dyn://llama3B_pool
 ```

 Node 2:
 ```
-tio in=tdr://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
+dynemo-run in=dyn://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
 ```

 This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time.

 The `ns/backend/mistralrs` are purely symbolic, pick anything as long as it has three parts, and it matches the other node.

-Run `tio --help` for more options.
+Run `dynemo-run --help` for more options.

 ## sglang

@@ -86,26 +92,26 @@ Any example above using `out=sglang` will work, but our sglang backend is also m

 Node 1:
 ```
-tio in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
 ```

 Node 2:
 ```
-tio in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
+dynemo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
 ```

 ## llama_cpp

 - `cargo build --release --features llamacpp,cuda`

- `tio out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
+- `dynemo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`

 The extra `--model-config` flag is because:
 - llama_cpp only runs GGUF
 - We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer
 - We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al

-If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `tio` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `tio` binary.
+If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynemo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynemo-run` binary.

 ## vllm

@@ -130,25 +136,25 @@ cargo build --release --features vllm

 Run (still inside that virtualenv) - HF repo:
 ```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/

 ```

 Run (still inside that virtualenv) - GGUF:
 ```
-./target/release/tio in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
 ```

 + Multi-node:

 Node 1:
 ```
-tio in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
+dynemo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
 ```

 Node 2:
 ```
-tio in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
+dynemo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
 ```


@@ -163,7 +169,7 @@ cargo build --release --features trtllm

 Run:
 ```
-tio in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+dynemo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
 ```

 Note that TRT-LLM uses its own `.engine` format for weights. Repo models must be converted like so:
@@ -181,7 +187,7 @@ python convert_checkpoint.py --model_dir /tmp/model/ --output_dir ./converted --
 trtllm-build --checkpoint_dir ./converted --output_dir ./final/trtllm_engine --use_paged_context_fmha enable --gemm_plugin auto
 ```

-The `--model-path` you give to `tio` must contain the `config.json` (TRT-LLM's , not the model's) and `rank0.engine` (plus other ranks if relevant).
+The `--model-path` you give to `dynemo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).

 + Execute
 TRT-LLM is a C++ library that must have been previously built and installed. It needs a lot of memory to compile. Gitlab builds a container you can try:

--- a/launch/tio/rust-toolchain.toml
+++ b/launch/tio/rust-toolchain.toml
--- a/launch/tio/src/flags.rs
+++ b/launch/tio/src/flags.rs
--- a/launch/tio/src/input.rs
+++ b/launch/tio/src/input.rs
--- a/launch/tio/src/input/endpoint.rs
+++ b/launch/tio/src/input/endpoint.rs
--- a/launch/tio/src/input/http.rs
+++ b/launch/tio/src/input/http.rs
--- a/launch/tio/src/input/text.rs
+++ b/launch/tio/src/input/text.rs
--- a/launch/tio/src/lib.rs
+++ b/launch/tio/src/lib.rs
@@ -41,7 +41,7 @@ pub use opt::{Input, Output};
 /// How we identify a namespace/component/endpoint URL.
 /// Technically the '://' is not part of the scheme but it eliminates several string
 /// concatenations.
-const ENDPOINT_SCHEME: &str = "tdr://";
+const ENDPOINT_SCHEME: &str = "dyn://";

 pub enum EngineConfig {
    /// A remote networked engine we don't know about yet

--- a/launch/tio/src/main.rs
+++ b/launch/tio/src/main.rs
@@ -17,17 +17,17 @@ use std::env;

 use clap::Parser;

-use tio::{Input, Output};
+use dynemo_run::{Input, Output};
 use triton_distributed_runtime::logging;

 const HELP: &str = r#"
-tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally.
+dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally.

 Example:
 - cargo build --release --features mistralrs,cuda
 - cd target/release
- ./tio hf_checkouts/Llama-3.2-3B-Instruct/
- OR: ./tio Llama-3.2-1B-Instruct-Q4_K_M.gguf
+- ./dynemo-run hf_checkouts/Llama-3.2-3B-Instruct/
+- OR: ./dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf

 "#;

@@ -39,16 +39,16 @@ const DEFAULT_OUT: Output = Output::MistralRs;
 #[cfg(not(feature = "mistralrs"))]
 const DEFAULT_OUT: Output = Output::EchoFull;

-const ZMQ_SOCKET_PREFIX: &str = "tio";
+const ZMQ_SOCKET_PREFIX: &str = "dyn";

-const USAGE: &str = "USAGE: tio in=[http|text|tdr://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
+const USAGE: &str = "USAGE: dynemo-run in=[http|text|dyn://<path>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";

 fn main() -> anyhow::Result<()> {
    logging::init();

    // Call sub-processes before starting the Runtime machinery
    // For anything except sub-process starting try_parse_from will error.
-    if let Ok(flags) = tio::Flags::try_parse_from(env::args()) {
+    if let Ok(flags) = dynemo_run::Flags::try_parse_from(env::args()) {
        #[allow(unused_variables)]
        if let Some(sglang_flags) = flags.internal_sglang_process {
            let Some(model_path) = flags.model_path_flag.as_ref() else {
@@ -124,10 +124,10 @@ fn main() -> anyhow::Result<()> {
    // One per process. Wraps a Runtime which holds two tokio runtimes.
    let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;

-    worker.execute(tio_wrapper)
+    worker.execute(wrapper)
 }

-async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
+async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
    let mut in_opt = None;
    let mut out_opt = None;
    let args: Vec<String> = env::args().skip(1).collect();
@@ -171,13 +171,13 @@ async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Re

    // Clap skips the first argument expecting it to be the binary name, so add it back
    // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
-    let flags = tio::Flags::try_parse_from(
-        ["tio".to_string()]
+    let flags = dynemo_run::Flags::try_parse_from(
+        ["dynemo-run".to_string()]
            .into_iter()
            .chain(env::args().skip(non_flag_params)),
    )?;

-    tio::run(
+    dynemo_run::run(
        runtime,
        in_opt,
        out_opt,

--- a/launch/tio/src/net.rs
+++ b/launch/tio/src/net.rs
--- a/launch/tio/src/opt.rs
+++ b/launch/tio/src/opt.rs
--- a/launch/tio/src/output.rs
+++ b/launch/tio/src/output.rs
--- a/launch/tio/src/output/echo_core.rs
+++ b/launch/tio/src/output/echo_core.rs
--- a/launch/tio/src/output/echo_full.rs
+++ b/launch/tio/src/output/echo_full.rs