Commit 602352ce authored by Neelay Shah, committed by GitHub

chore: rename dynamo (#44)


Co-authored-by: Biswa Panda <biswa.panda@gmail.com>
parent ecf53ce2
@@ -33,8 +33,8 @@ repository = "https://github.com/dynemo-ai/dynemo.git"
[workspace.dependencies]
# local or crates.io
-dynemo-runtime = { path = "../../lib/runtime" }
-dynemo-llm = { path = "../../lib/llm" }
+dynamo-runtime = { path = "../../lib/runtime" }
+dynamo-llm = { path = "../../lib/llm" }
# crates.io
anyhow = { version = "1" }
@@ -22,6 +22,6 @@ license.workspace = true
homepage.workspace = true
[dependencies]
-dynemo-runtime = { workspace = true }
+dynamo-runtime = { workspace = true }
# third-party
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use dynemo_runtime::{
+use dynamo_runtime::{
logging, protocols::annotated::Annotated, stream::StreamExt, DistributedRuntime, Result,
Runtime, Worker,
};
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use dynemo_runtime::{
+use dynamo_runtime::{
logging,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
@@ -13,4 +13,4 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-pub const DEFAULT_NAMESPACE: &str = "dynemo";
+pub const DEFAULT_NAMESPACE: &str = "dynamo";
@@ -24,8 +24,8 @@ homepage.workspace = true
repository.workspace = true
[dependencies]
-dynemo-runtime = { workspace = true}
-dynemo-llm = { workspace = true}
+dynamo-runtime = { workspace = true}
+dynamo-llm = { workspace = true}
clap = { version = "4.5", features = ["derive"] }
serde = { workspace = true }
@@ -16,14 +16,14 @@
use clap::Parser;
use std::sync::Arc;
-use dynemo_llm::{
+use dynamo_llm::{
http::service::{
discovery::{model_watcher, ModelWatchState},
service_v2::HttpService,
},
model_type::ModelType,
};
-use dynemo_runtime::{
+use dynamo_runtime::{
logging, transports::etcd::PrefixWatcher, DistributedRuntime, Result, Runtime, Worker,
};
@@ -23,8 +23,8 @@ homepage.workspace = true
repository.workspace = true
[dependencies]
-dynemo-runtime = { workspace = true}
-dynemo-llm = { workspace = true}
+dynamo-runtime = { workspace = true}
+dynamo-llm = { workspace = true}
serde = { workspace = true }
serde_json = { workspace = true }
@@ -16,8 +16,8 @@
use clap::{Parser, Subcommand};
use tracing as log;
-use dynemo_llm::{http::service::discovery::ModelEntry, model_type::ModelType};
-use dynemo_runtime::{
+use dynamo_llm::{http::service::discovery::ModelEntry, model_type::ModelType};
+use dynamo_runtime::{
distributed::DistributedConfig, logging, protocols::Endpoint, raise, DistributedRuntime,
Result, Runtime, Worker,
};
@@ -23,7 +23,7 @@ homepage.workspace = true
repository.workspace = true
[dependencies]
-dynemo-runtime = { workspace = true }
+dynamo-runtime = { workspace = true }
# third-party
futures = { workspace = true }
@@ -16,7 +16,7 @@
use futures::StreamExt;
use service_metrics::DEFAULT_NAMESPACE;
-use dynemo_runtime::{
+use dynamo_runtime::{
logging, protocols::annotated::Annotated, utils::Duration, DistributedRuntime, Result, Runtime,
Worker,
};
@@ -15,7 +15,7 @@
use service_metrics::{MyStats, DEFAULT_NAMESPACE};
-use dynemo_runtime::{
+use dynamo_runtime::{
logging,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
@@ -15,7 +15,7 @@
use serde::{Deserialize, Serialize};
-pub const DEFAULT_NAMESPACE: &str = "dynemo";
+pub const DEFAULT_NAMESPACE: &str = "dynamo";
#[derive(Serialize, Deserialize)]
// Dummy Stats object to demonstrate how to attach a custom stats handler
@@ -1396,7 +1396,7 @@ dependencies = [
]
[[package]]
name = "dynemo-llm"
name = "dynamo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
@@ -1412,7 +1412,7 @@ dependencies = [
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"dynamo-runtime",
"either",
"erased-serde",
"futures",
@@ -1450,7 +1450,7 @@ dependencies = [
]
[[package]]
name = "dynemo-run"
name = "dynamo-run"
version = "0.1.0"
dependencies = [
"anyhow",
@@ -1459,8 +1459,8 @@ dependencies = [
"async-trait",
"clap",
"dialoguer",
"dynemo-llm",
"dynemo-runtime",
"dynamo-llm",
"dynamo-runtime",
"futures",
"futures-util",
"libc",
@@ -1475,7 +1475,7 @@ dependencies = [
]
[[package]]
name = "dynemo-runtime"
name = "dynamo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
@@ -14,7 +14,7 @@
# limitations under the License.
[package]
name = "dynemo-run"
name = "dynamo-run"
version = "0.1.0"
edition = "2021"
authors = ["NVIDIA"]
@@ -22,14 +22,14 @@ homepage = "https://github.com/dynemo-ai/dynemo"
license = "Apache-2.0"
[features]
mistralrs = ["dynemo-llm/mistralrs"]
sglang = ["dynemo-llm/sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
vllm = ["dynemo-llm/vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
llamacpp = ["dynemo-llm/llamacpp"]
trtllm = ["dynemo-llm/trtllm"]
python = ["dynemo-llm/python"]
cuda = ["dynemo-llm/cuda"]
metal = ["dynemo-llm/metal"]
mistralrs = ["dynamo-llm/mistralrs"]
sglang = ["dynamo-llm/sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
vllm = ["dynamo-llm/vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
llamacpp = ["dynamo-llm/llamacpp"]
trtllm = ["dynamo-llm/trtllm"]
python = ["dynamo-llm/python"]
cuda = ["dynamo-llm/cuda"]
metal = ["dynamo-llm/metal"]
[dependencies]
anyhow = "1"
@@ -49,5 +49,5 @@ tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
-dynemo-runtime = { path = "../../lib/runtime" }
-dynemo-llm = { path = "../../lib/llm" }
+dynamo-runtime = { path = "../../lib/runtime" }
+dynamo-llm = { path = "../../lib/llm" }
-# Dynemo service runner
+# Dynamo service runner
-`dynemo-run` is a tool for exploring the dynemo components.
+`dynamo-run` is a tool for exploring the dynamo components.
## Setup
@@ -36,11 +36,11 @@ For example one of these should be fast and good quality on almost any machine:
*Text interface*
-`./target/release/dynemo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.
+`./target/release/dynamo-run Llama-3.2-1B-Instruct-Q4_K_M.gguf` or path to a Hugging Face repo checkout instead of the GGUF.
*HTTP interface*
-`./target/release/dynemo-run in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
+`./target/release/dynamo-run in=http --model-path Llama-3.2-1B-Instruct-Q4_K_M.gguf`
List the models: `curl localhost:8080/v1/models`
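Since this HTTP interface is OpenAI-compatible, any standard client can call it. A minimal sketch (editor illustration, not part of this diff) using only the Python standard library; it assumes the server above is listening on the default `localhost:8080`, that the usual OpenAI `/v1/chat/completions` path applies (as in the curl example below), and a non-streaming response:

```
# Sketch: call the OpenAI-compatible HTTP endpoint served by `in=http`.
import json
import urllib.request

payload = {
    "model": "Llama-3.2-1B-Instruct-Q4_K_M",
    "max_tokens": 128,
    "messages": [{"role": "user", "content": "Say hello."}],
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",  # assumed OpenAI-style path
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```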
@@ -53,19 +53,19 @@ curl -d '{"model": "Llama-3.2-1B-Instruct-Q4_K_M", "max_tokens": 2049, "messages
Node 1:
```
-dynemo-run in=http out=dyn://llama3B_pool
+dynamo-run in=http out=dyn://llama3B_pool
```
Node 2:
```
-dynemo-run in=dyn://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
+dynamo-run in=dyn://llama3B_pool out=mistralrs ~/llm_models/Llama-3.2-3B-Instruct
```
This will use etcd to auto-discover the model and NATS to talk to it. You can run multiple workers on the same endpoint and it will pick one at random each time.
The `ns/backend/mistralrs` are purely symbolic, pick anything as long as it has three parts, and it matches the other node.
-Run `dynemo-run --help` for more options.
+Run `dynamo-run --help` for more options.
## sglang
@@ -91,26 +91,26 @@ Any example above using `out=sglang` will work, but our sglang backend is also m
Node 1:
```
-dynemo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
+dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
```
Node 2:
```
-dynemo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
+dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
```
## llama_cpp
- `cargo build --release --features llamacpp,cuda`
-- `dynemo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
+- `dynamo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
The extra `--model-config` flag is because:
- llama_cpp only runs GGUF
- We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer (see the sketch after this list)
- We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
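As an aside (editor illustration, not part of this diff), "doing the tokenization ourselves" is what the repo's `tokenizer.json` enables. A minimal sketch using the Hugging Face `tokenizers` package; the model path is a placeholder matching the example above:

```
# Sketch: load the tokenizer.json that --model-config points at and turn a
# prompt into the token ids that get sent to llama_cpp.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("Llama-3.2-3B-Instruct/tokenizer.json")  # placeholder path
ids = tok.encode("Hello, world").ids
print(ids)              # token ids handed to the engine
print(tok.decode(ids))  # and decoded back to text
```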
-If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynemo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynemo-run` binary.
+If the build step also builds llama_cpp libraries into `target/release` ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
## vllm
@@ -135,25 +135,25 @@ cargo build --release --features vllm
Run (still inside that virtualenv) - HF repo:
```
-./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct/
```
Run (still inside that virtualenv) - GGUF:
```
-./target/release/dynemo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+./target/release/dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
```
+ Multi-node:
Node 1:
```
-dynemo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
+dynamo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
```
Node 2:
```
-dynemo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
+dynamo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
```
## Python bring-your-own-engine
@@ -170,7 +170,7 @@ Build: `cargo build --release --features python`
If the Python engine wants to receive and return strings - it will do the prompt templating and tokenization itself - run it like this:
```
-dynemo-run out=pystr:/home/user/my_python_engine.py --name <model-name>
+dynamo-run out=pystr:/home/user/my_python_engine.py --name <model-name>
```
- The `request` parameter is a map, an OpenAI compatible create chat completion request: https://platform.openai.com/docs/api-reference/chat/create
@@ -201,11 +201,11 @@ async def generate(request):
yield {"id":"1","choices":[{"index":0,"delta":{"content":"","role":"assistant"},"finish_reason":"stop"}],"created":1841762283,"model":"Llama-3.2-1B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
```
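Putting the pieces together (editor illustration, not part of this diff): a minimal `my_python_engine.py` for `out=pystr`. The README specifies that `generate` receives an OpenAI-style create-chat-completion request as a dict and yields `chat.completion.chunk` dicts in the shape shown above; the echo behavior here is a made-up placeholder:

```
# Hypothetical pystr engine sketch: streams the last user message back one
# word at a time, then a final chunk with finish_reason "stop".
import time


async def generate(request):
    text = request["messages"][-1]["content"]
    for i, word in enumerate(text.split()):
        yield {
            "id": str(i),
            "choices": [{
                "index": 0,
                "delta": {"content": word + " ", "role": "assistant"},
                "finish_reason": None,
            }],
            "created": int(time.time()),
            "model": request.get("model", "echo"),
            "system_fingerprint": "local",
            "object": "chat.completion.chunk",
        }
    # Final chunk mirrors the README example: empty delta, finish_reason "stop".
    yield {
        "id": "final",
        "choices": [{
            "index": 0,
            "delta": {"content": "", "role": "assistant"},
            "finish_reason": "stop",
        }],
        "created": int(time.time()),
        "model": request.get("model", "echo"),
        "system_fingerprint": "local",
        "object": "chat.completion.chunk",
    }
```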
-### Dynemo does the pre-processing
+### Dynamo does the pre-processing
If the Python engine wants to receive and return tokens - the prompt templating and tokenization is already done - run it like this:
```
-dynemo-run out=pytok:/home/user/my_python_engine.py --model-path <hf-repo-checkout>
+dynamo-run out=pytok:/home/user/my_python_engine.py --model-path <hf-repo-checkout>
```
- The request parameter is a map that looks like this:
@@ -251,7 +251,7 @@ cargo build --release --features trtllm
Run:
```
-dynemo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
+dynamo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
```
Note that TRT-LLM uses its own `.engine` format for weights. Repo models must be converted like so:
@@ -269,7 +269,7 @@ python convert_checkpoint.py --model_dir /tmp/model/ --output_dir ./converted --
trtllm-build --checkpoint_dir ./converted --output_dir ./final/trtllm_engine --use_paged_context_fmha enable --gemm_plugin auto
```
-The `--model-path` you give to `dynemo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
+The `--model-path` you give to `dynamo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
+ Execute
TRT-LLM is a C++ library that must have been previously built and installed. It needs a lot of memory to compile. Gitlab builds a container you can try:
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use dynemo_llm::{
+use dynamo_llm::{
backend::Backend,
http::service::discovery::ModelEntry,
model_type::ModelType,
@@ -25,10 +25,10 @@ use dynemo_llm::{
Annotated,
},
};
-use dynemo_runtime::pipeline::{
+use dynamo_runtime::pipeline::{
network::Ingress, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
};
-use dynemo_runtime::{protocols::Endpoint, DistributedRuntime, Runtime};
+use dynamo_runtime::{protocols::Endpoint, DistributedRuntime, Runtime};
use crate::EngineConfig;