Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
1af7433b
Commit
1af7433b
authored
Mar 05, 2025
by
Neelay Shah
Committed by
GitHub
Mar 05, 2025
Browse files
refactor: rename triton_distributed to dynemo (#22)
Co-authored-by:
Graham King
<
grahamk@nvidia.com
>
parent
ee4ef06b
Changes
165
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
332 additions
and
333 deletions
+332
-333
launch/dynemo-run/src/input/text.rs
launch/dynemo-run/src/input/text.rs
+7
-7
launch/dynemo-run/src/lib.rs
launch/dynemo-run/src/lib.rs
+10
-11
launch/dynemo-run/src/main.rs
launch/dynemo-run/src/main.rs
+8
-8
launch/dynemo-run/src/output/echo_core.rs
launch/dynemo-run/src/output/echo_core.rs
+6
-6
launch/dynemo-run/src/output/echo_full.rs
launch/dynemo-run/src/output/echo_full.rs
+5
-5
lib/bindings/c/Cargo.lock
lib/bindings/c/Cargo.lock
+114
-114
lib/bindings/c/Cargo.toml
lib/bindings/c/Cargo.toml
+6
-6
lib/bindings/c/build.rs
lib/bindings/c/build.rs
+1
-1
lib/bindings/c/cbindgen.toml
lib/bindings/c/cbindgen.toml
+3
-3
lib/bindings/c/src/lib.rs
lib/bindings/c/src/lib.rs
+38
-38
lib/bindings/python/.gitignore
lib/bindings/python/.gitignore
+1
-1
lib/bindings/python/Cargo.lock
lib/bindings/python/Cargo.lock
+113
-113
lib/bindings/python/Cargo.toml
lib/bindings/python/Cargo.toml
+5
-5
lib/bindings/python/README.md
lib/bindings/python/README.md
+1
-1
lib/bindings/python/examples/bls/bar.py
lib/bindings/python/examples/bls/bar.py
+2
-2
lib/bindings/python/examples/bls/bls.py
lib/bindings/python/examples/bls/bls.py
+2
-2
lib/bindings/python/examples/bls/foo.py
lib/bindings/python/examples/bls/foo.py
+2
-2
lib/bindings/python/examples/error_handling/client.py
lib/bindings/python/examples/error_handling/client.py
+3
-3
lib/bindings/python/examples/error_handling/run.py
lib/bindings/python/examples/error_handling/run.py
+2
-2
lib/bindings/python/examples/error_handling/server.py
lib/bindings/python/examples/error_handling/server.py
+3
-3
No files found.
launch/dynemo-run/src/input/text.rs
View file @
1af7433b
...
...
@@ -13,12 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use
futures
::
StreamExt
;
use
std
::{
io
::{
ErrorKind
,
Read
,
Write
},
sync
::
Arc
,
};
use
triton_distributed_llm
::{
use
dynemo_llm
::{
backend
::
Backend
,
preprocessor
::
OpenAIPreprocessor
,
types
::{
...
...
@@ -29,10 +24,15 @@ use triton_distributed_llm::{
Annotated
,
},
};
use
triton_distributed
_runtime
::{
use
dynemo
_runtime
::{
pipeline
::{
Context
,
ManyOut
,
Operator
,
ServiceBackend
,
ServiceFrontend
,
SingleIn
,
Source
},
runtime
::
CancellationToken
,
};
use
futures
::
StreamExt
;
use
std
::{
io
::{
ErrorKind
,
Read
,
Write
},
sync
::
Arc
,
};
use
crate
::
EngineConfig
;
...
...
launch/dynemo-run/src/lib.rs
View file @
1af7433b
...
...
@@ -16,7 +16,7 @@
#[cfg(any(feature
=
"vllm"
,
feature
=
"sglang"
))]
use
std
::{
future
::
Future
,
pin
::
Pin
};
use
triton_distributed
_llm
::{
use
dynemo
_llm
::{
backend
::
ExecutionContext
,
model_card
::
model
::
ModelDeploymentCard
,
types
::{
...
...
@@ -27,7 +27,7 @@ use triton_distributed_llm::{
Annotated
,
},
};
use
triton_distributed
_runtime
::{
component
::
Client
,
protocols
::
Endpoint
,
DistributedRuntime
};
use
dynemo
_runtime
::{
component
::
Client
,
protocols
::
Endpoint
,
DistributedRuntime
};
mod
flags
;
pub
use
flags
::
Flags
;
...
...
@@ -67,7 +67,7 @@ pub enum EngineConfig {
#[allow(unused_mut)]
pub
async
fn
run
(
runtime
:
triton_distributed
_runtime
::
Runtime
,
runtime
:
dynemo
_runtime
::
Runtime
,
mut
in_opt
:
Input
,
// mut because vllm and sglang multi-node can change it
out_opt
:
Output
,
flags
:
Flags
,
...
...
@@ -173,13 +173,12 @@ pub async fn run(
};
EngineConfig
::
StaticFull
{
service_name
:
model_name
,
engine
:
triton_distributed_llm
::
engines
::
mistralrs
::
make_engine
(
&
model_path
)
.await
?
,
engine
:
dynemo_llm
::
engines
::
mistralrs
::
make_engine
(
&
model_path
)
.await
?
,
}
}
#[cfg(feature
=
"sglang"
)]
Output
::
SgLang
=>
{
use
triton_distributed
_llm
::
engines
::
sglang
;
use
dynemo
_llm
::
engines
::
sglang
;
let
Some
(
model_path
)
=
model_path
else
{
anyhow
::
bail!
(
"out=sglang requires flag --model-path=<full-path-to-model-dir>"
);
};
...
...
@@ -191,7 +190,7 @@ pub async fn run(
let
Some
(
sock_prefix
)
=
zmq_socket_prefix
else
{
anyhow
::
bail!
(
"sglang requires zmq_socket_prefix"
);
};
let
node_conf
=
triton_distributed
_llm
::
engines
::
MultiNodeConfig
{
let
node_conf
=
dynemo
_llm
::
engines
::
MultiNodeConfig
{
num_nodes
:
flags
.num_nodes
,
node_rank
:
flags
.node_rank
,
leader_addr
:
flags
.leader_addr
.unwrap_or_default
(),
...
...
@@ -229,7 +228,7 @@ pub async fn run(
}
#[cfg(feature
=
"vllm"
)]
Output
::
Vllm
=>
{
use
triton_distributed
_llm
::
engines
::
vllm
;
use
dynemo
_llm
::
engines
::
vllm
;
if
flags
.base_gpu_id
!=
0
{
anyhow
::
bail!
(
"vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."
);
}
...
...
@@ -253,7 +252,7 @@ pub async fn run(
let
Some
(
sock_prefix
)
=
zmq_socket_prefix
else
{
anyhow
::
bail!
(
"vllm requires zmq_socket_prefix"
);
};
let
node_conf
=
triton_distributed
_llm
::
engines
::
MultiNodeConfig
{
let
node_conf
=
dynemo
_llm
::
engines
::
MultiNodeConfig
{
num_nodes
:
flags
.num_nodes
,
node_rank
:
flags
.node_rank
,
leader_addr
:
flags
.leader_addr
.unwrap_or_default
(),
...
...
@@ -296,7 +295,7 @@ pub async fn run(
}
#[cfg(feature
=
"llamacpp"
)]
Output
::
LlamaCpp
=>
{
use
triton_distributed
_llm
::
engines
::
llamacpp
;
use
dynemo
_llm
::
engines
::
llamacpp
;
let
Some
(
model_path
)
=
model_path
else
{
anyhow
::
bail!
(
"out=llamacpp requires flag --model-path=<full-path-to-model-gguf>"
);
};
...
...
@@ -317,7 +316,7 @@ pub async fn run(
}
#[cfg(feature
=
"trtllm"
)]
Output
::
TrtLLM
=>
{
use
triton_distributed
_llm
::
engines
::
trtllm
;
use
dynemo
_llm
::
engines
::
trtllm
;
let
Some
(
model_path
)
=
model_path
else
{
anyhow
::
bail!
(
"out=trtllm requires flag --model-path=<full-path-to-model-dir>"
);
};
...
...
launch/dynemo-run/src/main.rs
View file @
1af7433b
...
...
@@ -18,7 +18,7 @@ use std::env;
use
clap
::
Parser
;
use
dynemo_run
::{
Input
,
Output
};
use
triton_distributed
_runtime
::
logging
;
use
dynemo
_runtime
::
logging
;
const
HELP
:
&
str
=
r#"
dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally.
...
...
@@ -60,13 +60,13 @@ fn main() -> anyhow::Result<()> {
if
cfg!
(
feature
=
"sglang"
)
{
#[cfg(feature
=
"sglang"
)]
{
use
triton_distributed
_llm
::
engines
::
sglang
;
use
dynemo
_llm
::
engines
::
sglang
;
let
gpu_config
=
sglang
::
MultiGPUConfig
{
tp_size
:
flags
.tensor_parallel_size
,
tp_rank
:
sglang_flags
.tp_rank
,
gpu_id
:
sglang_flags
.gpu_id
,
};
let
node_config
=
triton_distributed
_llm
::
engines
::
MultiNodeConfig
{
let
node_config
=
dynemo
_llm
::
engines
::
MultiNodeConfig
{
num_nodes
:
flags
.num_nodes
,
node_rank
:
flags
.node_rank
,
leader_addr
:
flags
.leader_addr
.unwrap_or_default
(),
...
...
@@ -98,8 +98,8 @@ fn main() -> anyhow::Result<()> {
if
cfg!
(
feature
=
"vllm"
)
{
#[cfg(feature
=
"vllm"
)]
{
use
triton_distributed
_llm
::
engines
::
vllm
;
let
node_config
=
triton_distributed
_llm
::
engines
::
MultiNodeConfig
{
use
dynemo
_llm
::
engines
::
vllm
;
let
node_config
=
dynemo
_llm
::
engines
::
MultiNodeConfig
{
num_nodes
:
flags
.num_nodes
,
node_rank
:
flags
.node_rank
,
leader_addr
:
flags
.leader_addr
.unwrap_or_default
(),
...
...
@@ -119,15 +119,15 @@ fn main() -> anyhow::Result<()> {
}
// max_worker_threads and max_blocking_threads from env vars or config file.
let
rt_config
=
triton_distributed
_runtime
::
RuntimeConfig
::
from_settings
()
?
;
let
rt_config
=
dynemo
_runtime
::
RuntimeConfig
::
from_settings
()
?
;
// One per process. Wraps a Runtime which holds two tokio runtimes.
let
worker
=
triton_distributed
_runtime
::
Worker
::
from_config
(
rt_config
)
?
;
let
worker
=
dynemo
_runtime
::
Worker
::
from_config
(
rt_config
)
?
;
worker
.execute
(
wrapper
)
}
async
fn
wrapper
(
runtime
:
triton_distributed
_runtime
::
Runtime
)
->
anyhow
::
Result
<
()
>
{
async
fn
wrapper
(
runtime
:
dynemo
_runtime
::
Runtime
)
->
anyhow
::
Result
<
()
>
{
let
mut
in_opt
=
None
;
let
mut
out_opt
=
None
;
let
args
:
Vec
<
String
>
=
env
::
args
()
.skip
(
1
)
.collect
();
...
...
launch/dynemo-run/src/output/echo_core.rs
View file @
1af7433b
...
...
@@ -18,12 +18,12 @@ use std::{sync::Arc, time::Duration};
use
async_stream
::
stream
;
use
async_trait
::
async_trait
;
use
triton_distributed
_llm
::
backend
::
ExecutionContext
;
use
triton_distributed
_llm
::
preprocessor
::
BackendInput
;
use
triton_distributed
_llm
::
protocols
::
common
::
llm_backend
::
LLMEngineOutput
;
use
triton_distributed
_runtime
::
engine
::{
AsyncEngine
,
AsyncEngineContextProvider
,
ResponseStream
};
use
triton_distributed
_runtime
::
pipeline
::{
Error
,
ManyOut
,
SingleIn
};
use
triton_distributed
_runtime
::
protocols
::
annotated
::
Annotated
;
use
dynemo
_llm
::
backend
::
ExecutionContext
;
use
dynemo
_llm
::
preprocessor
::
BackendInput
;
use
dynemo
_llm
::
protocols
::
common
::
llm_backend
::
LLMEngineOutput
;
use
dynemo
_runtime
::
engine
::{
AsyncEngine
,
AsyncEngineContextProvider
,
ResponseStream
};
use
dynemo
_runtime
::
pipeline
::{
Error
,
ManyOut
,
SingleIn
};
use
dynemo
_runtime
::
protocols
::
annotated
::
Annotated
;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
...
...
launch/dynemo-run/src/output/echo_full.rs
View file @
1af7433b
...
...
@@ -18,13 +18,13 @@ use std::{sync::Arc, time::Duration};
use
async_stream
::
stream
;
use
async_trait
::
async_trait
;
use
triton_distributed
_llm
::
protocols
::
openai
::
chat_completions
::{
use
dynemo
_llm
::
protocols
::
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
};
use
triton_distributed
_llm
::
types
::
openai
::
chat_completions
::
OpenAIChatCompletionsStreamingEngine
;
use
triton_distributed
_runtime
::
engine
::{
AsyncEngine
,
AsyncEngineContextProvider
,
ResponseStream
};
use
triton_distributed
_runtime
::
pipeline
::{
Error
,
ManyOut
,
SingleIn
};
use
triton_distributed
_runtime
::
protocols
::
annotated
::
Annotated
;
use
dynemo
_llm
::
types
::
openai
::
chat_completions
::
OpenAIChatCompletionsStreamingEngine
;
use
dynemo
_runtime
::
engine
::{
AsyncEngine
,
AsyncEngineContextProvider
,
ResponseStream
};
use
dynemo
_runtime
::
pipeline
::{
Error
,
ManyOut
,
SingleIn
};
use
dynemo
_runtime
::
protocols
::
annotated
::
Annotated
;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
...
...
lib/bindings/c/Cargo.lock
View file @
1af7433b
...
...
@@ -954,6 +954,99 @@ dependencies = [
"syn 2.0.96",
]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "ed25519"
version = "2.2.3"
...
...
@@ -1853,6 +1946,27 @@ version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "libdynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"dynemo-llm",
"dynemo-runtime",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"uuid",
]
[[package]]
name = "libloading"
version = "0.8.6"
...
...
@@ -1873,27 +1987,6 @@ dependencies = [
"libc",
]
[[package]]
name = "libtriton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
"uuid",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
...
...
@@ -3955,99 +4048,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "try-lock"
version = "0.2.5"
...
...
lib/bindings/c/Cargo.toml
View file @
1af7433b
...
...
@@ -14,24 +14,24 @@
# limitations under the License.
[package]
name
=
"lib
triton-distributed
-llm"
name
=
"lib
dynemo
-llm"
version
=
"0.2.1"
edition
=
"2021"
authors
=
["NVIDIA"]
license
=
"Apache-2.0"
homepage
=
"https://github.com/
triton-inference-server/triton_distributed
"
repository
=
"https://github.com/
triton-inference-server/triton_distributed
"
homepage
=
"https://github.com/
dynemo-ai/dynemo
"
repository
=
"https://github.com/
dynemo-ai/dynemo.git
"
[lib]
name
=
"
triton_distributed
_llm_capi"
name
=
"
dynemo
_llm_capi"
crate-type
=
["cdylib"]
[build-dependencies]
cbindgen
=
"0.27"
[dependencies]
triton-distributed
-llm
=
{
path
=
"../../llm"
}
triton-distributed
-runtime
=
{
path
=
"../../runtime"
}
dynemo
-llm
=
{
path
=
"../../llm"
}
dynemo
-runtime
=
{
path
=
"../../runtime"
}
anyhow
=
{
version
=
"1"
}
futures
=
"0.3"
...
...
lib/bindings/c/build.rs
View file @
1af7433b
...
...
@@ -22,7 +22,7 @@ fn main() {
let
header_path
=
Path
::
new
(
&
crate_dir
)
.join
(
"include"
)
.join
(
"nvidia"
)
.join
(
"
triton
_llm"
)
.join
(
"
dynemo
_llm"
)
.join
(
"llm_engine.h"
);
cbindgen
::
generate
(
crate_dir
)
...
...
lib/bindings/c/cbindgen.toml
View file @
1af7433b
...
...
@@ -15,7 +15,7 @@
language
=
"C++"
cpp_compat
=
true
include_guard
=
"__NVIDIA_
TRITON
_LLM_API__"
include_guard
=
"__NVIDIA_
DYNEMO
_LLM_API__"
[enum]
...
...
@@ -25,7 +25,7 @@ enum_class = false
[export]
include
=
[
"
Triton
LlmResult"
,
"
triton
_llm_init"
,
"
triton
_llm_shutdown"
]
include
=
[
"
Dynemo
LlmResult"
,
"
dynemo
_llm_init"
,
"
dynemo
_llm_shutdown"
]
[export.rename]
"
Triton
LlmResult"
=
"
triton
_llm_result_t"
"
Dynemo
LlmResult"
=
"
dynemo
_llm_result_t"
lib/bindings/c/src/lib.rs
View file @
1af7433b
...
...
@@ -19,10 +19,10 @@ use once_cell::sync::OnceCell;
use
std
::
ffi
::
CStr
;
use
std
::
sync
::
atomic
::{
AtomicU32
,
Ordering
};
use
triton_distributed
_llm
::
kv_router
::{
use
dynemo
_llm
::
kv_router
::{
indexer
::
compute_block_hash_for_seq
,
protocols
::
*
,
publisher
::
KvEventPublisher
,
};
use
triton_distributed
_runtime
::{
DistributedRuntime
,
Worker
};
use
dynemo
_runtime
::{
DistributedRuntime
,
Worker
};
static
WK
:
OnceCell
<
Worker
>
=
OnceCell
::
new
();
static
DRT
:
AsyncOnceCell
<
DistributedRuntime
>
=
AsyncOnceCell
::
new
();
// [FIXME] shouldn't the publisher be instance passing between API calls?
...
...
@@ -41,7 +41,7 @@ fn initialize_tracing() {
}
#[repr(u32)]
pub
enum
Triton
LlmResult
{
pub
enum
Dynemo
LlmResult
{
OK
=
0
,
ERR
=
1
,
}
...
...
@@ -49,17 +49,17 @@ pub enum TritonLlmResult {
/// # Safety
/// the namespace_c_str and component_c_str are passed as pointers to C strings
#[no_mangle]
pub
unsafe
extern
"C"
fn
triton
_llm_init
(
pub
unsafe
extern
"C"
fn
dynemo
_llm_init
(
namespace_c_str
:
*
const
c_char
,
component_c_str
:
*
const
c_char
,
worker_id
:
i64
,
)
->
Triton
LlmResult
{
)
->
Dynemo
LlmResult
{
initialize_tracing
();
let
wk
=
match
WK
.get_or_try_init
(
Worker
::
from_settings
)
{
Ok
(
wk
)
=>
wk
.clone
(),
Err
(
e
)
=>
{
eprintln!
(
"Failed to initialize runtime: {:?}"
,
e
);
return
Triton
LlmResult
::
ERR
;
return
Dynemo
LlmResult
::
ERR
;
}
};
let
rt
=
wk
.runtime
();
...
...
@@ -73,7 +73,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok
(
_
)
=>
Ok
(()),
Err
(
e
)
=>
{
eprintln!
(
"Failed to initialize distributed runtime: {:?}"
,
e
);
Err
(
Triton
LlmResult
::
ERR
)
Err
(
Dynemo
LlmResult
::
ERR
)
}
}
});
...
...
@@ -81,7 +81,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok
(
s
)
=>
s
.to_string
(),
Err
(
e
)
=>
{
eprintln!
(
"Failed to convert C string to Rust string: {:?}"
,
e
);
return
Triton
LlmResult
::
ERR
;
return
Dynemo
LlmResult
::
ERR
;
}
};
...
...
@@ -89,18 +89,18 @@ pub unsafe extern "C" fn triton_llm_init(
Ok
(
s
)
=>
s
.to_string
(),
Err
(
e
)
=>
{
eprintln!
(
"Failed to convert C string to Rust string: {:?}"
,
e
);
return
Triton
LlmResult
::
ERR
;
return
Dynemo
LlmResult
::
ERR
;
}
};
match
result
{
Ok
(
_
)
=>
match
KV_PUB
.get_or_try_init
(
move
||
triton
_create_kv_publisher
(
namespace
,
component
,
worker_id
))
.get_or_try_init
(
move
||
dynemo
_create_kv_publisher
(
namespace
,
component
,
worker_id
))
{
Ok
(
_
)
=>
Triton
LlmResult
::
OK
,
Ok
(
_
)
=>
Dynemo
LlmResult
::
OK
,
Err
(
e
)
=>
{
eprintln!
(
"Failed to initialize distributed runtime: {:?}"
,
e
);
Triton
LlmResult
::
ERR
Dynemo
LlmResult
::
ERR
}
},
Err
(
e
)
=>
e
,
...
...
@@ -108,33 +108,33 @@ pub unsafe extern "C" fn triton_llm_init(
}
#[no_mangle]
pub
extern
"C"
fn
triton
_llm_shutdown
()
->
Triton
LlmResult
{
pub
extern
"C"
fn
dynemo
_llm_shutdown
()
->
Dynemo
LlmResult
{
let
wk
=
match
WK
.get
()
{
Some
(
wk
)
=>
wk
,
None
=>
{
eprintln!
(
"Runtime not initialized"
);
return
Triton
LlmResult
::
ERR
;
return
Dynemo
LlmResult
::
ERR
;
}
};
wk
.runtime
()
.shutdown
();
Triton
LlmResult
::
OK
Dynemo
LlmResult
::
OK
}
#[no_mangle]
pub
extern
"C"
fn
triton
_llm_load_publisher_create
()
->
Triton
LlmResult
{
Triton
LlmResult
::
OK
pub
extern
"C"
fn
dynemo
_llm_load_publisher_create
()
->
Dynemo
LlmResult
{
Dynemo
LlmResult
::
OK
}
// instantiate a kv publisher
// this will bring up the task to publish and the channels to await publishing events
// the [`
triton
_kv_publish_store_event`] call will use a handle to the publisher to send events
// store and the [`
triton
_kv_event_create_removed`] will create remove events
// the [`
dynemo
_kv_publish_store_event`] call will use a handle to the publisher to send events
// store and the [`
dynemo
_kv_event_create_removed`] will create remove events
// these calls must be driven by external c++ threads that are consuming the kv events from the
// c++ executor api
fn
triton
_create_kv_publisher
(
fn
dynemo
_create_kv_publisher
(
namespace
:
String
,
component
:
String
,
worker_id
:
i64
,
...
...
@@ -238,7 +238,7 @@ fn kv_event_create_removed_from_parts(
/// parent_hash is passed as pointer to indicate whether the blocks
/// has a parent hash or not. nullptr is used to represent no parent hash
#[no_mangle]
pub
unsafe
extern
"C"
fn
triton
_kv_event_publish_stored
(
pub
unsafe
extern
"C"
fn
dynemo
_kv_event_publish_stored
(
event_id
:
u64
,
token_ids
:
*
const
u32
,
num_block_tokens
:
*
const
usize
,
...
...
@@ -246,7 +246,7 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
num_blocks
:
usize
,
parent_hash
:
*
const
u64
,
lora_id
:
u64
,
)
->
Triton
LlmResult
{
)
->
Dynemo
LlmResult
{
let
publisher
=
KV_PUB
.get
()
.unwrap
();
let
parent_hash
=
{
if
parent_hash
.is_null
()
{
...
...
@@ -265,40 +265,40 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
lora_id
,
);
match
publisher
.publish
(
event
)
{
Ok
(
_
)
=>
Triton
LlmResult
::
OK
,
Ok
(
_
)
=>
Dynemo
LlmResult
::
OK
,
Err
(
e
)
=>
{
eprintln!
(
"Error publishing stored kv event {:?}"
,
e
);
Triton
LlmResult
::
ERR
Dynemo
LlmResult
::
ERR
}
}
}
#[no_mangle]
pub
extern
"C"
fn
triton
_kv_event_publish_removed
(
pub
extern
"C"
fn
dynemo
_kv_event_publish_removed
(
event_id
:
u64
,
block_ids
:
*
const
u64
,
num_blocks
:
usize
,
)
->
Triton
LlmResult
{
)
->
Dynemo
LlmResult
{
let
publisher
=
KV_PUB
.get
()
.unwrap
();
let
event
=
kv_event_create_removed_from_parts
(
event_id
,
block_ids
,
num_blocks
);
match
publisher
.publish
(
event
)
{
Ok
(
_
)
=>
Triton
LlmResult
::
OK
,
Ok
(
_
)
=>
Dynemo
LlmResult
::
OK
,
Err
(
e
)
=>
{
eprintln!
(
"Error publishing removed kv event {:?}"
,
e
);
Triton
LlmResult
::
ERR
Dynemo
LlmResult
::
ERR
}
}
}
// #[no_mangle]
// pub extern "C" fn
triton
_kv_publish_store_event(
// pub extern "C" fn
dynemo
_kv_publish_store_event(
// event_id: u64,
// token_ids: *const u32,
// num_tokens: usize,
// lora_id: u64,
// ) ->
Triton
LlmResult {
// ) ->
Dynemo
LlmResult {
// // if event.is_null() || token_ids.is_null() {
// // return
triton
KvErrorType::INVALID_TOKEN_IDS;
// // return
dynemo
KvErrorType::INVALID_TOKEN_IDS;
// // }
// // let tokens = unsafe { std::slice::from_raw_parts(token_ids, num_tokens) }.to_vec();
...
...
@@ -311,15 +311,15 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) };
//
Triton
LlmResult::OK
//
Dynemo
LlmResult::OK
// }
// #[no_mangle]
// pub extern "C" fn
triton
_kv_event_create_removed(
// pub extern "C" fn
dynemo
_kv_event_create_removed(
// event_id: u64,
// block_hashes: *const u64,
// num_hashes: usize,
// ) ->
Triton
LlmResult {
// ) ->
Dynemo
LlmResult {
// // if event.is_null() || block_hashes.is_null() {
// // return -1;
// // }
...
...
@@ -334,19 +334,19 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) };
// // 0
//
Triton
LlmResult::OK
//
Dynemo
LlmResult::OK
// }
// /// create load publisher object and return a handle
// /// load publisher will instantiate the nats service and tie its stats handler to
// /// a watch channel receiver. the watch channel sender will be attached to the
// /// handle and calls to [`
triton
_load_stats_publish`] issue the stats to the watch t
// pub extern "C" fn
triton
_load_publisher_create() -> *mut LoadPublisher {
// /// handle and calls to [`
dynemo
_load_stats_publish`] issue the stats to the watch t
// pub extern "C" fn
dynemo
_load_publisher_create() -> *mut LoadPublisher {
// // let publisher = Box::new(LoadPublisher::new());
// // Box::into_raw(publisher)
// }
// pub extern "C" fn
triton
_load_stats_publish(
// pub extern "C" fn
dynemo
_load_stats_publish(
// publisher: *mut LoadPublisher,
// active_slots: u64,
// total_slots: u64,
...
...
lib/bindings/python/.gitignore
View file @
1af7433b
/target
python/
triton_distributed/
*.so
python/
dynemo/.
*.so
lib/bindings/python/Cargo.lock
View file @
1af7433b
...
...
@@ -956,6 +956,119 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-py3"
version = "0.2.1"
dependencies = [
"dynemo-llm",
"dynemo-runtime",
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "ed25519"
version = "2.2.3"
...
...
@@ -4004,119 +4117,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-py3"
version = "0.2.1"
dependencies = [
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "try-lock"
version = "0.2.5"
...
...
lib/bindings/python/Cargo.toml
View file @
1af7433b
...
...
@@ -14,13 +14,13 @@
# limitations under the License.
[package]
name
=
"
triton-distributed
-py3"
name
=
"
dynemo
-py3"
version
=
"0.2.1"
edition
=
"2021"
authors
=
["NVIDIA"]
license
=
"Apache-2.0"
homepage
=
"https://github.com/
triton-inference-server/triton_distributed
"
repository
=
"https://github.com/
triton-inference-server/triton_distributed
"
homepage
=
"https://github.com/
dynemo-ai/dynemo
"
repository
=
"https://github.com/
dynemo-ai/dynemo.git
"
[lib]
path
=
"rust/lib.rs"
...
...
@@ -30,8 +30,8 @@ crate-type = ["cdylib"]
[dependencies]
triton-distributed
-llm
=
{
path
=
"../../llm"
}
triton-distributed
-runtime
=
{
path
=
"../../runtime"
}
dynemo
-llm
=
{
path
=
"../../llm"
}
dynemo
-runtime
=
{
path
=
"../../runtime"
}
futures
=
"0.3"
once_cell
=
"1.20.3"
...
...
lib/bindings/python/README.md
View file @
1af7433b
...
...
@@ -41,7 +41,7 @@ source .venv/bin/activate
uv pip install maturin
```
4.
Build and install
triton_distributed
wheel
4.
Build and install
dynemo
wheel
```
maturin develop --uv
```
...
...
lib/bindings/python/examples/bls/bar.py
View file @
1af7433b
...
...
@@ -17,7 +17,7 @@ import asyncio
import
uvloop
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
uvloop
.
install
()
...
...
@@ -29,7 +29,7 @@ class RequestHandler:
yield
char
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
component
=
runtime
.
namespace
(
"examples/bls"
).
component
(
"bar"
)
await
component
.
create_service
()
...
...
lib/bindings/python/examples/bls/bls.py
View file @
1af7433b
...
...
@@ -17,12 +17,12 @@ import asyncio
import
uvloop
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
uvloop
.
install
()
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
foo
=
(
await
runtime
.
namespace
(
"examples/bls"
)
...
...
lib/bindings/python/examples/bls/foo.py
View file @
1af7433b
...
...
@@ -17,7 +17,7 @@ import asyncio
import
uvloop
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
uvloop
.
install
()
...
...
@@ -28,7 +28,7 @@ class RequestHandler:
yield
char
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
component
=
runtime
.
namespace
(
"examples/bls"
).
component
(
"foo"
)
await
component
.
create_service
()
...
...
lib/bindings/python/examples/error_handling/client.py
View file @
1af7433b
...
...
@@ -17,12 +17,12 @@ import asyncio
import
uvloop
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
await
init
(
runtime
,
"
triton-init
"
)
await
init
(
runtime
,
"
dynemo
"
)
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
...
...
lib/bindings/python/examples/error_handling/run.py
View file @
1af7433b
...
...
@@ -21,7 +21,7 @@ import uvloop
from
client
import
init
as
client_init
from
server
import
init
as
server_init
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
def
random_string
(
length
=
10
):
...
...
@@ -29,7 +29,7 @@ def random_string(length=10):
return
""
.
join
(
random
.
choices
(
chars
,
k
=
length
))
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
ns
=
random_string
()
task
=
asyncio
.
create_task
(
server_init
(
runtime
,
ns
))
...
...
lib/bindings/python/examples/error_handling/server.py
View file @
1af7433b
...
...
@@ -17,7 +17,7 @@ import asyncio
import
uvloop
from
triton_distributed
.runtime
import
DistributedRuntime
,
triton
_worker
from
dynemo
.runtime
import
DistributedRuntime
,
dynemo
_worker
class
RequestHandler
:
...
...
@@ -33,9 +33,9 @@ class RequestHandler:
yield
char
@
triton
_worker
()
@
dynemo
_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
await
init
(
runtime
,
"
triton-init
"
)
await
init
(
runtime
,
"
dynemo
"
)
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
...
...
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment