Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2bf27924
Unverified
Commit
2bf27924
authored
Jul 08, 2025
by
Graham King
Committed by
GitHub
Jul 08, 2025
Browse files
feat(python): Python bindings for the Dynamo CLI tools (#1799)
parent
3e3ff934
Changes
16
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
2534 additions
and
222 deletions
+2534
-222
Cargo.lock
Cargo.lock
+1
-0
examples/cli/cli.py
examples/cli/cli.py
+156
-0
launch/dynamo-run/Cargo.toml
launch/dynamo-run/Cargo.toml
+1
-0
launch/dynamo-run/src/lib.rs
launch/dynamo-run/src/lib.rs
+9
-4
lib/bindings/python/Cargo.lock
lib/bindings/python/Cargo.lock
+2086
-170
lib/bindings/python/Cargo.toml
lib/bindings/python/Cargo.toml
+6
-1
lib/bindings/python/README.md
lib/bindings/python/README.md
+20
-0
lib/bindings/python/rust/lib.rs
lib/bindings/python/rust/lib.rs
+5
-0
lib/bindings/python/rust/llm.rs
lib/bindings/python/rust/llm.rs
+1
-12
lib/bindings/python/rust/llm/entrypoint.rs
lib/bindings/python/rust/llm/entrypoint.rs
+190
-0
lib/bindings/python/src/dynamo/_core.pyi
lib/bindings/python/src/dynamo/_core.pyi
+20
-0
lib/bindings/python/src/dynamo/llm/__init__.py
lib/bindings/python/src/dynamo/llm/__init__.py
+6
-12
lib/engines/llamacpp/Cargo.toml
lib/engines/llamacpp/Cargo.toml
+2
-12
lib/llm/src/entrypoint.rs
lib/llm/src/entrypoint.rs
+1
-0
lib/llm/src/entrypoint/input.rs
lib/llm/src/entrypoint/input.rs
+25
-7
lib/llm/src/local_model.rs
lib/llm/src/local_model.rs
+5
-4
No files found.
Cargo.lock
View file @
2bf27924
...
@@ -1789,6 +1789,7 @@ dependencies = [
...
@@ -1789,6 +1789,7 @@ dependencies = [
"dynamo-engine-mistralrs",
"dynamo-engine-mistralrs",
"dynamo-llm",
"dynamo-llm",
"dynamo-runtime",
"dynamo-runtime",
"either",
"futures",
"futures",
"futures-util",
"futures-util",
"libc",
"libc",
...
...
examples/cli/cli.py
0 → 100644
View file @
2bf27924
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Example cli using the Python bindings, similar to `dynamo-run`.
# Usage: `python cli.py in=text out=mistralrs <your-model>`.
# Must be in a virtualenv with the Dynamo bindings (or wheel) installed.
import
argparse
import
asyncio
import
sys
from
pathlib
import
Path
import
uvloop
from
dynamo.llm
import
EngineType
,
EntrypointArgs
,
make_engine
,
run_input
from
dynamo.runtime
import
DistributedRuntime
def _bounded_uint(bits):
    """Build an argparse `type=` callable that accepts integers in [0, 2**bits - 1]."""
    upper = (1 << bits) - 1

    def convert(text):
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}")
        if not 0 <= value <= upper:
            raise argparse.ArgumentTypeError(
                f"{value} is out of range (expected 0..{upper})"
            )
        return value

    return convert


def parse_args():
    """Parse the command line of this example CLI.

    The command line mixes two syntaxes:

    * `in=<mode>` / `out=<mode>` pairs, which are extracted by hand. The special
      form `in=batch:FILE` selects batch mode and carries the batch file path.
    * Regular flags plus one optional positional model path, handled by argparse.

    Returns a dict with the keys `in_mode`, `out_mode`, `batch_file` (None unless
    `in_mode` is "batch"), `model_path` and `flags` (the argparse namespace).

    Exits through `parser.error` (status 2) when batch mode has no file, when a
    model path is missing and `out` is not "dyn", or when a sized integer flag is
    outside the range documented in its help text.
    """
    in_mode = "text"
    out_mode = "echo"
    batch_file = None  # Specific to in_mode="batch"

    # List to hold arguments that argparse will process (flags and model path)
    argparse_args = []

    # --- Step 1: Manual Pre-parsing for 'in=' and 'out=' ---
    # Iterate through sys.argv[1:] to extract in= and out=
    # and collect remaining arguments for argparse.
    for arg in sys.argv[1:]:
        if arg.startswith("in="):
            in_val = arg[len("in=") :]
            if in_val.startswith("batch:"):
                in_mode = "batch"
                batch_file = in_val[len("batch:") :]
            else:
                in_mode = in_val
        elif arg.startswith("out="):
            out_mode = arg[len("out=") :]
        else:
            # This argument is not 'in=' or 'out=', so it's either a flag or the model path
            argparse_args.append(arg)

    # --- Step 2: Argparse for flags and the model path ---
    parser = argparse.ArgumentParser(
        description="Dynamo CLI: Connect inputs to an engine",
        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
    )

    # model_name: Option<String>
    parser.add_argument("--model-name", type=str, help="Name of the model to load.")
    # model_config: Option<PathBuf>
    parser.add_argument(
        "--model-config", type=Path, help="Path to the model configuration file."
    )
    # context_length: Option<u32>
    # Range-checked here so a bad value is a usage error rather than a failure
    # later, when the value is handed to EntrypointArgs.
    parser.add_argument(
        "--context-length",
        type=_bounded_uint(32),
        help="Maximum context length for the model (u32).",
    )
    # template_file: Option<PathBuf>
    parser.add_argument(
        "--template-file",
        type=Path,
        help="Path to the template file for text generation.",
    )
    # kv_cache_block_size: Option<u32>
    parser.add_argument(
        "--kv-cache-block-size",
        type=_bounded_uint(32),
        help="KV cache block size (u32).",
    )
    # http_port: Option<u16>
    parser.add_argument(
        "--http-port",
        type=_bounded_uint(16),
        help="HTTP port for the engine (u16).",
    )
    # TODO: Not yet used here
    parser.add_argument(
        "--tensor-parallel-size",
        type=int,
        help="Tensor parallel size for the model (e.g., 4).",
    )

    # Add the positional model argument.
    # It's made optional (nargs='?') because its requirement depends on 'out_mode',
    # which is handled in post-parsing validation.
    parser.add_argument(
        "model",
        nargs="?",  # Make it optional for argparse, we'll validate manually
        help="Path to the model (e.g., Qwen/Qwen3-0.6B).\n" "Required unless out=dyn.",
    )

    # Parse the arguments that were not 'in=' or 'out='
    flags = parser.parse_args(argparse_args)

    # --- Step 3: Post-parsing Validation and Final Assignment ---
    # Validate 'batch' mode requires a file path
    if in_mode == "batch" and not batch_file:
        parser.error("Batch mode requires a file path: in=batch:FILE")

    # Validate model path requirement based on 'out_mode'
    if out_mode != "dyn" and flags.model is None:
        parser.error("Model path is required unless out=dyn.")

    # Consolidate all parsed arguments into a dictionary
    parsed_args = {
        "in_mode": in_mode,
        "out_mode": out_mode,
        "batch_file": batch_file,  # Will be None if in_mode is not "batch"
        "model_path": flags.model,
        "flags": flags,
    }
    return parsed_args
async def run():
    """Build the engine selected by `out=` and run it against the `in=` input.

    Steps: parse the command line, map `out_mode` to an `EngineType`, create a
    `DistributedRuntime` on the running event loop, build `EntrypointArgs` from
    the flags the user actually supplied, then await `make_engine` and
    `run_input`.

    Exits with status 1 if `out_mode` names an engine this example does not map.
    """
    # Parse first so `--help` and usage errors exit before any runtime is created.
    args = parse_args()

    engine_type_map = {
        "echo": EngineType.Echo,
        "mistralrs": EngineType.MistralRs,
        "llamacpp": EngineType.LlamaCpp,
        "dyn": EngineType.Dynamic,
    }
    out_mode = args["out_mode"]
    engine_type = engine_type_map.get(out_mode)
    if engine_type is None:
        print(f"Unsupported output type: {out_mode}")
        sys.exit(1)

    loop = asyncio.get_running_loop()
    runtime = DistributedRuntime(loop, False)

    # TODO: The "vllm", "sglang" and "trtllm" cases should call Python directly

    # Only forward flags the user set, so unset ones keep the binding's defaults.
    entrypoint_kwargs = {"model_path": args["model_path"]}
    flags = args["flags"]
    for name in (
        "model_name",
        "model_config",
        "context_length",
        "template_file",
        "kv_cache_block_size",
        "http_port",
    ):
        value = getattr(flags, name)
        if value is not None:
            entrypoint_kwargs[name] = value

    # parse_args() splits `in=batch:FILE` into a mode and a path. run_input takes
    # a single input string, so rejoin them; otherwise the file path is dropped.
    input_spec = args["in_mode"]
    if input_spec == "batch":
        input_spec = f"batch:{args['batch_file']}"

    e = EntrypointArgs(engine_type, **entrypoint_kwargs)
    engine = await make_engine(runtime, e)
    await run_input(runtime, input_spec, engine)
# Script entry point: run the async `run()` coroutine to completion via uvloop.
if __name__ == "__main__":
    uvloop.run(run())
launch/dynamo-run/Cargo.toml
View file @
2bf27924
...
@@ -34,6 +34,7 @@ anyhow = { workspace = true }
...
@@ -34,6 +34,7 @@ anyhow = { workspace = true }
async-openai
=
{
workspace
=
true
}
async-openai
=
{
workspace
=
true
}
async-stream
=
{
workspace
=
true
}
async-stream
=
{
workspace
=
true
}
async-trait
=
{
workspace
=
true
}
async-trait
=
{
workspace
=
true
}
either
=
{
workspace
=
true
}
futures
=
{
workspace
=
true
}
futures
=
{
workspace
=
true
}
libc
=
{
workspace
=
true
}
libc
=
{
workspace
=
true
}
serde
=
{
workspace
=
true
}
serde
=
{
workspace
=
true
}
...
...
launch/dynamo-run/src/lib.rs
View file @
2bf27924
...
@@ -11,6 +11,7 @@ use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
...
@@ -11,6 +11,7 @@ use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use
dynamo_runtime
::
CancellationToken
;
use
dynamo_runtime
::
CancellationToken
;
mod
flags
;
mod
flags
;
use
either
::
Either
;
pub
use
flags
::
Flags
;
pub
use
flags
::
Flags
;
mod
opt
;
mod
opt
;
pub
use
dynamo_llm
::
request_template
::
RequestTemplate
;
pub
use
dynamo_llm
::
request_template
::
RequestTemplate
;
...
@@ -41,14 +42,19 @@ pub async fn run(
...
@@ -41,14 +42,19 @@ pub async fn run(
.kv_cache_block_size
(
flags
.kv_cache_block_size
)
.kv_cache_block_size
(
flags
.kv_cache_block_size
)
// Only set if user provides. Usually loaded from tokenizer_config.json
// Only set if user provides. Usually loaded from tokenizer_config.json
.context_length
(
flags
.context_length
)
.context_length
(
flags
.context_length
)
.http_port
(
flags
.http_port
)
.http_port
(
Some
(
flags
.http_port
)
)
.router_config
(
flags
.router_config
())
.router_config
(
flags
.router_config
())
.request_template
(
flags
.request_template
.clone
());
.request_template
(
flags
.request_template
.clone
());
// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
let
mut
rt
=
Either
::
Left
(
runtime
.clone
());
if
let
Input
::
Endpoint
(
path
)
=
&
in_opt
{
if
let
Input
::
Endpoint
(
path
)
=
&
in_opt
{
builder
.endpoint_id
(
path
.parse
()
.with_context
(||
path
.clone
())
?
);
builder
.endpoint_id
(
Some
(
path
.parse
()
.with_context
(||
path
.clone
())
?
));
let
distributed_runtime
=
dynamo_runtime
::
DistributedRuntime
::
from_settings
(
runtime
.clone
())
.await
?
;
rt
=
Either
::
Right
(
distributed_runtime
);
};
};
let
local_model
=
builder
.build
()
.await
?
;
let
local_model
=
builder
.build
()
.await
?
;
...
@@ -70,8 +76,7 @@ pub async fn run(
...
@@ -70,8 +76,7 @@ pub async fn run(
//
//
// Run in from an input
// Run in from an input
//
//
dynamo_llm
::
entrypoint
::
input
::
run_input
(
rt
,
in_opt
,
engine_config
)
.await
?
;
dynamo_llm
::
entrypoint
::
input
::
run_input
(
in_opt
,
runtime
,
engine_config
)
.await
?
;
// Allow engines to ask main thread to wait on an extra future.
// Allow engines to ask main thread to wait on an extra future.
// We use this to stop the vllm and sglang sub-process
// We use this to stop the vllm and sglang sub-process
...
...
lib/bindings/python/Cargo.lock
View file @
2bf27924
This diff is collapsed.
Click to expand it.
lib/bindings/python/Cargo.toml
View file @
2bf27924
...
@@ -36,21 +36,26 @@ crate-type = ["cdylib", "rlib"]
...
@@ -36,21 +36,26 @@ crate-type = ["cdylib", "rlib"]
[features]
[features]
default
=
[]
default
=
[]
block-manager
=
[
"dynamo-llm/block-manager"
,
"dep:dlpark"
]
block-manager
=
[
"dynamo-llm/block-manager"
,
"dep:dlpark"
]
mistralrs
=
["dep:dynamo-engine-mistralrs"]
llamacpp
=
["dep:dynamo-engine-llamacpp"]
[dependencies]
[dependencies]
dynamo-llm
=
{
path
=
"../../llm"
}
dynamo-llm
=
{
path
=
"../../llm"
}
dynamo-runtime
=
{
path
=
"../../runtime"
}
dynamo-runtime
=
{
path
=
"../../runtime"
}
dynamo-engine-mistralrs
=
{
path
=
"../../engines/mistralrs"
,
features
=
["cuda"]
,
optional
=
true
}
dynamo-engine-llamacpp
=
{
path
=
"../../engines/llamacpp"
,
features
=
[
"cuda"
,
"dynamic-link"
],
optional
=
true
}
anyhow
=
{
version
=
"1"
}
anyhow
=
{
version
=
"1"
}
async-openai
=
{
version
=
"0.29.0"
}
async-openai
=
{
version
=
"0.29.0"
}
async-stream
=
{
version
=
"0.3"
}
async-stream
=
{
version
=
"0.3"
}
async-trait
=
{
version
=
"0.1"
}
async-trait
=
{
version
=
"0.1"
}
either
=
{
version
=
"1.13"
,
features
=
["serde"]
}
futures
=
{
version
=
"0.3"
}
futures
=
{
version
=
"0.3"
}
once_cell
=
{
version
=
"1.20.3"
}
once_cell
=
{
version
=
"1.20.3"
}
serde
=
{
version
=
"1"
}
serde
=
{
version
=
"1"
}
serde_json
=
{
version
=
"1.0.138"
}
serde_json
=
{
version
=
"1.0.138"
}
thiserror
=
{
version
=
"2.0"
}
thiserror
=
{
version
=
"2.0"
}
tokio
=
{
version
=
"1"
,
features
=
["full"]
}
tokio
=
{
version
=
"1
.46.0
"
,
features
=
["full"]
}
tokio-stream
=
{
version
=
"0"
}
tokio-stream
=
{
version
=
"0"
}
tokio-util
=
{
version
=
"0.7"
}
tokio-util
=
{
version
=
"0.7"
}
tracing
=
{
version
=
"0"
}
tracing
=
{
version
=
"0"
}
...
...
lib/bindings/python/README.md
View file @
2bf27924
...
@@ -46,6 +46,26 @@ uv pip install maturin
...
@@ -46,6 +46,26 @@ uv pip install maturin
maturin develop --uv
maturin develop --uv
```
```
5.
Experimental: To allow using mistral.rs and llama.cpp via the bindings, build with feature flags:
```
maturin develop --features mistralrs,llamacpp --release
```
`--release`
is optional. It builds slower but the resulting library is significantly faster.
See
`examples/cli/cli.py`
for usage.
They will both be built for CUDA by default. If you see a runtime error
`CUDA_ERROR_STUB_LIBRARY`
this is because
the stub
`libcuda.so`
is earlier on the library search path than the real libcuda. Try removing the
`rpath`
from the library:
```
patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
```
If you include the
`llamacpp`
feature flag,
`libllama.so`
and
`libggml.so`
(and family) will need to be available at runtime.
## Run Examples
## Run Examples
### Prerequisite
### Prerequisite
...
...
lib/bindings/python/rust/lib.rs
View file @
2bf27924
...
@@ -63,6 +63,8 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
...
@@ -63,6 +63,8 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m
.add_function
(
wrap_pyfunction!
(
llm
::
kv
::
compute_block_hash_for_seq_py
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
llm
::
kv
::
compute_block_hash_for_seq_py
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
log_message
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
log_message
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
register_llm
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
register_llm
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
llm
::
entrypoint
::
make_engine
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
llm
::
entrypoint
::
run_input
,
m
)
?
)
?
;
m
.add_class
::
<
DistributedRuntime
>
()
?
;
m
.add_class
::
<
DistributedRuntime
>
()
?
;
m
.add_class
::
<
CancellationToken
>
()
?
;
m
.add_class
::
<
CancellationToken
>
()
?
;
...
@@ -73,6 +75,9 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
...
@@ -73,6 +75,9 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m
.add_class
::
<
EtcdClient
>
()
?
;
m
.add_class
::
<
EtcdClient
>
()
?
;
m
.add_class
::
<
AsyncResponseStream
>
()
?
;
m
.add_class
::
<
AsyncResponseStream
>
()
?
;
m
.add_class
::
<
llm
::
disagg_router
::
DisaggregatedRouter
>
()
?
;
m
.add_class
::
<
llm
::
disagg_router
::
DisaggregatedRouter
>
()
?
;
m
.add_class
::
<
llm
::
entrypoint
::
EntrypointArgs
>
()
?
;
m
.add_class
::
<
llm
::
entrypoint
::
EngineConfig
>
()
?
;
m
.add_class
::
<
llm
::
entrypoint
::
EngineType
>
()
?
;
m
.add_class
::
<
llm
::
kv
::
WorkerMetricsPublisher
>
()
?
;
m
.add_class
::
<
llm
::
kv
::
WorkerMetricsPublisher
>
()
?
;
m
.add_class
::
<
llm
::
model_card
::
ModelDeploymentCard
>
()
?
;
m
.add_class
::
<
llm
::
model_card
::
ModelDeploymentCard
>
()
?
;
m
.add_class
::
<
llm
::
preprocessor
::
OAIChatPreprocessor
>
()
?
;
m
.add_class
::
<
llm
::
preprocessor
::
OAIChatPreprocessor
>
()
?
;
...
...
lib/bindings/python/rust/llm.rs
View file @
2bf27924
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// This module provides a high-performance interface that bridges Python
/// This module provides a high-performance interface that bridges Python
/// applications with the Rust-powered Dynamo LLM runtime.
/// applications with the Rust-powered Dynamo LLM runtime.
...
@@ -41,6 +29,7 @@ use super::*;
...
@@ -41,6 +29,7 @@ use super::*;
pub
mod
backend
;
pub
mod
backend
;
pub
mod
block_manager
;
pub
mod
block_manager
;
pub
mod
disagg_router
;
pub
mod
disagg_router
;
pub
mod
entrypoint
;
pub
mod
kv
;
pub
mod
kv
;
pub
mod
model_card
;
pub
mod
model_card
;
pub
mod
nats
;
pub
mod
nats
;
...
...
lib/bindings/python/rust/llm/entrypoint.rs
0 → 100644
View file @
2bf27924
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
std
::
fmt
::
Display
;
use
std
::
path
::
PathBuf
;
use
pyo3
::{
exceptions
::
PyException
,
prelude
::
*
};
use
dynamo_llm
::
entrypoint
::
input
::
Input
;
use
dynamo_llm
::
entrypoint
::
EngineConfig
as
RsEngineConfig
;
use
dynamo_llm
::
local_model
::{
LocalModel
,
LocalModelBuilder
};
use
dynamo_runtime
::
protocols
::
Endpoint
as
EndpointId
;
/// Which engine `make_engine` should build. Exposed to Python as an
/// integer-comparable enum.
#[pyclass(eq, eq_int)]
#[derive(Clone, Debug, PartialEq)]
#[repr(i32)]
pub enum EngineType {
    /// Echo engine; built without any validation of the model.
    Echo = 1,
    /// mistral.rs engine. Only available when the bindings are built with the
    /// `mistralrs` feature.
    MistralRs = 2,
    /// llama.cpp engine. Only available when the bindings are built with the
    /// `llamacpp` feature.
    LlamaCpp = 3,
    /// No local engine; maps to the dynamic engine configuration.
    Dynamic = 4,
}
/// Arguments describing the engine to build: the engine type plus optional
/// model settings that are forwarded to `LocalModelBuilder` by `make_engine`.
///
/// Every field except `engine_type` is optional.
#[pyclass]
#[derive(Clone, Debug)]
pub(crate) struct EntrypointArgs {
    engine_type: EngineType,
    model_path: Option<PathBuf>,
    model_name: Option<String>,
    model_config: Option<PathBuf>,
    // Stored already parsed; the constructor accepts it as a string.
    endpoint_id: Option<EndpointId>,
    context_length: Option<u32>,
    // Passed to the builder as the request template.
    template_file: Option<PathBuf>,
    //router_config: Option<RouterConfig>,
    kv_cache_block_size: Option<u32>,
    http_port: Option<u16>,
}
#[pymethods]
impl EntrypointArgs {
    /// Python constructor: `EntrypointArgs(engine_type, model_path=None, ...)`.
    ///
    /// All arguments other than `engine_type` default to `None`. `endpoint_id`
    /// is given as a string and parsed into an `EndpointId`.
    ///
    /// # Errors
    ///
    /// Raises a Python `ValueError` if `endpoint_id` is present but cannot be
    /// parsed.
    // One parameter per Python keyword argument, so the count is inherent to the API.
    #[allow(clippy::too_many_arguments)]
    #[new]
    #[pyo3(signature = (engine_type, model_path=None, model_name=None, model_config=None, endpoint_id=None, context_length=None, template_file=None, kv_cache_block_size=None, http_port=None))]
    pub fn new(
        engine_type: EngineType,
        model_path: Option<PathBuf>,
        model_name: Option<String>,
        model_config: Option<PathBuf>,
        // e.g. "dyn://namespace.component.endpoint"
        endpoint_id: Option<String>,
        context_length: Option<u32>,
        template_file: Option<PathBuf>,
        //router_config: Option<RouterConfig>,
        kv_cache_block_size: Option<u32>,
        http_port: Option<u16>,
    ) -> PyResult<Self> {
        // Option<String> -> Option<Result<..>> -> Result<Option<..>>, so a bad
        // endpoint string surfaces as an error and an absent one stays `None`.
        let endpoint_id = endpoint_id
            .map(|eid| {
                eid.parse::<EndpointId>().map_err(|_| {
                    PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
                        "Invalid endpoint_id format: {eid}"
                    ))
                })
            })
            .transpose()?;

        Ok(EntrypointArgs {
            engine_type,
            model_path,
            model_name,
            model_config,
            endpoint_id,
            context_length,
            template_file,
            //router_config,
            kv_cache_block_size,
            http_port,
        })
    }
}
/// Python-visible wrapper around the Rust engine configuration
/// (`dynamo_llm::entrypoint::EngineConfig`). Produced by `make_engine` and
/// consumed by `run_input`.
#[pyclass]
#[derive(Clone)]
pub(crate) struct EngineConfig {
    inner: RsEngineConfig,
}
#[pyfunction]
#[pyo3(signature
=
(distributed_runtime,
args))]
pub
fn
make_engine
<
'p
>
(
py
:
Python
<
'p
>
,
distributed_runtime
:
super
::
DistributedRuntime
,
args
:
EntrypointArgs
,
)
->
PyResult
<
Bound
<
'p
,
PyAny
>>
{
let
mut
builder
=
LocalModelBuilder
::
default
();
builder
.model_path
(
args
.model_path
)
.model_name
(
args
.model_name
)
.model_config
(
args
.model_config
)
.endpoint_id
(
args
.endpoint_id
)
.context_length
(
args
.context_length
)
.request_template
(
args
.template_file
)
.kv_cache_block_size
(
args
.kv_cache_block_size
)
.http_port
(
args
.http_port
);
pyo3_async_runtimes
::
tokio
::
future_into_py
(
py
,
async
move
{
let
local_model
=
builder
.build
()
.await
.map_err
(
to_pyerr
)
?
;
let
inner
=
select_engine
(
distributed_runtime
,
args
.engine_type
,
local_model
)
.await
.map_err
(
to_pyerr
)
?
;
Ok
(
EngineConfig
{
inner
})
})
}
/// Turn an `EngineType` plus a built `LocalModel` into the Rust engine
/// configuration.
///
/// # Errors
///
/// Fails if the requested engine's own constructor fails, or if `MistralRs` /
/// `LlamaCpp` is requested but the bindings were built without the matching
/// Cargo feature.
async fn select_engine(
    // Only read by the `llamacpp` branch, hence unused in builds without that feature.
    #[allow(unused_variables)] distributed_runtime: super::DistributedRuntime,
    engine_type: EngineType,
    local_model: LocalModel,
) -> anyhow::Result<RsEngineConfig> {
    match engine_type {
        // There is no validation for the echo engine
        EngineType::Echo => Ok(RsEngineConfig::StaticFull {
            model: Box::new(local_model),
            engine: dynamo_llm::engines::make_engine_full(),
        }),
        EngineType::Dynamic => Ok(RsEngineConfig::Dynamic(Box::new(local_model))),
        EngineType::MistralRs => {
            #[cfg(feature = "mistralrs")]
            {
                // Build the engine while `local_model` is still borrowable, then move it.
                let engine = dynamo_engine_mistralrs::make_engine(&local_model).await?;
                Ok(RsEngineConfig::StaticFull {
                    engine,
                    model: Box::new(local_model),
                })
            }
            #[cfg(not(feature = "mistralrs"))]
            {
                anyhow::bail!(
                    "mistralrs engine is not enabled. Rebuild bindings with `--features mistralrs`"
                );
            }
        }
        EngineType::LlamaCpp => {
            #[cfg(feature = "llamacpp")]
            {
                // Same ordering constraint as above: borrow first, move afterwards.
                let engine = dynamo_engine_llamacpp::make_engine(
                    distributed_runtime.inner.primary_token(),
                    &local_model,
                )
                .await?;
                Ok(RsEngineConfig::StaticCore {
                    engine,
                    model: Box::new(local_model),
                })
            }
            #[cfg(not(feature = "llamacpp"))]
            {
                anyhow::bail!(
                    "llamacpp engine is not enabled. Rebuild bindings with `--features llamacpp`"
                );
            }
        }
    }
}
#[pyfunction]
#[pyo3(signature
=
(distributed_runtime,
input,
engine_config))]
pub
fn
run_input
<
'p
>
(
py
:
Python
<
'p
>
,
distributed_runtime
:
super
::
DistributedRuntime
,
input
:
&
str
,
engine_config
:
EngineConfig
,
)
->
PyResult
<
Bound
<
'p
,
PyAny
>>
{
let
input_enum
:
Input
=
input
.parse
()
.map_err
(
to_pyerr
)
?
;
pyo3_async_runtimes
::
tokio
::
future_into_py
(
py
,
async
move
{
dynamo_llm
::
entrypoint
::
input
::
run_input
(
either
::
Either
::
Right
(
distributed_runtime
.inner
.clone
()),
input_enum
,
engine_config
.inner
,
)
.await
.map_err
(
to_pyerr
)
?
;
Ok
(())
})
}
/// Convert any displayable error into a generic Python `Exception` whose
/// message is the error's `Display` output.
pub fn to_pyerr<E: Display>(err: E) -> PyErr {
    PyException::new_err(err.to_string())
}
lib/bindings/python/src/dynamo/_core.pyi
View file @
2bf27924
...
@@ -839,6 +839,18 @@ async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: st
...
@@ -839,6 +839,18 @@ async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: st
"""Attach the model at path to the given endpoint, and advertise it as model_type"""
"""Attach the model at path to the given endpoint, and advertise it as model_type"""
...
...
class EngineConfig:
    """Holds internal configuration for a Dynamo engine.

    Returned by `make_engine` and passed to `run_input`.
    """

    ...
async def make_engine(distributed_runtime: DistributedRuntime, args: EntrypointArgs) -> EngineConfig:
    """Make an engine matching the args.

    Args:
        distributed_runtime: The runtime the engine is created for.
        args: Engine type and optional model settings.

    Returns:
        The engine configuration to hand to `run_input`.
    """
    ...
async def run_input(distributed_runtime: DistributedRuntime, input: str, engine_config: EngineConfig) -> None:
    """Start an engine, connect it to an input, and run until stopped.

    Args:
        distributed_runtime: The runtime to run on.
        input: Name of the input to connect, e.g. "text".
        engine_config: The engine configuration returned by `make_engine`.
    """
    ...
class NatsQueue:
class NatsQueue:
"""
"""
A queue implementation using NATS JetStream for task distribution
A queue implementation using NATS JetStream for task distribution
...
@@ -1144,3 +1156,11 @@ class ZmqKvEventListener:
...
@@ -1144,3 +1156,11 @@ class ZmqKvEventListener:
ValueError: If events cannot be serialized to JSON
ValueError: If events cannot be serialized to JSON
"""
"""
...
...
class EntrypointArgs:
    """
    Settings to connect an input to a worker and run them.
    Used by `dynamo run`.
    """

    ...
lib/bindings/python/src/dynamo/llm/__init__.py
View file @
2bf27924
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# flake8: noqa
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
logging
...
@@ -24,6 +14,8 @@ except ImportError:
...
@@ -24,6 +14,8 @@ except ImportError:
from
dynamo._core
import
ApproxKvIndexer
as
ApproxKvIndexer
from
dynamo._core
import
ApproxKvIndexer
as
ApproxKvIndexer
from
dynamo._core
import
DisaggregatedRouter
as
DisaggregatedRouter
from
dynamo._core
import
DisaggregatedRouter
as
DisaggregatedRouter
from
dynamo._core
import
EngineType
from
dynamo._core
import
EntrypointArgs
as
EntrypointArgs
from
dynamo._core
import
ForwardPassMetrics
as
ForwardPassMetrics
from
dynamo._core
import
ForwardPassMetrics
as
ForwardPassMetrics
from
dynamo._core
import
HttpAsyncEngine
as
HttpAsyncEngine
from
dynamo._core
import
HttpAsyncEngine
as
HttpAsyncEngine
from
dynamo._core
import
HttpError
as
HttpError
from
dynamo._core
import
HttpError
as
HttpError
...
@@ -43,7 +35,9 @@ from dynamo._core import ZmqKvEventListener as ZmqKvEventListener
...
@@ -43,7 +35,9 @@ from dynamo._core import ZmqKvEventListener as ZmqKvEventListener
from
dynamo._core
import
ZmqKvEventPublisher
as
ZmqKvEventPublisher
from
dynamo._core
import
ZmqKvEventPublisher
as
ZmqKvEventPublisher
from
dynamo._core
import
ZmqKvEventPublisherConfig
as
ZmqKvEventPublisherConfig
from
dynamo._core
import
ZmqKvEventPublisherConfig
as
ZmqKvEventPublisherConfig
from
dynamo._core
import
compute_block_hash_for_seq_py
as
compute_block_hash_for_seq_py
from
dynamo._core
import
compute_block_hash_for_seq_py
as
compute_block_hash_for_seq_py
from
dynamo._core
import
make_engine
from
dynamo._core
import
register_llm
as
register_llm
from
dynamo._core
import
register_llm
as
register_llm
from
dynamo._core
import
run_input
try
:
try
:
from
dynamo.llm.tensorrtllm
import
(
# noqa: F401
from
dynamo.llm.tensorrtllm
import
(
# noqa: F401
...
...
lib/engines/llamacpp/Cargo.toml
View file @
2bf27924
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
[package]
name
=
"dynamo-engine-llamacpp"
name
=
"dynamo-engine-llamacpp"
...
@@ -30,6 +18,8 @@ cuda = ["llama-cpp-2/cuda"]
...
@@ -30,6 +18,8 @@ cuda = ["llama-cpp-2/cuda"]
metal
=
["llama-cpp-2/metal"]
metal
=
["llama-cpp-2/metal"]
vulkan
=
["llama-cpp-2/vulkan"]
vulkan
=
["llama-cpp-2/vulkan"]
openmp
=
["llama-cpp-2/openmp"]
openmp
=
["llama-cpp-2/openmp"]
# We cannot link libllama into a `.so`, so the bindings need this
dynamic-link
=
["llama-cpp-2/dynamic-link"]
[dependencies]
[dependencies]
dynamo-runtime
=
{
workspace
=
true
}
dynamo-runtime
=
{
workspace
=
true
}
...
...
lib/llm/src/entrypoint.rs
View file @
2bf27924
...
@@ -31,6 +31,7 @@ impl RouterConfig {
...
@@ -31,6 +31,7 @@ impl RouterConfig {
}
}
}
}
#[derive(Clone)]
pub
enum
EngineConfig
{
pub
enum
EngineConfig
{
/// Remote networked engines
/// Remote networked engines
Dynamic
(
Box
<
LocalModel
>
),
Dynamic
(
Box
<
LocalModel
>
),
...
...
lib/llm/src/entrypoint/input.rs
View file @
2bf27924
...
@@ -11,6 +11,7 @@ use std::{
...
@@ -11,6 +11,7 @@ use std::{
fmt
,
fmt
,
io
::{
IsTerminal
as
_
,
Read
as
_
},
io
::{
IsTerminal
as
_
,
Read
as
_
},
path
::
PathBuf
,
path
::
PathBuf
,
str
::
FromStr
,
};
};
pub
mod
batch
;
pub
mod
batch
;
...
@@ -19,7 +20,8 @@ pub mod endpoint;
...
@@ -19,7 +20,8 @@ pub mod endpoint;
pub
mod
http
;
pub
mod
http
;
pub
mod
text
;
pub
mod
text
;
use
dynamo_runtime
::{
protocols
::
ENDPOINT_SCHEME
,
DistributedRuntime
};
use
dynamo_runtime
::
protocols
::
ENDPOINT_SCHEME
;
use
either
::
Either
;
const
BATCH_PREFIX
:
&
str
=
"batch:"
;
const
BATCH_PREFIX
:
&
str
=
"batch:"
;
...
@@ -42,6 +44,14 @@ pub enum Input {
...
@@ -42,6 +44,14 @@ pub enum Input {
Batch
(
PathBuf
),
Batch
(
PathBuf
),
}
}
impl
FromStr
for
Input
{
type
Err
=
anyhow
::
Error
;
fn
from_str
(
s
:
&
str
)
->
Result
<
Self
,
Self
::
Err
>
{
Input
::
try_from
(
s
)
}
}
impl
TryFrom
<&
str
>
for
Input
{
impl
TryFrom
<&
str
>
for
Input
{
type
Error
=
anyhow
::
Error
;
type
Error
=
anyhow
::
Error
;
...
@@ -87,28 +97,36 @@ impl Default for Input {
...
@@ -87,28 +97,36 @@ impl Default for Input {
/// Run the given engine (EngineConfig) connected to an input.
/// Run the given engine (EngineConfig) connected to an input.
/// Does not return until the input exits.
/// Does not return until the input exits.
/// For Input::Endpoint pass a DistributedRuntime. For everything else pass either a Runtime or a
/// DistributedRuntime.
pub
async
fn
run_input
(
pub
async
fn
run_input
(
rt
:
Either
<
dynamo_runtime
::
Runtime
,
dynamo_runtime
::
DistributedRuntime
>
,
in_opt
:
Input
,
in_opt
:
Input
,
runtime
:
dynamo_runtime
::
Runtime
,
engine_config
:
super
::
EngineConfig
,
engine_config
:
super
::
EngineConfig
,
)
->
anyhow
::
Result
<
()
>
{
)
->
anyhow
::
Result
<
()
>
{
let
runtime
=
match
&
rt
{
Either
::
Left
(
rt
)
=>
rt
.clone
(),
Either
::
Right
(
drt
)
=>
drt
.runtime
()
.clone
(),
};
match
in_opt
{
match
in_opt
{
Input
::
Http
=>
{
Input
::
Http
=>
{
http
::
run
(
runtime
.clone
()
,
engine_config
)
.await
?
;
http
::
run
(
runtime
,
engine_config
)
.await
?
;
}
}
Input
::
Text
=>
{
Input
::
Text
=>
{
text
::
run
(
runtime
.clone
()
,
None
,
engine_config
)
.await
?
;
text
::
run
(
runtime
,
None
,
engine_config
)
.await
?
;
}
}
Input
::
Stdin
=>
{
Input
::
Stdin
=>
{
let
mut
prompt
=
String
::
new
();
let
mut
prompt
=
String
::
new
();
std
::
io
::
stdin
()
.read_to_string
(
&
mut
prompt
)
.unwrap
();
std
::
io
::
stdin
()
.read_to_string
(
&
mut
prompt
)
.unwrap
();
text
::
run
(
runtime
.clone
()
,
Some
(
prompt
),
engine_config
)
.await
?
;
text
::
run
(
runtime
,
Some
(
prompt
),
engine_config
)
.await
?
;
}
}
Input
::
Batch
(
path
)
=>
{
Input
::
Batch
(
path
)
=>
{
batch
::
run
(
runtime
.clone
()
,
path
,
engine_config
)
.await
?
;
batch
::
run
(
runtime
,
path
,
engine_config
)
.await
?
;
}
}
Input
::
Endpoint
(
path
)
=>
{
Input
::
Endpoint
(
path
)
=>
{
let
distributed_runtime
=
DistributedRuntime
::
from_settings
(
runtime
.clone
())
.await
?
;
let
Either
::
Right
(
distributed_runtime
)
=
rt
else
{
anyhow
::
bail!
(
"Input::Endpoint requires passing a DistributedRuntime"
);
};
endpoint
::
run
(
distributed_runtime
,
path
,
engine_config
)
.await
?
;
endpoint
::
run
(
distributed_runtime
,
path
,
engine_config
)
.await
?
;
}
}
}
}
...
...
lib/llm/src/local_model.rs
View file @
2bf27924
...
@@ -80,8 +80,8 @@ impl LocalModelBuilder {
...
@@ -80,8 +80,8 @@ impl LocalModelBuilder {
self
self
}
}
pub
fn
endpoint_id
(
&
mut
self
,
endpoint_id
:
EndpointId
)
->
&
mut
Self
{
pub
fn
endpoint_id
(
&
mut
self
,
endpoint_id
:
Option
<
EndpointId
>
)
->
&
mut
Self
{
self
.endpoint_id
=
Some
(
endpoint_id
)
;
self
.endpoint_id
=
endpoint_id
;
self
self
}
}
...
@@ -96,8 +96,9 @@ impl LocalModelBuilder {
...
@@ -96,8 +96,9 @@ impl LocalModelBuilder {
self
self
}
}
pub
fn
http_port
(
&
mut
self
,
port
:
u16
)
->
&
mut
Self
{
/// Passing None resets it to default
self
.http_port
=
port
;
pub
fn
http_port
(
&
mut
self
,
port
:
Option
<
u16
>
)
->
&
mut
Self
{
self
.http_port
=
port
.unwrap_or
(
DEFAULT_HTTP_PORT
);
self
self
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment