Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c78b5901
Unverified
Commit
c78b5901
authored
Oct 28, 2025
by
Graham King
Committed by
GitHub
Oct 28, 2025
Browse files
chore(runtime): Do not expose etcd lease ID (#3915)
Signed-off-by:
Graham King
<
grahamk@nvidia.com
>
parent
a79122c6
Changes
22
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
37 additions
and
176 deletions
+37
-176
components/src/dynamo/sglang/publisher.py
components/src/dynamo/sglang/publisher.py
+1
-3
components/src/dynamo/trtllm/main.py
components/src/dynamo/trtllm/main.py
+1
-1
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+1
-1
examples/multimodal/components/worker.py
examples/multimodal/components/worker.py
+1
-1
lib/bindings/python/rust/lib.rs
lib/bindings/python/rust/lib.rs
+3
-6
lib/bindings/python/rust/llm/block_manager.rs
lib/bindings/python/rust/llm/block_manager.rs
+1
-6
lib/bindings/python/rust/llm/kv.rs
lib/bindings/python/rust/llm/kv.rs
+1
-8
lib/bindings/python/src/dynamo/_core.pyi
lib/bindings/python/src/dynamo/_core.pyi
+2
-2
lib/llm/src/block_manager/controller.rs
lib/llm/src/block_manager/controller.rs
+1
-1
lib/llm/src/block_manager/distributed/worker.rs
lib/llm/src/block_manager/distributed/worker.rs
+1
-8
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+1
-7
lib/llm/src/kv_router/publisher.rs
lib/llm/src/kv_router/publisher.rs
+1
-9
lib/llm/src/mocker/engine.rs
lib/llm/src/mocker/engine.rs
+1
-6
lib/llm/tests/block_manager.rs
lib/llm/tests/block_manager.rs
+2
-2
lib/runtime/src/component.rs
lib/runtime/src/component.rs
+0
-1
lib/runtime/src/component/endpoint.rs
lib/runtime/src/component/endpoint.rs
+15
-46
lib/runtime/src/discovery.rs
lib/runtime/src/discovery.rs
+0
-39
lib/runtime/src/distributed.rs
lib/runtime/src/distributed.rs
+2
-26
lib/runtime/src/lib.rs
lib/runtime/src/lib.rs
+0
-1
lib/runtime/src/storage/key_value_store/etcd.rs
lib/runtime/src/storage/key_value_store/etcd.rs
+2
-2
No files found.
components/src/dynamo/sglang/publisher.py
View file @
c78b5901
...
...
@@ -123,10 +123,8 @@ class DynamoSglangPublisher:
ep
=
kv_events
.
get
(
"endpoint"
)
zmq_ep
=
ep
.
replace
(
"*"
,
get_local_ip_auto
())
if
ep
else
None
lease_id
=
self
.
generate_endpoint
.
lease_id
()
zmq_config
=
ZmqKvEventPublisherConfig
(
worker_id
=
lease_id
,
worker_id
=
self
.
generate_endpoint
.
connection_id
()
,
kv_block_size
=
self
.
server_args
.
page_size
,
zmq_endpoint
=
zmq_ep
,
)
...
...
components/src/dynamo/trtllm/main.py
View file @
c78b5901
...
...
@@ -421,7 +421,7 @@ async def init(runtime: DistributedRuntime, config: Config):
component
,
engine
,
kv_listener
,
int
(
endpoint
.
lease
_id
()),
int
(
endpoint
.
connection
_id
()),
config
.
kv_block_size
,
metrics_labels
,
)
as
publisher
:
...
...
components/src/dynamo/vllm/main.py
View file @
c78b5901
...
...
@@ -135,7 +135,7 @@ def setup_kv_event_publisher(
).
replace
(
"*"
,
"127.0.0.1"
)
zmq_config
=
ZmqKvEventPublisherConfig
(
worker_id
=
generate_endpoint
.
lease
_id
(),
worker_id
=
generate_endpoint
.
connection
_id
(),
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
zmq_endpoint
=
zmq_endpoint
,
)
...
...
examples/multimodal/components/worker.py
View file @
c78b5901
...
...
@@ -163,7 +163,7 @@ class VllmBaseWorker:
).
replace
(
"*"
,
"127.0.0.1"
)
zmq_config
=
ZmqKvEventPublisherConfig
(
worker_id
=
endpoint
.
lease
_id
(),
worker_id
=
endpoint
.
connection
_id
(),
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
zmq_endpoint
=
zmq_endpoint
,
)
...
...
lib/bindings/python/rust/lib.rs
View file @
c78b5901
...
...
@@ -773,12 +773,9 @@ impl Endpoint {
})
}
fn
lease_id
(
&
self
)
->
u64
{
self
.inner
.drt
()
.primary_lease
()
.map
(|
l
|
l
.id
())
.unwrap_or
(
0
)
// Opaque unique ID for this worker. May change over worker lifetime.
fn
connection_id
(
&
self
)
->
u64
{
self
.inner
.drt
()
.connection_id
()
}
/// Get a RuntimeMetrics helper for creating Prometheus metrics
...
...
lib/bindings/python/rust/llm/block_manager.rs
View file @
c78b5901
...
...
@@ -225,12 +225,7 @@ impl BlockManager {
self
._controller
=
Some
(
Arc
::
new
(
controller
));
let
instance_id
=
component
.inner
.drt
()
.primary_lease
()
.map
(|
lease
|
lease
.id
())
.ok_or_else
(||
to_pyerr
(
anyhow
::
anyhow!
(
"no instance id"
)))
?
;
let
instance_id
=
component
.inner
.drt
()
.connection_id
();
tracing
::
info!
(
"Dynamo KVBM Controller: {}.{}:{}"
,
...
...
lib/bindings/python/rust/llm/kv.rs
View file @
c78b5901
...
...
@@ -991,14 +991,7 @@ async fn create_kv_router_from_endpoint(
// Get component from endpoint
let
component
=
endpoint
.inner
.component
();
// Verify we're not in static mode
if
component
.drt
()
.primary_lease
()
.is_none
()
{
return
Err
(
PyErr
::
new
::
<
pyo3
::
exceptions
::
PyRuntimeError
,
_
>
(
"Failed to get primary lease: Cannot KV route static workers"
,
));
}
// Create ModelManager and use it to create KvRouter (ensures etcd registration)
// Create ModelManager and use it to create KvRouter (ensures registration)
let
model_manager
=
Arc
::
new
(
llm_rs
::
discovery
::
ModelManager
::
new
());
let
kv_router
=
model_manager
.kv_chooser_for
(
component
,
block_size
as
u32
,
kv_router_config
)
...
...
lib/bindings/python/src/dynamo/_core.pyi
View file @
c78b5901
...
...
@@ -157,9 +157,9 @@ class Endpoint:
"""
...
async def lease
_id(self) -> int:
def connection
_id(self) -> int:
"""
Return primary lease id. Currently,
can
not set a different lease id
.
Opaque unique ID for this worker. May
c
h
an
ge over worker lifetime
.
"""
...
...
...
lib/llm/src/block_manager/controller.rs
View file @
c78b5901
...
...
@@ -112,7 +112,7 @@ mod tests {
.await
.unwrap
();
let
worker_id
=
drt
.
primary_lease
()
.unwrap
()
.
id
();
let
worker_id
=
drt
.
connection_
id
();
let
block_manager
=
create_reference_block_manager_with_counts
(
8
,
16
,
0
)
.await
;
...
...
lib/llm/src/block_manager/distributed/worker.rs
View file @
c78b5901
...
...
@@ -677,14 +677,7 @@ impl KvbmWorker {
bytes_per_block
:
usize
,
)
->
anyhow
::
Result
<
()
>
{
let
drt
=
config
.drt
.clone
();
let
worker_id
=
drt
.primary_lease
()
.ok_or
(
anyhow
::
anyhow!
(
"unable to get primary lease; check that drt is not static"
))
?
.id
()
as
usize
;
let
worker_id
=
drt
.connection_id
()
as
usize
;
// Readiness gating for ping
let
state
=
Arc
::
new
(
WorkerState
::
new
());
...
...
lib/llm/src/kv_router.rs
View file @
c78b5901
...
...
@@ -224,13 +224,7 @@ impl KvRouter {
consumer_uuid
:
String
,
)
->
Result
<
Self
>
{
let
kv_router_config
=
kv_router_config
.unwrap_or_default
();
let
cancellation_token
=
component
.drt
()
.primary_lease
()
.expect
(
"Cannot KV route static workers"
)
.primary_token
();
let
cancellation_token
=
component
.drt
()
.primary_token
();
let
generate_endpoint
=
component
.endpoint
(
"generate"
);
let
client
=
generate_endpoint
.client
()
.await
?
;
...
...
lib/llm/src/kv_router/publisher.rs
View file @
c78b5901
...
...
@@ -784,15 +784,7 @@ impl WorkerMetricsPublisher {
}
pub
async
fn
create_endpoint
(
&
self
,
component
:
Component
)
->
Result
<
()
>
{
let
worker_id
=
component
.drt
()
.primary_lease
()
.map
(|
lease
|
lease
.id
())
.unwrap_or_else
(||
{
tracing
::
warn!
(
"Component is static, assuming worker_id of 0"
);
0
});
let
worker_id
=
component
.drt
()
.connection_id
();
self
.start_nats_metrics_publishing
(
component
.namespace
()
.clone
(),
worker_id
);
Ok
(())
}
...
...
lib/llm/src/mocker/engine.rs
View file @
c78b5901
...
...
@@ -246,12 +246,7 @@ impl MockVllmEngine {
tracing
::
debug!
(
"Component found for KV events publishing"
);
tracing
::
debug!
(
"Getting worker_id"
);
let
worker_id
=
comp
.drt
()
.primary_lease
()
.expect
(
"Cannot publish KV events without lease"
)
// ← This will PANIC on static!
.id
();
// let worker_id = 0;
let
worker_id
=
comp
.drt
()
.connection_id
();
tracing
::
debug!
(
"Worker_id set to: {worker_id}"
);
tracing
::
debug!
(
"Creating KV event publisher"
);
...
...
lib/llm/tests/block_manager.rs
View file @
c78b5901
...
...
@@ -216,7 +216,7 @@ pub mod llm_kvbm {
impl
DynamoKvbmRuntimeConfigBuilder
{
pub
fn
build
(
self
)
->
Result
<
kvbm
::
config
::
KvManagerRuntimeConfig
>
{
let
(
runtime
,
nixl
)
=
self
.build_internal
()
?
.dissolve
();
let
worker_id
=
runtime
.
primary_lease
()
.unwrap
()
.
id
();
let
worker_id
=
runtime
.
connection_
id
();
Ok
(
kvbm
::
config
::
KvManagerRuntimeConfig
::
builder
()
.worker_id
(
worker_id
)
.cancellation_token
(
runtime
.primary_token
()
.child_token
())
...
...
@@ -247,7 +247,7 @@ pub mod llm_kvbm {
impl
DynamoEventManager
{
pub
fn
new
(
component
:
Arc
<
KVBMDynamoRuntimeComponent
>
)
->
Self
{
let
(
tx
,
rx
)
=
mpsc
::
unbounded_channel
();
let
worker_id
=
component
.drt
()
.
primary_lease
()
.unwrap
()
.
id
();
let
worker_id
=
component
.drt
()
.
connection_
id
();
component
.drt
()
.runtime
()
.secondary
()
.spawn
(
async
move
{
worker_task
(
component
,
rx
)
.await
;
});
...
...
lib/runtime/src/component.rs
View file @
c78b5901
...
...
@@ -33,7 +33,6 @@ use std::fmt;
use
crate
::{
config
::
HealthStatus
,
discovery
::
Lease
,
metrics
::{
MetricsHierarchy
,
MetricsRegistry
,
prometheus_names
},
service
::
ServiceSet
,
transports
::
etcd
::{
ETCD_ROOT_PATH
,
EtcdPath
},
...
...
lib/runtime/src/component/endpoint.rs
View file @
c78b5901
...
...
@@ -15,12 +15,6 @@ pub struct EndpointConfig {
#[builder(private)]
endpoint
:
Endpoint
,
// todo: move lease to component/service
/// Lease
#[educe(Debug(ignore))]
#[builder(default)]
lease
:
Option
<
Lease
>
,
/// Endpoint handler
#[educe(Debug(ignore))]
handler
:
Arc
<
dyn
PushWorkHandler
>
,
...
...
@@ -61,19 +55,17 @@ impl EndpointConfigBuilder {
pub
async
fn
start
(
self
)
->
Result
<
()
>
{
let
(
endpoint
,
lease
,
handler
,
stats_handler
,
metrics_labels
,
graceful_shutdown
,
health_check_payload
,
)
=
self
.build_internal
()
?
.dissolve
();
let
lease
=
lease
.or
(
endpoint
.drt
()
.primary_lease
());
let
lease_id
=
lease
.as_ref
()
.map
(|
l
|
l
.id
())
.unwrap_or
(
0
);
let
connection_id
=
endpoint
.drt
()
.connection_id
();
tracing
::
debug!
(
"Starting endpoint: {}"
,
endpoint
.etcd_path_with_lease_id
(
lease
_id
)
endpoint
.etcd_path_with_lease_id
(
connection
_id
)
);
let
service_name
=
endpoint
.component
.service_name
();
...
...
@@ -107,25 +99,26 @@ impl EndpointConfigBuilder {
if
let
Some
(
stats_handler
)
=
stats_handler
{
handler_map
.lock
()
.insert
(
endpoint
.subject_to
(
lease
_id
),
stats_handler
);
.insert
(
endpoint
.subject_to
(
connection
_id
),
stats_handler
);
}
// creates an endpoint for the service
let
service_endpoint
=
group
.endpoint
(
&
endpoint
.name_with_id
(
lease
_id
))
.endpoint
(
&
endpoint
.name_with_id
(
connection
_id
))
.await
.map_err
(|
e
|
anyhow
::
anyhow!
(
"Failed to start endpoint: {e}"
))
?
;
// Create a token that responds to both runtime shutdown and lease expiration
let
runtime_shutdown_token
=
endpoint
.drt
()
.child_token
();
// This creates a child token of the runtime's endpoint_shutdown_token. That token is
// cancelled first as part of graceful shutdown. See Runtime::shutdown.
let
endpoint_shutdown_token
=
endpoint
.drt
()
.child_token
();
// Extract all values needed from endpoint before any spawns
let
namespace_name
=
endpoint
.component.namespace.name
.clone
();
let
component_name
=
endpoint
.component.name
.clone
();
let
endpoint_name
=
endpoint
.name
.clone
();
let
system_health
=
endpoint
.drt
()
.system_health
.clone
();
let
subject
=
endpoint
.subject_to
(
lease
_id
);
let
etcd_path
=
endpoint
.etcd_path_with_lease_id
(
lease
_id
);
let
subject
=
endpoint
.subject_to
(
connection
_id
);
let
etcd_path
=
endpoint
.etcd_path_with_lease_id
(
connection
_id
);
let
etcd_client
=
endpoint
.component.drt.etcd_client
.clone
();
// Register health check target in SystemHealth if provided
...
...
@@ -134,7 +127,7 @@ impl EndpointConfigBuilder {
component
:
component_name
.clone
(),
endpoint
:
endpoint_name
.clone
(),
namespace
:
namespace_name
.clone
(),
instance_id
:
lease
_id
,
instance_id
:
connection
_id
,
transport
:
TransportType
::
NatsTcp
(
subject
.clone
()),
};
tracing
::
debug!
(
endpoint_name
=
%
endpoint_name
,
"Registering endpoint health check target"
);
...
...
@@ -149,29 +142,6 @@ impl EndpointConfigBuilder {
}
}
let
cancel_token
=
if
let
Some
(
lease
)
=
lease
.as_ref
()
{
// Create a new token that will be cancelled when EITHER the lease expires OR runtime shutdown occurs
let
combined_token
=
CancellationToken
::
new
();
let
combined_for_select
=
combined_token
.clone
();
let
lease_token
=
lease
.child_token
();
// Use secondary runtime for this lightweight monitoring task
endpoint
.drt
()
.runtime
()
.secondary
()
.spawn
(
async
move
{
tokio
::
select!
{
_
=
lease_token
.cancelled
()
=>
{
tracing
::
trace!
(
"Lease cancelled, triggering endpoint shutdown"
);
}
_
=
runtime_shutdown_token
.cancelled
()
=>
{
tracing
::
trace!
(
"Runtime shutdown triggered, cancelling endpoint"
);
}
}
combined_for_select
.cancel
();
});
combined_token
}
else
{
// No lease, just use runtime shutdown token
runtime_shutdown_token
};
// Register with graceful shutdown tracker if needed
if
graceful_shutdown
{
tracing
::
debug!
(
...
...
@@ -186,12 +156,11 @@ impl EndpointConfigBuilder {
let
push_endpoint
=
PushEndpoint
::
builder
()
.service_handler
(
handler
)
.cancellation_token
(
cancel
_token
.clone
())
.cancellation_token
(
endpoint_shutdown
_token
.clone
())
.graceful_shutdown
(
graceful_shutdown
)
.build
()
.map_err
(|
e
|
anyhow
::
anyhow!
(
"Failed to build push endpoint: {e}"
))
?
;
// launch in primary runtime
let
tracker_clone
=
if
graceful_shutdown
{
Some
(
endpoint
.drt
()
.graceful_shutdown_tracker
())
}
else
{
...
...
@@ -210,7 +179,7 @@ impl EndpointConfigBuilder {
namespace_name_for_task
,
component_name_for_task
,
endpoint_name_for_task
,
lease
_id
,
connection
_id
,
system_health
,
)
.await
;
...
...
@@ -231,7 +200,7 @@ impl EndpointConfigBuilder {
component
:
component_name
.clone
(),
endpoint
:
endpoint_name
.clone
(),
namespace
:
namespace_name
.clone
(),
instance_id
:
lease
_id
,
instance_id
:
connection
_id
,
transport
:
TransportType
::
NatsTcp
(
subject
),
};
...
...
@@ -239,7 +208,7 @@ impl EndpointConfigBuilder {
if
let
Some
(
etcd_client
)
=
&
etcd_client
&&
let
Err
(
e
)
=
etcd_client
.kv_create
(
&
etcd_path
,
info
,
Some
(
lease
_id
))
.kv_create
(
&
etcd_path
,
info
,
Some
(
connection
_id
))
.await
{
tracing
::
error!
(
...
...
@@ -248,7 +217,7 @@ impl EndpointConfigBuilder {
error
=
%
e
,
"Unable to register service for discovery"
);
cancel
_token
.cancel
();
endpoint_shutdown
_token
.cancel
();
return
Err
(
error!
(
"Unable to register service for discovery. Check discovery service status"
));
...
...
lib/runtime/src/discovery.rs
deleted
100644 → 0
View file @
a79122c6
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
crate
::{
Result
,
transports
::
etcd
};
pub
use
etcd
::
Lease
;
pub
struct
DiscoveryClient
{
namespace
:
String
,
etcd_client
:
etcd
::
Client
,
}
impl
DiscoveryClient
{
/// Create a new [`DiscoveryClient`]
///
/// This will establish a connection to the etcd server, create a primary lease,
/// and spawn a task to keep the lease alive and tie the lifetime of the [`Runtime`]
/// to the lease.
///
/// If the lease expires, the [`Runtime`] will be shutdown.
/// If the [`Runtime`] is shutdown, the lease will be revoked.
pub
(
crate
)
fn
new
(
namespace
:
String
,
etcd_client
:
etcd
::
Client
)
->
Self
{
DiscoveryClient
{
namespace
,
etcd_client
,
}
}
/// Get the primary lease ID
pub
fn
primary_lease_id
(
&
self
)
->
u64
{
self
.etcd_client
.lease_id
()
}
/// Create a [`Lease`] with a given time-to-live (TTL).
/// This [`Lease`] will be tied to the [`crate::Runtime`], but has its own independent [`crate::CancellationToken`].
pub
async
fn
create_lease
(
&
self
,
ttl
:
u64
)
->
Result
<
Lease
>
{
self
.etcd_client
.create_lease
(
ttl
)
.await
}
}
lib/runtime/src/distributed.rs
View file @
c78b5901
...
...
@@ -9,7 +9,6 @@ use crate::transports::nats::DRTNatsClientPrometheusMetrics;
use
crate
::{
ErrorContext
,
component
::{
self
,
ComponentBuilder
,
Endpoint
,
InstanceSource
,
Namespace
},
discovery
::
DiscoveryClient
,
metrics
::
PrometheusUpdateCallback
,
metrics
::{
MetricsHierarchy
,
MetricsRegistry
},
service
::
ServiceClient
,
...
...
@@ -211,10 +210,8 @@ impl DistributedRuntime {
self
.runtime
.primary_token
()
}
/// The etcd lease all our components will be attached to.
/// Not available for static workers.
pub
fn
primary_lease
(
&
self
)
->
Option
<
etcd
::
Lease
>
{
self
.etcd_client
.as_ref
()
.map
(|
c
|
c
.primary_lease
())
pub
fn
connection_id
(
&
self
)
->
u64
{
self
.store
.connection_id
()
}
pub
fn
shutdown
(
&
self
)
{
...
...
@@ -226,27 +223,6 @@ impl DistributedRuntime {
Namespace
::
new
(
self
.clone
(),
name
.into
(),
self
.is_static
)
}
// /// Create a [`Component`]
// pub fn component(
// &self,
// name: impl Into<String>,
// namespace: impl Into<String>,
// ) -> Result<Component> {
// Ok(ComponentBuilder::from_runtime(self.clone())
// .name(name.into())
// .namespace(namespace.into())
// .build()?)
// }
pub
(
crate
)
fn
discovery_client
(
&
self
,
namespace
:
impl
Into
<
String
>
)
->
DiscoveryClient
{
DiscoveryClient
::
new
(
namespace
.into
(),
self
.etcd_client
.clone
()
.expect
(
"Attempt to get discovery_client on static DistributedRuntime"
),
)
}
pub
(
crate
)
fn
service_client
(
&
self
)
->
Option
<
ServiceClient
>
{
self
.nats_client
()
.map
(|
nc
|
ServiceClient
::
new
(
nc
.clone
()))
}
...
...
lib/runtime/src/lib.rs
View file @
c78b5901
...
...
@@ -22,7 +22,6 @@ pub use config::RuntimeConfig;
pub
mod
component
;
pub
mod
compute
;
pub
mod
discovery
;
pub
mod
engine
;
pub
mod
health_check
;
pub
mod
system_status_server
;
...
...
lib/runtime/src/storage/key_value_store/etcd.rs
View file @
c78b5901
...
...
@@ -174,7 +174,7 @@ impl EtcdBucket {
tracing
::
trace!
(
"etcd create: {k}"
);
// Use atomic transaction to check and create in one operation
let
put_options
=
PutOptions
::
new
()
.with_lease
(
self
.client
.
primary_
lease
()
.
id
()
as
i64
);
let
put_options
=
PutOptions
::
new
()
.with_lease
(
self
.client
.lease
_
id
()
as
i64
);
// Build transaction that creates key only if it doesn't exist
let
txn
=
Txn
::
new
()
...
...
@@ -243,7 +243,7 @@ impl EtcdBucket {
}
let
put_options
=
PutOptions
::
new
()
.with_lease
(
self
.client
.
primary_
lease
()
.
id
()
as
i64
)
.with_lease
(
self
.client
.lease
_
id
()
as
i64
)
.with_prev_key
();
let
mut
put_resp
=
self
.client
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment