Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
6ffd20a8
Unverified
Commit
6ffd20a8
authored
Sep 30, 2025
by
Graham King
Committed by
GitHub
Sep 30, 2025
Browse files
chore: Move model_input, model_type from ModelEntry to ModelDeploymentCard (#3292)
Signed-off-by:
Graham King
<
grahamk@nvidia.com
>
parent
50dfd3af
Changes
16
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
219 additions
and
714 deletions
+219
-714
lib/llm/src/discovery/model_entry.rs
lib/llm/src/discovery/model_entry.rs
+1
-16
lib/llm/src/discovery/model_manager.rs
lib/llm/src/discovery/model_manager.rs
+14
-15
lib/llm/src/discovery/watcher.rs
lib/llm/src/discovery/watcher.rs
+36
-46
lib/llm/src/entrypoint/input/http.rs
lib/llm/src/entrypoint/input/http.rs
+14
-48
lib/llm/src/grpc/service/kserve.rs
lib/llm/src/grpc/service/kserve.rs
+14
-16
lib/llm/src/http/service/health.rs
lib/llm/src/http/service/health.rs
+4
-3
lib/llm/src/http/service/metrics.rs
lib/llm/src/http/service/metrics.rs
+19
-55
lib/llm/src/local_model.rs
lib/llm/src/local_model.rs
+2
-2
lib/llm/src/model_card.rs
lib/llm/src/model_card.rs
+24
-32
lib/llm/src/model_type.rs
lib/llm/src/model_type.rs
+3
-2
lib/llm/tests/http_metrics.rs
lib/llm/tests/http_metrics.rs
+3
-4
lib/llm/tests/http_namespace_integration.rs
lib/llm/tests/http_namespace_integration.rs
+30
-174
lib/llm/tests/kserve_service.rs
lib/llm/tests/kserve_service.rs
+36
-82
lib/llm/tests/namespace.rs
lib/llm/tests/namespace.rs
+0
-219
lib/runtime/examples/Cargo.lock
lib/runtime/examples/Cargo.lock
+12
-0
lib/runtime/src/component.rs
lib/runtime/src/component.rs
+7
-0
No files found.
lib/llm/src/discovery/model_entry.rs
View file @
6ffd20a8
...
...
@@ -4,10 +4,7 @@
use
dynamo_runtime
::{
protocols
,
slug
::
Slug
};
use
serde
::{
Deserialize
,
Serialize
};
use
crate
::{
local_model
::
runtime_config
::
ModelRuntimeConfig
,
model_type
::{
ModelInput
,
ModelType
},
};
use
crate
::
local_model
::
runtime_config
::
ModelRuntimeConfig
;
/// [ModelEntry] contains the information to discover models
#[derive(Debug,
Clone,
Serialize,
Deserialize,
Eq,
PartialEq)]
...
...
@@ -20,17 +17,9 @@ pub struct ModelEntry {
#[serde(rename
=
"endpoint"
)]
pub
endpoint_id
:
protocols
::
EndpointId
,
/// Specifies whether the model is a chat, completions, etc model.
pub
model_type
:
ModelType
,
/// Runtime configuration specific to this model instance
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
runtime_config
:
Option
<
ModelRuntimeConfig
>
,
/// Specifies the model input type.
/// `Tokens` for engines that expect pre-processed input.
/// `Text` for engines that take care of pre-processing themselves.
pub
model_input
:
ModelInput
,
}
impl
ModelEntry
{
...
...
@@ -38,8 +27,4 @@ impl ModelEntry {
pub
fn
slug
(
&
self
)
->
Slug
{
Slug
::
from_string
(
&
self
.name
)
}
pub
fn
requires_preprocessing
(
&
self
)
->
bool
{
matches!
(
self
.model_input
,
ModelInput
::
Tokens
)
}
}
lib/llm/src/discovery/model_manager.rs
View file @
6ffd20a8
...
...
@@ -11,8 +11,8 @@ use parking_lot::{Mutex, RwLock};
use
dynamo_runtime
::
component
::
Component
;
use
dynamo_runtime
::
prelude
::
DistributedRuntimeProvider
;
use
crate
::
discovery
::{
KV_ROUTERS_ROOT_PATH
,
ModelEntry
};
use
crate
::
kv_router
::{
KvRouterConfig
,
scheduler
::
DefaultWorkerSelector
};
use
crate
::{
discovery
::
KV_ROUTERS_ROOT_PATH
,
model_card
::
ModelDeploymentCard
};
use
crate
::{
kv_router
::
KvRouter
,
types
::
generic
::
tensor
::
TensorStreamingEngine
,
...
...
@@ -40,7 +40,7 @@ pub struct ModelManager {
tensor_engines
:
RwLock
<
ModelEngines
<
TensorStreamingEngine
>>
,
// These two are Mutex because we read and write rarely and equally
entrie
s
:
Mutex
<
HashMap
<
String
,
Model
Entry
>>
,
card
s
:
Mutex
<
HashMap
<
String
,
Model
DeploymentCard
>>
,
kv_choosers
:
Mutex
<
HashMap
<
String
,
Arc
<
KvRouter
>>>
,
}
...
...
@@ -57,13 +57,13 @@ impl ModelManager {
chat_completion_engines
:
RwLock
::
new
(
ModelEngines
::
default
()),
embeddings_engines
:
RwLock
::
new
(
ModelEngines
::
default
()),
tensor_engines
:
RwLock
::
new
(
ModelEngines
::
default
()),
entrie
s
:
Mutex
::
new
(
HashMap
::
new
()),
card
s
:
Mutex
::
new
(
HashMap
::
new
()),
kv_choosers
:
Mutex
::
new
(
HashMap
::
new
()),
}
}
pub
fn
get_model_
entrie
s
(
&
self
)
->
Vec
<
Model
Entry
>
{
self
.
entrie
s
.lock
()
.values
()
.cloned
()
.collect
()
pub
fn
get_model_
card
s
(
&
self
)
->
Vec
<
Model
DeploymentCard
>
{
self
.
card
s
.lock
()
.values
()
.cloned
()
.collect
()
}
pub
fn
has_model_any
(
&
self
,
model
:
&
str
)
->
bool
{
...
...
@@ -196,15 +196,15 @@ impl ModelManager {
.ok_or
(
ModelManagerError
::
ModelNotFound
(
model
.to_string
()))
}
/// Save a Model
Entry under
an instance's etcd `models/` key so we can fetch it later when the key is
/// Save a Model
DeploymentCard from
an instance's etcd `models/` key so we can fetch it later when the key is
/// deleted from etcd.
pub
fn
save_model_
entry
(
&
self
,
key
:
&
str
,
entry
:
Model
Entry
)
{
self
.
entrie
s
.lock
()
.insert
(
key
.to_string
(),
entry
);
pub
fn
save_model_
card
(
&
self
,
key
:
&
str
,
entry
:
Model
DeploymentCard
)
{
self
.
card
s
.lock
()
.insert
(
key
.to_string
(),
entry
);
}
/// Remove and return model
entry
for this instance's etcd key. We do this when the instance stops.
pub
fn
remove_model_
entry
(
&
self
,
key
:
&
str
)
->
Option
<
Model
Entry
>
{
self
.
entrie
s
.lock
()
.remove
(
key
)
/// Remove and return model
card
for this instance's etcd key. We do this when the instance stops.
pub
fn
remove_model_
card
(
&
self
,
key
:
&
str
)
->
Option
<
Model
DeploymentCard
>
{
self
.
card
s
.lock
()
.remove
(
key
)
}
pub
async
fn
kv_chooser_for
(
...
...
@@ -279,12 +279,11 @@ impl ModelManager {
}
pub
fn
get_model_tool_call_parser
(
&
self
,
model
:
&
str
)
->
Option
<
String
>
{
self
.
entrie
s
self
.
card
s
.lock
()
.values
()
.find
(|
entry
|
entry
.name
==
model
)
.and_then
(|
entry
|
entry
.runtime_config
.as_ref
())
.and_then
(|
config
|
config
.tool_call_parser
.clone
())
.find
(|
c
|
c
.display_name
==
model
)
.and_then
(|
c
|
c
.runtime_config.tool_call_parser
.as_ref
())
.map
(|
parser
|
parser
.to_string
())
}
...
...
lib/llm/src/discovery/watcher.rs
View file @
6ffd20a8
...
...
@@ -40,10 +40,10 @@ use crate::{
use
super
::{
MODEL_ROOT_PATH
,
ModelEntry
,
ModelManager
};
use
crate
::
namespace
::
is_global_namespace
;
#[derive(Debug,
Clone
,
Copy,
PartialEq
)]
#[derive(Debug,
Clone)]
pub
enum
ModelUpdate
{
Added
(
Model
Type
),
Removed
(
Model
Type
),
Added
(
Model
DeploymentCard
),
Removed
(
Model
DeploymentCard
),
}
pub
struct
ModelWatcher
{
...
...
@@ -140,25 +140,8 @@ impl ModelWatcher {
continue
;
}
};
self
.manager
.save_model_entry
(
key
,
model_entry
.clone
());
if
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Added
(
model_entry
.model_type
))
.await
.ok
();
}
if
self
.manager
.has_model_any
(
&
model_entry
.name
)
{
tracing
::
trace!
(
name
=
model_entry
.name
,
namespace
=
model_entry
.endpoint_id.namespace
,
"New endpoint for existing model"
);
self
.notify_on_model
.notify_waiters
();
continue
;
}
match
self
.handle_put
(
&
model_entry
)
.await
{
match
self
.handle_put
(
key
,
&
model_entry
)
.await
{
Ok
(())
=>
{
tracing
::
info!
(
model_name
=
model_entry
.name
,
...
...
@@ -196,13 +179,13 @@ impl ModelWatcher {
/// Returns the name of the model we just deleted, if any.
async
fn
handle_delete
(
&
self
,
kv
:
&
KeyValue
)
->
anyhow
::
Result
<
Option
<
String
>>
{
let
key
=
kv
.key_str
()
?
;
let
model_entry
=
match
self
.manager
.remove_model_
entry
(
key
)
{
Some
(
entry
)
=>
entry
,
let
card
=
match
self
.manager
.remove_model_
card
(
key
)
{
Some
(
card
)
=>
card
,
None
=>
{
anyhow
::
bail!
(
"Missing Model
Entry
for {key}"
);
anyhow
::
bail!
(
"Missing Model
DeploymentCard
for {key}"
);
}
};
let
model_name
=
model_entry
.name
;
let
model_name
=
card
.display_name
.clone
()
;
let
active_instances
=
self
.entries_for_model
(
&
model_name
)
.await
...
...
@@ -257,7 +240,7 @@ impl ModelWatcher {
||
(
tensor_model_removed
&&
*
model_type
==
ModelType
::
TensorBased
))
&&
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Removed
(
*
model_type
))
.await
.ok
();
tx
.send
(
ModelUpdate
::
Removed
(
card
.clone
()
))
.await
.ok
();
}
}
}
...
...
@@ -267,7 +250,7 @@ impl ModelWatcher {
// Handles a PUT event from etcd, this usually means adding a new model to the list of served
// models.
async
fn
handle_put
(
&
self
,
model_entry
:
&
ModelEntry
)
->
anyhow
::
Result
<
()
>
{
async
fn
handle_put
(
&
self
,
key
:
&
str
,
model_entry
:
&
ModelEntry
)
->
anyhow
::
Result
<
()
>
{
let
endpoint_id
=
&
model_entry
.endpoint_id
;
let
component
=
self
.drt
...
...
@@ -294,9 +277,24 @@ impl ModelWatcher {
}
};
if
model_entry
.model_input
==
ModelInput
::
Tokens
&&
(
model_entry
.model_type
.supports_chat
()
||
model_entry
.model_type
.supports_completions
())
self
.manager
.save_model_card
(
key
,
card
.clone
());
if
self
.manager
.has_model_any
(
&
model_entry
.name
)
{
tracing
::
trace!
(
name
=
model_entry
.name
,
namespace
=
model_entry
.endpoint_id.namespace
,
"New endpoint for existing model"
);
self
.notify_on_model
.notify_waiters
();
return
Ok
(());
}
if
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Added
(
card
.clone
()))
.await
.ok
();
}
if
card
.model_input
==
ModelInput
::
Tokens
&&
(
card
.model_type
.supports_chat
()
||
card
.model_type
.supports_completions
())
{
// Case 1: Tokens + (Chat OR Completions OR Both)
// A model that expects pre-processed requests meaning it's up to us whether we
...
...
@@ -321,7 +319,7 @@ impl ModelWatcher {
let
tokenizer_hf
=
card
.tokenizer_hf
()
.context
(
"tokenizer_hf"
)
?
;
// Add chat engine only if the model supports chat
if
model_entry
.model_type
.supports_chat
()
{
if
card
.model_type
.supports_chat
()
{
let
chat_engine
=
entrypoint
::
build_routed_pipeline
::
<
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
...
...
@@ -342,7 +340,7 @@ impl ModelWatcher {
}
// Add completions engine only if the model supports completions
if
model_entry
.model_type
.supports_completions
()
{
if
card
.model_type
.supports_completions
()
{
let
formatter
=
PromptFormatter
::
no_op
();
let
PromptFormatter
::
OAI
(
formatter
)
=
formatter
;
let
preprocessor
=
OpenAIPreprocessor
::
new_with_parts
(
...
...
@@ -370,9 +368,7 @@ impl ModelWatcher {
.context
(
"add_completions_model"
)
?
;
tracing
::
info!
(
"Completions is ready"
);
}
}
else
if
model_entry
.model_input
==
ModelInput
::
Text
&&
model_entry
.model_type
.supports_chat
()
{
}
else
if
card
.model_input
==
ModelInput
::
Text
&&
card
.model_type
.supports_chat
()
{
// Case 3: Text + Chat
let
push_router
=
PushRouter
::
<
NvCreateChatCompletionRequest
,
...
...
@@ -384,9 +380,7 @@ impl ModelWatcher {
let
engine
=
Arc
::
new
(
push_router
);
self
.manager
.add_chat_completions_model
(
&
model_entry
.name
,
engine
)
?
;
}
else
if
model_entry
.model_input
==
ModelInput
::
Text
&&
model_entry
.model_type
.supports_completions
()
{
}
else
if
card
.model_input
==
ModelInput
::
Text
&&
card
.model_type
.supports_completions
()
{
// Case 2: Text + Completions
let
push_router
=
PushRouter
::
<
NvCreateCompletionRequest
,
...
...
@@ -398,9 +392,7 @@ impl ModelWatcher {
let
engine
=
Arc
::
new
(
push_router
);
self
.manager
.add_completions_model
(
&
model_entry
.name
,
engine
)
?
;
}
else
if
model_entry
.model_input
==
ModelInput
::
Tokens
&&
model_entry
.model_type
.supports_embedding
()
{
}
else
if
card
.model_input
==
ModelInput
::
Tokens
&&
card
.model_type
.supports_embedding
()
{
// Case 4: Tokens + Embeddings
// Create preprocessing pipeline similar to Backend
...
...
@@ -434,9 +426,7 @@ impl ModelWatcher {
self
.manager
.add_embeddings_model
(
&
model_entry
.name
,
embedding_engine
)
?
;
}
else
if
model_entry
.model_input
==
ModelInput
::
Tensor
&&
model_entry
.model_type
.supports_tensor
()
{
}
else
if
card
.model_input
==
ModelInput
::
Tensor
&&
card
.model_type
.supports_tensor
()
{
// Case 5: Tensor + Tensor (non-LLM)
let
push_router
=
PushRouter
::
<
NvCreateTensorRequest
,
...
...
@@ -452,8 +442,8 @@ impl ModelWatcher {
anyhow
::
bail!
(
"Unsupported model configuration: {} with {} input. Supported combinations:
\
Tokens+(Chat|Completions), Text+Chat, Text+Completions, Tokens+Embeddings, Tensor+TensorBased"
,
model_entry
.model_type
,
model_entry
.model_input
.as_str
()
card
.model_type
,
card
.model_input
.as_str
()
);
}
...
...
lib/llm/src/entrypoint/input/http.rs
View file @
6ffd20a8
...
...
@@ -220,9 +220,6 @@ async fn run_watcher(
http_service
:
Arc
<
HttpService
>
,
metrics
:
Arc
<
crate
::
http
::
service
::
metrics
::
Metrics
>
,
)
->
anyhow
::
Result
<
()
>
{
// Clone model_manager before it's moved into ModelWatcher
let
model_manager_clone
=
model_manager
.clone
();
let
mut
watch_obj
=
ModelWatcher
::
new
(
runtime
,
model_manager
,
...
...
@@ -241,20 +238,12 @@ async fn run_watcher(
// Spawn a task to watch for model type changes and update HTTP service endpoints and metrics
let
_
endpoint_enabler_task
=
tokio
::
spawn
(
async
move
{
while
let
Some
(
model_type
)
=
rx
.recv
()
.await
{
tracing
::
debug!
(
"Received model type update: {:?}"
,
model_type
);
while
let
Some
(
model_update
)
=
rx
.recv
()
.await
{
// Update HTTP endpoints (existing functionality)
update_http_endpoints
(
http_service
.clone
(),
model_
type
);
update_http_endpoints
(
http_service
.clone
(),
model_
update
.clone
()
);
// Update metrics (only for added models)
update_model_metrics
(
model_type
,
model_manager_clone
.clone
(),
metrics
.clone
(),
Some
(
etcd_client
.clone
()),
)
.await
;
update_model_metrics
(
model_update
,
metrics
.clone
());
}
});
...
...
@@ -273,15 +262,15 @@ fn update_http_endpoints(service: Arc<HttpService>, model_type: ModelUpdate) {
model_type
);
match
model_type
{
ModelUpdate
::
Added
(
model_type
)
=>
{
ModelUpdate
::
Added
(
card
)
=>
{
// Handle all supported endpoint types, not just the first one
for
endpoint_type
in
model_type
.as_endpoint_types
()
{
for
endpoint_type
in
card
.
model_type
.as_endpoint_types
()
{
service
.enable_model_endpoint
(
endpoint_type
,
true
);
}
}
ModelUpdate
::
Removed
(
model_type
)
=>
{
ModelUpdate
::
Removed
(
card
)
=>
{
// Handle all supported endpoint types, not just the first one
for
endpoint_type
in
model_type
.as_endpoint_types
()
{
for
endpoint_type
in
card
.
model_type
.as_endpoint_types
()
{
service
.enable_model_endpoint
(
endpoint_type
,
false
);
}
}
...
...
@@ -289,42 +278,19 @@ fn update_http_endpoints(service: Arc<HttpService>, model_type: ModelUpdate) {
}
/// Updates metrics for model type changes
async
fn
update_model_metrics
(
fn
update_model_metrics
(
model_type
:
ModelUpdate
,
model_manager
:
Arc
<
ModelManager
>
,
metrics
:
Arc
<
crate
::
http
::
service
::
metrics
::
Metrics
>
,
etcd_client
:
Option
<
etcd
::
Client
>
,
)
{
match
model_type
{
ModelUpdate
::
Added
(
model_type
)
=>
{
tracing
::
debug!
(
"Updating metrics for added model type: {:?}"
,
model_type
);
// Get all model entries and update metrics for matching types
let
model_entries
=
model_manager
.get_model_entries
();
for
entry
in
model_entries
{
if
entry
.model_type
==
model_type
{
// Update runtime config metrics if available
if
let
Some
(
runtime_config
)
=
&
entry
.runtime_config
{
metrics
.update_runtime_config_metrics
(
&
entry
.name
,
runtime_config
);
}
// Update MDC metrics if etcd is available
if
let
Some
(
ref
etcd
)
=
etcd_client
&&
let
Err
(
e
)
=
metrics
.update_metrics_from_model_entry_with_mdc
(
&
entry
,
etcd
)
.await
{
tracing
::
debug!
(
model
=
%
entry
.name
,
error
=
%
e
,
"Failed to update MDC metrics for newly added model"
);
}
}
ModelUpdate
::
Added
(
card
)
=>
{
tracing
::
debug!
(
"Updating metrics for added model: {}"
,
card
.display_name
);
if
let
Err
(
err
)
=
metrics
.update_metrics_from_mdc
(
&
card
)
{
tracing
::
warn!
(
%
err
,
model_name
=
card
.display_name
,
"update_metrics_from_mdc failed"
);
}
}
ModelUpdate
::
Removed
(
model_type
)
=>
{
tracing
::
debug!
(
"M
odel
type removed: {:?}"
,
model_type
);
ModelUpdate
::
Removed
(
card
)
=>
{
tracing
::
debug!
(
m
odel
_name
=
card
.display_name
,
"Model removed"
);
// Note: Metrics are typically not removed to preserve historical data
// This matches the behavior in the polling task
}
...
...
lib/llm/src/grpc/service/kserve.rs
View file @
6ffd20a8
...
...
@@ -397,15 +397,14 @@ impl GrpcInferenceService for KserveService {
&
self
,
request
:
Request
<
ModelMetadataRequest
>
,
)
->
Result
<
Response
<
ModelMetadataResponse
>
,
Status
>
{
let
entrie
s
=
self
.state
.manager
()
.get_model_
entrie
s
();
let
card
s
=
self
.state
.manager
()
.get_model_
card
s
();
let
request_model_name
=
&
request
.into_inner
()
.name
;
if
let
Some
(
entry
)
=
entrie
s
if
let
Some
(
card
)
=
card
s
.into_iter
()
.find
(|
entry
|
request_model_name
==
&
entry
.
name
)
.find
(|
card
|
request_model_name
==
&
card
.display_
name
)
{
if
entry
.model_type
.supports_tensor
()
{
if
let
Some
(
config
)
=
entry
.runtime_config
.as_ref
()
&&
let
Some
(
tensor_model_config
)
=
config
.tensor_model_config
.as_ref
()
if
card
.model_type
.supports_tensor
()
{
if
let
Some
(
tensor_model_config
)
=
card
.runtime_config.tensor_model_config
.as_ref
()
{
return
Ok
(
Response
::
new
(
ModelMetadataResponse
{
name
:
tensor_model_config
.name
.clone
(),
...
...
@@ -437,9 +436,9 @@ impl GrpcInferenceService for KserveService {
"Model '{}' has type Tensor but no model config is provided"
,
request_model_name
)))
?
}
else
if
entry
.model_type
.supports_completions
()
{
}
else
if
card
.model_type
.supports_completions
()
{
return
Ok
(
Response
::
new
(
ModelMetadataResponse
{
name
:
entry
.
name
,
name
:
card
.display_
name
,
versions
:
vec!
[
"1"
.to_string
()],
platform
:
"dynamo"
.to_string
(),
inputs
:
vec!
[
...
...
@@ -479,15 +478,14 @@ impl GrpcInferenceService for KserveService {
&
self
,
request
:
Request
<
ModelConfigRequest
>
,
)
->
Result
<
Response
<
ModelConfigResponse
>
,
Status
>
{
let
entrie
s
=
self
.state
.manager
()
.get_model_
entrie
s
();
let
card
s
=
self
.state
.manager
()
.get_model_
card
s
();
let
request_model_name
=
&
request
.into_inner
()
.name
;
if
let
Some
(
entry
)
=
entrie
s
if
let
Some
(
card
)
=
card
s
.into_iter
()
.find
(|
entry
|
request_model_name
==
&
entry
.
name
)
.find
(|
card
|
request_model_name
==
&
card
.display_
name
)
{
if
entry
.model_type
.supports_tensor
()
{
if
let
Some
(
config
)
=
entry
.runtime_config
.as_ref
()
&&
let
Some
(
tensor_model_config
)
=
config
.tensor_model_config
.as_ref
()
if
card
.model_type
.supports_tensor
()
{
if
let
Some
(
tensor_model_config
)
=
card
.runtime_config.tensor_model_config
.as_ref
()
{
let
model_config
=
ModelConfig
{
name
:
tensor_model_config
.name
.clone
(),
...
...
@@ -523,9 +521,9 @@ impl GrpcInferenceService for KserveService {
"Model '{}' has type Tensor but no model config is provided"
,
request_model_name
)))
?
}
else
if
entry
.model_type
.supports_completions
()
{
}
else
if
card
.model_type
.supports_completions
()
{
let
config
=
ModelConfig
{
name
:
entry
.
name
,
name
:
card
.display_
name
,
platform
:
"dynamo"
.to_string
(),
backend
:
"dynamo"
.to_string
(),
input
:
vec!
[
...
...
lib/llm/src/http/service/health.rs
View file @
6ffd20a8
...
...
@@ -52,7 +52,6 @@ async fn live_handler(
async
fn
health_handler
(
axum
::
extract
::
State
(
state
):
axum
::
extract
::
State
<
Arc
<
service_v2
::
State
>>
,
)
->
impl
IntoResponse
{
let
model_entries
=
state
.manager
()
.get_model_entries
();
let
instances
=
if
let
Some
(
etcd_client
)
=
state
.etcd_client
()
{
match
list_all_instances
(
etcd_client
)
.await
{
Ok
(
instances
)
=>
instances
,
...
...
@@ -65,10 +64,12 @@ async fn health_handler(
vec!
[]
};
let
endpoints
:
Vec
<
String
>
=
model_entri
es
let
mut
endpoints
:
Vec
<
String
>
=
instanc
es
.iter
()
.map
(|
entry
|
entry
.endpoint_id
.as_url
())
.map
(|
instance
|
instance
.endpoint_id
()
.as_url
())
.collect
();
endpoints
.sort
();
endpoints
.dedup
();
(
StatusCode
::
OK
,
Json
(
json!
({
...
...
lib/llm/src/http/service/metrics.rs
View file @
6ffd20a8
...
...
@@ -18,12 +18,9 @@ use std::{
time
::{
Duration
,
Instant
},
};
use
crate
::
discovery
::
ModelEntry
;
use
crate
::
local_model
::
runtime_config
::
ModelRuntimeConfig
;
use
crate
::
model_card
::
{
ModelDeploymentCard
,
ROOT_PATH
as
MDC_ROOT_PATH
}
;
use
crate
::
model_card
::
ModelDeploymentCard
;
use
dynamo_runtime
::
metrics
::
prometheus_names
::
clamp_u64_to_i64
;
use
dynamo_runtime
::
slug
::
Slug
;
use
dynamo_runtime
::
storage
::
key_value_store
::{
EtcdStorage
,
KeyValueStore
,
KeyValueStoreManager
};
pub
use
prometheus
::
Registry
;
...
...
@@ -472,60 +469,27 @@ impl Metrics {
}
}
/// Update metrics from a
ModelEntry and its
ModelDeploymentCard
/// Update metrics from a ModelDeploymentCard
/// This updates both runtime config metrics and MDC-specific metrics
pub
async
fn
update_metrics_from_model_entry_with_mdc
(
&
self
,
model_entry
:
&
ModelEntry
,
etcd_client
:
&
dynamo_runtime
::
transports
::
etcd
::
Client
,
)
->
anyhow
::
Result
<
()
>
{
// Update runtime config metrics
if
let
Some
(
runtime_config
)
=
&
model_entry
.runtime_config
{
self
.update_runtime_config_metrics
(
&
model_entry
.name
,
runtime_config
);
}
// Load and update MDC metrics
let
model_slug
=
Slug
::
from_string
(
&
model_entry
.name
);
let
store
:
Box
<
dyn
KeyValueStore
>
=
Box
::
new
(
EtcdStorage
::
new
(
etcd_client
.clone
()));
let
card_store
=
Arc
::
new
(
KeyValueStoreManager
::
new
(
store
));
match
card_store
.load
::
<
ModelDeploymentCard
>
(
MDC_ROOT_PATH
,
&
model_slug
)
.await
{
Ok
(
Some
(
mdc
))
=>
{
// Inline MDC metrics update
pub
fn
update_metrics_from_mdc
(
&
self
,
card
:
&
ModelDeploymentCard
)
->
anyhow
::
Result
<
()
>
{
self
.update_runtime_config_metrics
(
&
card
.display_name
,
&
card
.runtime_config
);
self
.model_context_length
.with_label_values
(
&
[
&
model_entry
.
name
])
.set
(
mdc
.context_length
as
i64
);
.with_label_values
(
&
[
&
card
.display_
name
])
.set
(
card
.context_length
as
i64
);
self
.model_kv_cache_block_size
.with_label_values
(
&
[
&
model_entry
.
name
])
.set
(
mdc
.kv_cache_block_size
as
i64
);
.with_label_values
(
&
[
&
card
.display_
name
])
.set
(
card
.kv_cache_block_size
as
i64
);
self
.model_migration_limit
.with_label_values
(
&
[
&
model_entry
.
name
])
.set
(
mdc
.migration_limit
as
i64
);
.with_label_values
(
&
[
&
card
.display_
name
])
.set
(
card
.migration_limit
as
i64
);
tracing
::
debug!
(
model
=
%
model_entry
.
name
,
model
=
%
card
.display_
name
,
"Successfully updated MDC metrics"
);
}
Ok
(
None
)
=>
{
tracing
::
debug!
(
model
=
%
model_entry
.name
,
"No MDC found in storage, skipping MDC metrics"
);
}
Err
(
e
)
=>
{
tracing
::
debug!
(
model
=
%
model_entry
.name
,
error
=
%
e
,
"Failed to load MDC for metrics update"
);
}
}
Ok
(())
}
...
...
lib/llm/src/local_model.rs
View file @
6ffd20a8
...
...
@@ -410,6 +410,8 @@ impl LocalModel {
let
Some
(
etcd_client
)
=
endpoint
.drt
()
.etcd_client
()
else
{
anyhow
::
bail!
(
"Cannot attach to static endpoint"
);
};
self
.card.model_type
=
model_type
;
self
.card.model_input
=
model_input
;
// Store model config files in NATS object store
let
nats_client
=
endpoint
.drt
()
.nats_client
();
...
...
@@ -431,9 +433,7 @@ impl LocalModel {
let
model_registration
=
ModelEntry
{
name
:
self
.display_name
()
.to_string
(),
endpoint_id
:
endpoint
.id
(),
model_type
,
runtime_config
:
Some
(
self
.runtime_config
.clone
()),
model_input
,
};
etcd_client
.kv_create
(
...
...
lib/llm/src/model_card.rs
View file @
6ffd20a8
...
...
@@ -16,10 +16,10 @@ use std::fmt;
use
std
::
fs
::
File
;
use
std
::
path
::{
Path
,
PathBuf
};
use
std
::
sync
::
Arc
;
use
std
::
time
::
Duration
;
use
crate
::
common
::
checked_file
::
CheckedFile
;
use
crate
::
local_model
::
runtime_config
::
ModelRuntimeConfig
;
use
crate
::
model_type
::{
ModelInput
,
ModelType
};
use
anyhow
::{
Context
,
Result
};
use
derive_builder
::
Builder
;
use
dynamo_runtime
::
DistributedRuntime
;
...
...
@@ -34,9 +34,6 @@ use crate::protocols::TokenIdType;
/// Identify model deployment cards in the key-value store
pub
const
ROOT_PATH
:
&
str
=
"mdc"
;
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const
CARD_MAX_AGE
:
chrono
::
TimeDelta
=
chrono
::
TimeDelta
::
minutes
(
5
);
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
ModelInfoType
{
...
...
@@ -118,9 +115,6 @@ pub struct ModelDeploymentCard {
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
prompt_context
:
Option
<
Vec
<
PromptContextMixin
>>
,
/// When this card was last advertised by a worker. None if not yet published.
pub
last_published
:
Option
<
chrono
::
DateTime
<
chrono
::
Utc
>>
,
/// Max context (in number of tokens) this model can handle
pub
context_length
:
u32
,
...
...
@@ -132,6 +126,14 @@ pub struct ModelDeploymentCard {
/// connection to the current worker.
pub
migration_limit
:
u32
,
/// Specifies whether the model is a chat, completions, etc model.
pub
model_type
:
ModelType
,
/// Specifies the model input type.
/// `Tokens` for engines that expect pre-processed input.
/// `Text` for engines that take care of pre-processing themselves.
pub
model_input
:
ModelInput
,
/// User-defined metadata for custom worker behavior
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
user_data
:
Option
<
serde_json
::
Value
>
,
...
...
@@ -161,17 +163,6 @@ impl ModelDeploymentCard {
}
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub
fn
expiry_check_period
()
->
Duration
{
match
CARD_MAX_AGE
.to_std
()
{
Ok
(
duration
)
=>
duration
/
3
,
Err
(
_
)
=>
{
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!
(
"Cannot run card expiry watcher, invalid CARD_MAX_AGE"
);
}
}
}
/// Load a model deployment card from a JSON file
pub
fn
load_from_json_file
<
P
:
AsRef
<
Path
>>
(
file
:
P
)
->
std
::
io
::
Result
<
Self
>
{
let
contents
=
std
::
fs
::
read_to_string
(
&
file
)
?
;
...
...
@@ -210,15 +201,6 @@ impl ModelDeploymentCard {
format!
(
"{}"
,
blake3
::
hash
(
json
.as_bytes
()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub
fn
is_expired
(
&
self
)
->
bool
{
if
let
Some
(
last_published
)
=
self
.last_published
.as_ref
()
{
chrono
::
Utc
::
now
()
-
last_published
>
CARD_MAX_AGE
}
else
{
false
}
}
/// Is this a full model card with tokenizer?
/// There are cases where we have a placeholder card (see `with_name_only`).
pub
fn
has_tokenizer
(
&
self
)
->
bool
{
...
...
@@ -405,6 +387,10 @@ impl ModelDeploymentCard {
}
}
pub
fn
requires_preprocessing
(
&
self
)
->
bool
{
matches!
(
self
.model_input
,
ModelInput
::
Tokens
)
}
/// Load a ModelDeploymentCard from storage the DistributedRuntime is configured to use.
/// Card should be fully local and ready to use when the call returns.
pub
async
fn
load_from_store
(
...
...
@@ -491,10 +477,11 @@ impl ModelDeploymentCard {
prompt_formatter
:
Some
(
PromptFormatterArtifact
::
GGUF
(
gguf_file
.to_path_buf
())),
chat_template_file
:
None
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
last_published
:
None
,
context_length
,
kv_cache_block_size
:
0
,
migration_limit
:
0
,
model_type
:
Default
::
default
(),
// set later
model_input
:
Default
::
default
(),
// set later
user_data
:
None
,
runtime_config
:
ModelRuntimeConfig
::
default
(),
cache_dir
:
None
,
...
...
@@ -554,10 +541,11 @@ impl ModelDeploymentCard {
prompt_formatter
:
PromptFormatterArtifact
::
from_repo
(
repo_id
)
?
,
chat_template_file
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
last_published
:
None
,
context_length
,
kv_cache_block_size
:
0
,
// set later
migration_limit
:
0
,
model_type
:
Default
::
default
(),
// set later
model_input
:
Default
::
default
(),
// set later
user_data
:
None
,
runtime_config
:
ModelRuntimeConfig
::
default
(),
cache_dir
:
None
,
...
...
@@ -565,15 +553,19 @@ impl ModelDeploymentCard {
}
}
impl
PartialEq
for
ModelDeploymentCard
{
fn
eq
(
&
self
,
other
:
&
ModelDeploymentCard
)
->
bool
{
self
.mdcsum
()
==
other
.mdcsum
()
}
}
/// A ModelDeploymentCard is published a single time per instance and never updated.
impl
Versioned
for
ModelDeploymentCard
{
fn
revision
(
&
self
)
->
u64
{
0
}
fn
set_revision
(
&
mut
self
,
_
revision
:
u64
)
{
self
.last_published
=
Some
(
chrono
::
Utc
::
now
());
}
fn
set_revision
(
&
mut
self
,
_
revision
:
u64
)
{}
}
impl
fmt
::
Display
for
ModelDeploymentCard
{
...
...
lib/llm/src/model_type.rs
View file @
6ffd20a8
...
...
@@ -30,7 +30,7 @@ bitflags! {
/// Using bitflags avoids deep branching on a single enum variant,
/// simplifies checks like `supports_chat()`, and enables efficient,
/// type-safe combinations of multiple endpoint types within a single byte.
#[derive(Copy,
Debug,
Clone,
Serialize,
Deserialize,
Eq,
PartialEq)]
#[derive(Copy,
Debug,
Default,
Clone,
Serialize,
Deserialize,
Eq,
PartialEq)]
pub
struct
ModelType
:
u8
{
const
Chat
=
1
<<
0
;
const
Completions
=
1
<<
1
;
...
...
@@ -100,9 +100,10 @@ impl fmt::Display for ModelType {
}
}
#[derive(Copy,
Debug,
Clone,
Display,
Serialize,
Deserialize,
Eq,
PartialEq)]
#[derive(Copy,
Debug,
Default,
Clone,
Display,
Serialize,
Deserialize,
Eq,
PartialEq)]
pub
enum
ModelInput
{
/// Raw text input
#[default]
Text
,
/// Pre-processed input
Tokens
,
...
...
lib/llm/tests/http_metrics.rs
View file @
6ffd20a8
...
...
@@ -293,12 +293,13 @@ async fn test_metrics_with_mock_model() {
mod
integration_tests
{
use
super
::
*
;
use
dynamo_llm
::{
discovery
::{
ModelEntry
,
ModelWatcher
},
discovery
::{
MODEL_ROOT_PATH
,
ModelEntry
,
ModelWatcher
},
engines
::
make_echo_engine
,
entrypoint
::
EngineConfig
,
local_model
::
LocalModelBuilder
,
};
use
dynamo_runtime
::
DistributedRuntime
;
use
dynamo_runtime
::
pipeline
::
RouterMode
;
use
std
::
sync
::
Arc
;
#[tokio::test]
...
...
@@ -335,8 +336,6 @@ mod integration_tests {
// Set up model watcher to discover models from etcd (like production)
// This is crucial for the polling task to find model entries
use
dynamo_llm
::
discovery
::{
MODEL_ROOT_PATH
,
ModelWatcher
};
use
dynamo_runtime
::
pipeline
::
RouterMode
;
let
model_watcher
=
ModelWatcher
::
new
(
distributed_runtime
.clone
(),
...
...
@@ -531,7 +530,7 @@ mod integration_tests {
if
let
Some
(
key
)
=
key
{
// Remove from ModelManager first (this returns the ModelEntry)
if
let
Some
(
_
removed_
entry
)
=
manager
.remove_model_
entry
(
&
key
)
{
if
let
Some
(
_
removed_
card
)
=
manager
.remove_model_
card
(
&
key
)
{
// Remove engines (following ModelWatcher::handle_delete pattern)
manager
.remove_chat_completions_model
(
&
model_entry
.name
)
...
...
lib/llm/tests/http_namespace_integration.rs
View file @
6ffd20a8
...
...
@@ -4,109 +4,18 @@
//! Integration tests for HTTP service namespace discovery functionality.
//! These tests verify that the HTTP service correctly filters models based on namespace configuration.
use
dynamo_llm
::{
discovery
::
ModelEntry
,
model_type
::{
ModelInput
,
ModelType
},
namespace
::{
GLOBAL_NAMESPACE
,
is_global_namespace
},
};
use
dynamo_llm
::
namespace
::{
GLOBAL_NAMESPACE
,
is_global_namespace
};
use
dynamo_runtime
::
protocols
::
EndpointId
;
// Helper function to create a test ModelEntry
fn
create_test_model_entry
(
name
:
&
str
,
namespace
:
&
str
,
component
:
&
str
,
endpoint_name
:
&
str
,
model_type
:
ModelType
,
model_input
:
ModelInput
,
)
->
ModelEntry
{
ModelEntry
{
name
:
name
.to_string
(),
endpoint_id
:
EndpointId
{
// Helper function to create a test ModelDeploymentCard
fn
create_test_endpoint
(
namespace
:
&
str
,
component
:
&
str
,
endpoint_name
:
&
str
)
->
EndpointId
{
EndpointId
{
namespace
:
namespace
.to_string
(),
component
:
component
.to_string
(),
name
:
endpoint_name
.to_string
(),
},
model_type
,
model_input
,
runtime_config
:
None
,
}
}
#[test]
fn
test_namespace_filtering_behavior
()
{
// Test the core namespace filtering logic used in HTTP service
let
test_models
=
vec!
[
create_test_model_entry
(
"model-1"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-2"
,
"sglang-prod"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-3"
,
"dynamo"
,
"backend"
,
"generate"
,
ModelType
::
Completions
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-4"
,
"tensorrt-llm"
,
"backend"
,
"generate"
,
ModelType
::
Embedding
,
ModelInput
::
Tokens
,
),
];
// Test filtering for specific namespace "vllm-agg"
let
target_namespace
=
"vllm-agg"
;
let
is_global
=
is_global_namespace
(
target_namespace
);
let
filtered_models
:
Vec
<&
ModelEntry
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_models
.len
(),
1
);
assert_eq!
(
filtered_models
[
0
]
.name
,
"model-1"
);
assert_eq!
(
filtered_models
[
0
]
.endpoint_id.namespace
,
"vllm-agg"
);
// Test filtering for global namespace (should include all models)
let
target_namespace
=
GLOBAL_NAMESPACE
;
let
is_global
=
is_global_namespace
(
target_namespace
);
let
filtered_models_global
:
Vec
<&
ModelEntry
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_models_global
.len
(),
4
);
// All models should be included
// Test filtering for empty namespace (treated as global)
let
target_namespace
=
""
;
let
is_global
=
is_global_namespace
(
target_namespace
);
let
filtered_models_empty
:
Vec
<&
ModelEntry
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_models_empty
.len
(),
4
);
// All models should be included
}
#[test]
fn
test_endpoint_id_namespace_extraction
()
{
// Test endpoint ID parsing for different namespace formats
...
...
@@ -165,62 +74,30 @@ fn test_model_discovery_scoping_scenarios() {
// Scenario 1: Frontend configured for specific namespace should only see models from that namespace
let
frontend_namespace
=
"vllm-agg"
;
let
available_models
=
vec!
[
create_test_model_entry
(
"llama-7b"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"mistral-7b"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"gpt-3.5"
,
"sglang-prod"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"claude-3"
,
"dynamo"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_endpoint
(
"vllm-agg"
,
"backend"
,
"generate"
),
create_test_endpoint
(
"vllm-agg"
,
"backend"
,
"generate"
),
create_test_endpoint
(
"sglang-prod"
,
"backend"
,
"generate"
),
create_test_endpoint
(
"dynamo"
,
"backend"
,
"generate"
),
];
let
visible_models
:
Vec
<&
ModelEntry
>
=
available_models
let
visible_models
:
Vec
<&
EndpointId
>
=
available_models
.iter
()
.filter
(|
model
|
{
.filter
(|
endpoint
|
{
let
is_global
=
is_global_namespace
(
frontend_namespace
);
is_global
||
model
.
endpoint
_id
.namespace
==
frontend_namespace
is_global
||
endpoint
.namespace
==
frontend_namespace
})
.collect
();
assert_eq!
(
visible_models
.len
(),
2
);
assert
!
(
visible_models
.iter
()
.all
(|
m
|
m
.endpoint_id.namespace
==
"vllm-agg"
)
);
assert
!
(
visible_models
.iter
()
.all
(|
m
|
m
.namespace
==
"vllm-agg"
));
// Scenario 2: Frontend configured for global namespace should see all models
let
frontend_namespace
=
GLOBAL_NAMESPACE
;
let
visible_models_global
:
Vec
<&
ModelEntry
>
=
available_models
let
visible_models_global
:
Vec
<&
EndpointId
>
=
available_models
.iter
()
.filter
(|
model
|
{
.filter
(|
endpoint
|
{
let
is_global
=
is_global_namespace
(
frontend_namespace
);
is_global
||
model
.
endpoint
_id
.namespace
==
frontend_namespace
is_global
||
endpoint
.namespace
==
frontend_namespace
})
.collect
();
...
...
@@ -228,11 +105,11 @@ fn test_model_discovery_scoping_scenarios() {
// Scenario 3: Frontend configured for non-existent namespace should see no models
let
frontend_namespace
=
"non-existent-namespace"
;
let
visible_models_none
:
Vec
<&
ModelEntry
>
=
available_models
let
visible_models_none
:
Vec
<&
EndpointId
>
=
available_models
.iter
()
.filter
(|
model
|
{
.filter
(|
endpoint
|
{
let
is_global
=
is_global_namespace
(
frontend_namespace
);
is_global
||
model
.
endpoint
_id
.namespace
==
frontend_namespace
is_global
||
endpoint
.namespace
==
frontend_namespace
})
.collect
();
...
...
@@ -244,30 +121,9 @@ fn test_namespace_boundary_conditions() {
// Test edge cases and boundary conditions for namespace handling
let
test_models
=
vec!
[
create_test_model_entry
(
"model-1"
,
""
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
// Empty namespace
create_test_model_entry
(
"model-2"
,
"dynamo"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
// Global namespace
create_test_model_entry
(
"model-3"
,
"ns-with-special-chars_123"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_endpoint
(
""
,
"backend"
,
"generate"
),
// Empty namespace
create_test_endpoint
(
"dynamo"
,
"backend"
,
"generate"
),
// Global namespace
create_test_endpoint
(
"ns-with-special-chars_123"
,
"backend"
,
"generate"
),
];
// Test filtering with empty target namespace (should be treated as global)
...
...
@@ -275,9 +131,9 @@ fn test_namespace_boundary_conditions() {
let
is_global
=
is_global_namespace
(
target_namespace
);
assert
!
(
is_global
);
// Empty namespace should be treated as global
let
filtered_empty
:
Vec
<&
ModelEntry
>
=
test_models
let
filtered_empty
:
Vec
<&
EndpointId
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.
endpoint_id.
namespace
==
target_namespace
)
.filter
(|
model
|
is_global
||
model
.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_empty
.len
(),
3
);
// All models should be visible
...
...
@@ -287,9 +143,9 @@ fn test_namespace_boundary_conditions() {
let
is_global
=
is_global_namespace
(
target_namespace
);
assert
!
(
is_global
);
let
filtered_global
:
Vec
<&
ModelEntry
>
=
test_models
let
filtered_global
:
Vec
<&
EndpointId
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.
endpoint_id.
namespace
==
target_namespace
)
.filter
(|
model
|
is_global
||
model
.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_global
.len
(),
3
);
// All models should be visible
...
...
@@ -299,9 +155,9 @@ fn test_namespace_boundary_conditions() {
let
is_global
=
is_global_namespace
(
target_namespace
);
assert
!
(
!
is_global
);
// Should be case-sensitive
let
filtered_uppercase
:
Vec
<&
ModelEntry
>
=
test_models
let
filtered_uppercase
:
Vec
<&
EndpointId
>
=
test_models
.iter
()
.filter
(|
model
|
is_global
||
model
.
endpoint_id.
namespace
==
target_namespace
)
.filter
(|
model
|
is_global
||
model
.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_uppercase
.len
(),
0
);
// No models should be visible
...
...
lib/llm/tests/kserve_service.rs
View file @
6ffd20a8
...
...
@@ -6,11 +6,10 @@ pub mod kserve_test {
pub
mod
inference
{
tonic
::
include_proto!
(
"inference"
);
}
use
dynamo_llm
::
discovery
::
ModelEntry
;
use
dynamo_llm
::
local_model
::
runtime_config
::
ModelRuntimeConfig
;
use
dynamo_llm
::
model_card
::
ModelDeploymentCard
;
use
dynamo_llm
::
model_type
::{
ModelInput
,
ModelType
};
use
dynamo_llm
::
protocols
::
tensor
;
use
dynamo_runtime
::
protocols
::
EndpointId
;
use
inference
::
grpc_inference_service_client
::
GrpcInferenceServiceClient
;
use
inference
::{
DataType
,
ModelConfigRequest
,
ModelInferRequest
,
ModelInferResponse
,
ModelMetadataRequest
,
...
...
@@ -284,20 +283,10 @@ pub mod kserve_test {
manager
.add_completions_model
(
"split"
,
split
.clone
())
.unwrap
();
manager
.save_model_entry
(
"split"
,
ModelEntry
{
name
:
"split"
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
"namespace"
.to_string
(),
component
:
"component"
.to_string
(),
name
:
"split"
.to_string
(),
},
model_type
:
ModelType
::
Completions
,
model_input
:
ModelInput
::
Text
,
runtime_config
:
None
,
},
);
let
mut
card
=
ModelDeploymentCard
::
with_name_only
(
"split"
);
card
.model_type
=
ModelType
::
Completions
;
card
.model_input
=
ModelInput
::
Text
;
manager
.save_model_card
(
"split"
,
card
);
manager
.add_chat_completions_model
(
"failure"
,
failure
.clone
())
...
...
@@ -305,37 +294,17 @@ pub mod kserve_test {
manager
.add_completions_model
(
"failure"
,
failure
.clone
())
.unwrap
();
manager
.save_model_entry
(
"failure"
,
ModelEntry
{
name
:
"failure"
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
"namespace"
.to_string
(),
component
:
"component"
.to_string
(),
name
:
"failure"
.to_string
(),
},
model_type
:
ModelType
::
Completions
|
ModelType
::
Chat
,
model_input
:
ModelInput
::
Text
,
runtime_config
:
None
,
},
);
let
mut
card
=
ModelDeploymentCard
::
with_name_only
(
"failure"
);
card
.model_type
=
ModelType
::
Completions
|
ModelType
::
Chat
;
card
.model_input
=
ModelInput
::
Text
;
manager
.save_model_card
(
"failure"
,
card
);
manager
.add_completions_model
(
"long_running"
,
long_running
.clone
())
.unwrap
();
manager
.save_model_entry
(
"long_running"
,
ModelEntry
{
name
:
"long_running"
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
"namespace"
.to_string
(),
component
:
"component"
.to_string
(),
name
:
"long_running"
.to_string
(),
},
model_type
:
ModelType
::
Completions
,
model_input
:
ModelInput
::
Text
,
runtime_config
:
None
,
},
);
let
mut
card
=
ModelDeploymentCard
::
with_name_only
(
"long_running"
);
card
.model_type
=
ModelType
::
Completions
;
card
.model_input
=
ModelInput
::
Text
;
manager
.save_model_card
(
"long_running"
,
card
);
(
service
,
split
,
failure
,
long_running
)
}
...
...
@@ -1179,21 +1148,13 @@ pub mod kserve_test {
});
// Failure, model registered as Tensor but does not provide model config (in runtime config)
let
entry
=
ModelEntry
{
name
:
"tensor"
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
"namespace"
.to_string
(),
component
:
"component"
.to_string
(),
name
:
"endpoint"
.to_string
(),
},
model_type
:
ModelType
::
TensorBased
,
model_input
:
ModelInput
::
Tensor
,
runtime_config
:
None
,
};
let
mut
card
=
ModelDeploymentCard
::
with_name_only
(
"tensor"
);
card
.model_type
=
ModelType
::
TensorBased
;
card
.model_input
=
ModelInput
::
Tensor
;
service_with_engines
.0
.model_manager
()
.save_model_
entry
(
"key"
,
entry
);
.save_model_
card
(
"key"
,
card
);
let
response
=
client
.model_metadata
(
request
)
.await
;
assert
!
(
response
.is_err
());
...
...
@@ -1236,17 +1197,11 @@ pub mod kserve_test {
service_with_engines
.0
.model_manager
()
.remove_model_entry
(
"key"
);
let
entry
=
ModelEntry
{
name
:
"tensor"
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
"namespace"
.to_string
(),
component
:
"component"
.to_string
(),
name
:
"endpoint"
.to_string
(),
},
model_type
:
ModelType
::
TensorBased
,
model_input
:
ModelInput
::
Tensor
,
runtime_config
:
Some
(
ModelRuntimeConfig
{
.remove_model_card
(
"key"
);
let
mut
card
=
ModelDeploymentCard
::
with_name_only
(
"tensor"
);
card
.model_type
=
ModelType
::
TensorBased
;
card
.model_input
=
ModelInput
::
Tensor
;
card
.runtime_config
=
ModelRuntimeConfig
{
tensor_model_config
:
Some
(
tensor
::
TensorModelConfig
{
name
:
"tensor"
.to_string
(),
inputs
:
vec!
[
tensor
::
TensorMetadata
{
...
...
@@ -1261,12 +1216,11 @@ pub mod kserve_test {
}],
}),
..
Default
::
default
()
}),
};
service_with_engines
.0
.model_manager
()
.save_model_
entry
(
"key"
,
entry
);
.save_model_
card
(
"key"
,
card
);
// Success
let
request
=
tonic
::
Request
::
new
(
ModelMetadataRequest
{
...
...
lib/llm/tests/namespace.rs
View file @
6ffd20a8
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
dynamo_llm
::{
discovery
::
ModelEntry
,
model_type
::{
ModelInput
,
ModelType
},
namespace
::{
GLOBAL_NAMESPACE
,
is_global_namespace
},
};
use
dynamo_runtime
::
protocols
::
EndpointId
;
#[test]
fn
test_is_global_namespace_with_global_string
()
{
assert
!
(
is_global_namespace
(
GLOBAL_NAMESPACE
));
assert
!
(
is_global_namespace
(
"dynamo"
));
}
#[test]
fn
test_is_global_namespace_with_empty_string
()
{
assert
!
(
is_global_namespace
(
""
));
}
#[test]
fn
test_is_global_namespace_with_specific_namespace
()
{
assert
!
(
!
is_global_namespace
(
"test-namespace"
));
assert
!
(
!
is_global_namespace
(
"my-custom-namespace"
));
}
#[test]
fn
test_is_global_namespace_with_whitespace
()
{
// Whitespace should not be considered global
assert
!
(
!
is_global_namespace
(
" "
));
assert
!
(
!
is_global_namespace
(
" "
));
assert
!
(
!
is_global_namespace
(
"
\t
"
));
assert
!
(
!
is_global_namespace
(
"
\n
"
));
}
#[test]
fn
test_is_global_namespace_case_sensitivity
()
{
// Should be case sensitive
assert
!
(
!
is_global_namespace
(
"Dynamo"
));
assert
!
(
!
is_global_namespace
(
"DYNAMO"
));
}
#[test]
fn
test_global_namespace_constant
()
{
assert_eq!
(
GLOBAL_NAMESPACE
,
"dynamo"
);
}
// Helper function to create a test ModelEntry
fn
create_test_model_entry
(
name
:
&
str
,
namespace
:
&
str
,
component
:
&
str
,
endpoint_name
:
&
str
,
model_type
:
ModelType
,
model_input
:
ModelInput
,
)
->
ModelEntry
{
ModelEntry
{
name
:
name
.to_string
(),
endpoint_id
:
EndpointId
{
namespace
:
namespace
.to_string
(),
component
:
component
.to_string
(),
name
:
endpoint_name
.to_string
(),
},
model_type
,
model_input
,
runtime_config
:
None
,
}
}
#[test]
fn
test_model_entry_creation_with_different_namespaces
()
{
// Test creating ModelEntry with specific namespace
let
model_vllm
=
create_test_model_entry
(
"test-model-1"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
);
assert_eq!
(
model_vllm
.name
,
"test-model-1"
);
assert_eq!
(
model_vllm
.endpoint_id.namespace
,
"vllm-agg"
);
assert_eq!
(
model_vllm
.endpoint_id.component
,
"backend"
);
assert_eq!
(
model_vllm
.endpoint_id.name
,
"generate"
);
assert_eq!
(
model_vllm
.model_type
,
ModelType
::
Chat
);
assert_eq!
(
model_vllm
.model_input
,
ModelInput
::
Tokens
);
// Test creating ModelEntry with global namespace
let
model_global
=
create_test_model_entry
(
"test-model-2"
,
"dynamo"
,
"frontend"
,
"http"
,
ModelType
::
Completions
,
ModelInput
::
Text
,
);
assert_eq!
(
model_global
.name
,
"test-model-2"
);
assert_eq!
(
model_global
.endpoint_id.namespace
,
"dynamo"
);
assert_eq!
(
model_global
.endpoint_id.component
,
"frontend"
);
assert_eq!
(
model_global
.endpoint_id.name
,
"http"
);
assert_eq!
(
model_global
.model_type
,
ModelType
::
Completions
);
assert_eq!
(
model_global
.model_input
,
ModelInput
::
Text
);
}
#[test]
fn
test_namespace_filtering_logic
()
{
// Test the core logic that would be used in namespace filtering
let
models
=
vec!
[
create_test_model_entry
(
"model-1"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-2"
,
"sglang-prod"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-3"
,
"dynamo"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
create_test_model_entry
(
"model-4"
,
""
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
),
];
// Test filtering for specific namespace "vllm-agg"
let
target_namespace
=
"vllm-agg"
;
let
global_namespace
=
is_global_namespace
(
target_namespace
);
let
filtered_vllm
:
Vec
<&
ModelEntry
>
=
models
.iter
()
.filter
(|
model
|
global_namespace
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_vllm
.len
(),
1
);
assert_eq!
(
filtered_vllm
[
0
]
.name
,
"model-1"
);
assert_eq!
(
filtered_vllm
[
0
]
.endpoint_id.namespace
,
"vllm-agg"
);
// Test filtering for global namespace (should include all)
let
target_namespace
=
"dynamo"
;
let
global_namespace
=
is_global_namespace
(
target_namespace
);
let
filtered_global
:
Vec
<&
ModelEntry
>
=
models
.iter
()
.filter
(|
model
|
global_namespace
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_global
.len
(),
4
);
// All models should be included
// Test filtering for empty namespace (should include all, treated as global)
let
target_namespace
=
""
;
let
global_namespace
=
is_global_namespace
(
target_namespace
);
let
filtered_empty
:
Vec
<&
ModelEntry
>
=
models
.iter
()
.filter
(|
model
|
global_namespace
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_empty
.len
(),
4
);
// All models should be included
// Test filtering for non-existent namespace
let
target_namespace
=
"non-existent"
;
let
global_namespace
=
is_global_namespace
(
target_namespace
);
let
filtered_none
:
Vec
<&
ModelEntry
>
=
models
.iter
()
.filter
(|
model
|
global_namespace
||
model
.endpoint_id.namespace
==
target_namespace
)
.collect
();
assert_eq!
(
filtered_none
.len
(),
0
);
// No models should match
}
#[test]
fn
test_model_entry_serialization
()
{
// Test that ModelEntry can be serialized and deserialized (important for etcd storage)
let
model
=
create_test_model_entry
(
"test-model"
,
"vllm-agg"
,
"backend"
,
"generate"
,
ModelType
::
Chat
,
ModelInput
::
Tokens
,
);
// Serialize to JSON
let
json
=
serde_json
::
to_string
(
&
model
)
.expect
(
"Failed to serialize ModelEntry"
);
assert
!
(
json
.contains
(
"test-model"
));
assert
!
(
json
.contains
(
"vllm-agg"
));
assert
!
(
json
.contains
(
"backend"
));
assert
!
(
json
.contains
(
"generate"
));
// Deserialize from JSON
let
deserialized
:
ModelEntry
=
serde_json
::
from_str
(
&
json
)
.expect
(
"Failed to deserialize ModelEntry"
);
assert_eq!
(
deserialized
.name
,
model
.name
);
assert_eq!
(
deserialized
.endpoint_id.namespace
,
model
.endpoint_id.namespace
);
assert_eq!
(
deserialized
.endpoint_id.component
,
model
.endpoint_id.component
);
assert_eq!
(
deserialized
.endpoint_id.name
,
model
.endpoint_id.name
);
assert_eq!
(
deserialized
.model_type
,
model
.model_type
);
assert_eq!
(
deserialized
.model_input
,
model
.model_input
);
}
#[test]
fn
test_endpoint_namespace_parsing
()
{
// Test Endpoint creation from string with namespace
...
...
lib/runtime/examples/Cargo.lock
View file @
6ffd20a8
...
...
@@ -687,12 +687,14 @@ dependencies = [
"once_cell",
"prometheus",
"rand 0.9.1",
"rayon",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 2.0.12",
"tokio",
"tokio-rayon",
"tokio-stream",
"tokio-util",
"tower-http",
...
...
@@ -2848,6 +2850,16 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-rayon"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cf33a76e0b1dd03b778f83244137bd59887abf25c0e87bc3e7071105f457693"
dependencies = [
"rayon",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.2"
...
...
lib/runtime/src/component.rs
View file @
6ffd20a8
...
...
@@ -107,6 +107,13 @@ impl Instance {
pub
fn
id
(
&
self
)
->
i64
{
self
.instance_id
}
pub
fn
endpoint_id
(
&
self
)
->
EndpointId
{
EndpointId
{
namespace
:
self
.namespace
.clone
(),
component
:
self
.component
.clone
(),
name
:
self
.endpoint
.clone
(),
}
}
}
/// A [Component] a discoverable entity in the distributed runtime.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment