Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d2aad651
"tests/vscode:/vscode.git/clone" did not exist on "e54894fc85a9861fb38a49701b5844462c3d77e4"
Unverified
Commit
d2aad651
authored
Apr 14, 2026
by
Biswa Panda
Committed by
GitHub
Apr 14, 2026
Browse files
fix(runtime): graceful removal of disagg model from /v1/models when prefill engine dies (#7131)
parent
49feb284
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
648 additions
and
40 deletions
+648
-40
lib/llm/src/discovery/model.rs
lib/llm/src/discovery/model.rs
+135
-15
lib/llm/src/discovery/model_manager.rs
lib/llm/src/discovery/model_manager.rs
+206
-0
lib/llm/src/discovery/watcher.rs
lib/llm/src/discovery/watcher.rs
+13
-1
lib/llm/src/discovery/worker_set.rs
lib/llm/src/discovery/worker_set.rs
+16
-1
lib/llm/src/entrypoint/input/common.rs
lib/llm/src/entrypoint/input/common.rs
+20
-1
lib/llm/src/grpc/service/openai.rs
lib/llm/src/grpc/service/openai.rs
+6
-1
lib/llm/src/grpc/service/tensor.rs
lib/llm/src/grpc/service/tensor.rs
+6
-1
lib/llm/src/http/service/openai.rs
lib/llm/src/http/service/openai.rs
+61
-14
lib/llm/src/kv_router/prefill_router/activation.rs
lib/llm/src/kv_router/prefill_router/activation.rs
+72
-0
lib/llm/src/kv_router/prefill_router/execution.rs
lib/llm/src/kv_router/prefill_router/execution.rs
+3
-2
lib/llm/src/kv_router/prefill_router/mod.rs
lib/llm/src/kv_router/prefill_router/mod.rs
+110
-4
No files found.
lib/llm/src/discovery/model.rs
View file @
d2aad651
...
...
@@ -176,7 +176,7 @@ impl Model {
self
.worker_sets
.iter
()
.any
(|
entry
|
{
let
ws
=
entry
.value
();
if
ws
.worker_count
()
==
0
{
if
ws
.worker_count
()
==
0
||
!
ws
.can_serve_requests
()
{
return
false
;
}
has_serving_engine
(
ws
.as_ref
())
||
(
!
has_any_serving_engine
&&
ws
.is_prefill_set
())
...
...
@@ -189,41 +189,41 @@ impl Model {
&
self
,
)
->
Result
<
OpenAIChatCompletionsStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.chat_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_chat_engi
ne
()))
}
pub
fn
get_completions_engine
(
&
self
,
)
->
Result
<
OpenAICompletionsStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.completions_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_completions_engi
ne
()))
}
pub
fn
get_embeddings_engine
(
&
self
,
)
->
Result
<
OpenAIEmbeddingsStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.embeddings_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_embeddings_engi
ne
()))
}
pub
fn
get_images_engine
(
&
self
)
->
Result
<
OpenAIImagesStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.images_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_images_engi
ne
()))
}
pub
fn
get_videos_engine
(
&
self
)
->
Result
<
OpenAIVideosStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.videos_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_videos_engi
ne
()))
}
pub
fn
get_audios_engine
(
&
self
)
->
Result
<
OpenAIAudiosStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.audios_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_audios_engi
ne
()))
}
pub
fn
get_tensor_engine
(
&
self
)
->
Result
<
TensorStreamingEngine
,
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.tensor_engine
.clone
())
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_tensor_engi
ne
()))
}
// -- Combined engine + parsing options (atomically from one WorkerSet) --
...
...
@@ -232,7 +232,7 @@ impl Model {
&
self
,
)
->
Result
<
(
OpenAIChatCompletionsStreamingEngine
,
ParsingOptions
),
ModelManagerError
>
{
self
.select_worker_set_with
(|
ws
|
ws
.chat_engine
.clone
()
.map
(|
e
|
(
e
,
ws
.parsing_options
())))
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_chat_engi
ne
()))
}
pub
fn
get_completions_engine_with_parsing
(
...
...
@@ -243,7 +243,7 @@ impl Model {
.clone
()
.map
(|
e
|
(
e
,
ws
.parsing_options
()))
})
.ok_or_else
(||
ModelManagerError
::
ModelNotFound
(
self
.name
.clo
ne
()))
.ok_or_else
(||
self
.engine_error
(
self
.has_completions_engi
ne
()))
}
// -- Worker monitoring (aggregated across WorkerSets) --
...
...
@@ -283,6 +283,19 @@ impl Model {
.sum
()
}
// -- Internal helpers --
/// Return the appropriate error when no servable WorkerSet was found.
/// If the engine exists but no WorkerSet can serve (zero workers, prefill not activated,
/// etc.), return ModelUnavailable (maps to 503). Otherwise ModelNotFound (maps to 404).
fn
engine_error
(
&
self
,
engine_exists
:
bool
)
->
ModelManagerError
{
if
engine_exists
{
ModelManagerError
::
ModelUnavailable
(
self
.name
.clone
())
}
else
{
ModelManagerError
::
ModelNotFound
(
self
.name
.clone
())
}
}
// -- Internal selection --
/// Select a WorkerSet and extract a value from it.
...
...
@@ -298,19 +311,18 @@ impl Model {
F
:
Fn
(
&
WorkerSet
)
->
Option
<
T
>
,
{
// Fast path: single set (same zero-worker filtering as the multi-set path below)
// TODO: When the single set has 0 workers, this returns None which maps to
// ModelNotFound (404). Ideally should be 503 "no available workers" — see follow-up.
if
self
.worker_sets
.len
()
==
1
{
return
self
.worker_sets
.iter
()
.next
()
.and_then
(|
entry
|
{
let
ws
=
entry
.value
();
if
ws
.worker_count
()
==
0
{
if
ws
.worker_count
()
==
0
||
!
ws
.can_serve_requests
()
{
return
None
;
}
extract
(
ws
)
});
}
// Collect eligible sets with their worker counts, skipping sets with no workers.
// Collect eligible sets with their worker counts, skipping sets with no workers
// or sets whose prefill router has died under enforce_disagg.
// In-process models (no discovery watcher) return count=1, so they always participate.
// Discovery models with count=0 have no available workers and are skipped.
let
eligible
:
Vec
<
(
T
,
usize
)
>
=
self
...
...
@@ -319,7 +331,7 @@ impl Model {
.filter_map
(|
entry
|
{
let
ws
=
entry
.value
();
let
count
=
ws
.worker_count
();
if
count
==
0
{
if
count
==
0
||
!
ws
.can_serve_requests
()
{
return
None
;
}
extract
(
ws
)
.map
(|
val
|
(
val
,
count
))
...
...
@@ -600,4 +612,112 @@ mod tests {
// Both have 0 workers → all filtered → Err
assert
!
(
model
.get_chat_engine
()
.is_err
());
}
// -- Disaggregated prefill death tests --
use
crate
::
kv_router
::
PrefillRouter
;
/// Build a WorkerSet with a deactivated PrefillRouter simulating "was activated, now dead".
/// worker_count defaults to 1 (no instance_count_rx -> in-process default).
fn
make_worker_set_with_dead_prefill
(
namespace
:
&
str
,
enforce_disagg
:
bool
)
->
Arc
<
WorkerSet
>
{
let
mut
ws
=
WorkerSet
::
new
(
namespace
.to_string
(),
"abc"
.to_string
(),
crate
::
model_card
::
ModelDeploymentCard
::
default
(),
);
let
pr
=
PrefillRouter
::
disabled
(
std
::
sync
::
Arc
::
new
(
crate
::
discovery
::
ModelManager
::
new
()),
dynamo_runtime
::
pipeline
::
RouterMode
::
RoundRobin
,
enforce_disagg
,
);
pr
.deactivate
();
ws
.prefill_router
=
Some
(
pr
);
Arc
::
new
(
ws
)
}
/// Baseline: a WorkerSet without a PrefillRouter is always displayable
/// (worker_count=1, is_prefill_set=true, no can_serve_requests block).
#[test]
fn
test_is_displayable_true_basic
()
{
let
model
=
Model
::
new
(
"llama"
.to_string
());
model
.add_worker_set
(
"ns1"
.to_string
(),
make_worker_set
(
"ns1"
,
"abc"
));
assert
!
(
model
.is_displayable
(),
"model with an unconstrained WorkerSet must be displayable"
);
}
/// When the prefill engine dies and enforce_disagg is set, the model must be
/// hidden from /v1/models.
#[test]
fn
test_is_displayable_false_when_prefill_dies_enforce_disagg
()
{
let
model
=
Model
::
new
(
"llama"
.to_string
());
model
.add_worker_set
(
"ns1"
.to_string
(),
make_worker_set_with_dead_prefill
(
"ns1"
,
true
),
);
assert
!
(
!
model
.is_displayable
(),
"model must be hidden when prefill died and enforce_disagg=true"
);
}
/// When enforce_disagg is false the deployment can fall back to aggregated mode,
/// so the model should remain visible in /v1/models.
#[test]
fn
test_is_displayable_true_when_prefill_dies_no_enforce
()
{
let
model
=
Model
::
new
(
"llama"
.to_string
());
model
.add_worker_set
(
"ns1"
.to_string
(),
make_worker_set_with_dead_prefill
(
"ns1"
,
false
),
);
assert
!
(
model
.is_displayable
(),
"model must remain visible when prefill died but enforce_disagg=false (fallback)"
);
}
/// A single WorkerSet with a deactivated prefill router (enforce_disagg=true) must be
/// skipped by select_worker_set_with(), causing engine accessors to return Err.
#[test]
fn
test_dead_prefill_single_set_not_selectable
()
{
let
model
=
Model
::
new
(
"llama"
.to_string
());
model
.add_worker_set
(
"ns1"
.to_string
(),
make_worker_set_with_dead_prefill
(
"ns1"
,
true
),
);
assert
!
(
model
.get_chat_engine
()
.is_err
());
assert
!
(
model
.get_completions_engine
()
.is_err
());
}
/// With two WorkerSets -- one healthy, one with dead prefill -- the healthy set
/// keeps the model displayable. Removing the healthy set hides the model.
#[test]
fn
test_dead_prefill_multi_set_skips_dead_namespace
()
{
let
model
=
Model
::
new
(
"llama"
.to_string
());
// Healthy set (no prefill constraint)
model
.add_worker_set
(
"healthy"
.to_string
(),
make_worker_set
(
"healthy"
,
"abc"
));
// Dead set (deactivated prefill + enforce_disagg)
model
.add_worker_set
(
"dead"
.to_string
(),
make_worker_set_with_dead_prefill
(
"dead"
,
true
),
);
assert
!
(
model
.is_displayable
(),
"model must be displayable when at least one healthy set exists"
);
// Removing the healthy set leaves only the dead set -- model must be hidden.
model
.remove_worker_set
(
"healthy"
);
assert
!
(
!
model
.is_displayable
(),
"model must be hidden when only the dead prefill set remains"
);
}
}
lib/llm/src/discovery/model_manager.rs
View file @
d2aad651
...
...
@@ -46,6 +46,9 @@ pub enum ModelManagerError {
#[error(
"Model not found: {0}"
)]
ModelNotFound
(
String
),
#[error(
"Model unavailable: {0}"
)]
ModelUnavailable
(
String
),
#[error(
"Model already exists: {0}"
)]
ModelAlreadyExists
(
String
),
}
...
...
@@ -703,6 +706,39 @@ impl ModelManager {
);
}
None
=>
{
// Try to reactivate an existing deactivated router first.
// This handles prefill rejoin after a transient failure: the decode
// WorkerSet's PrefillRouter already exists but is deactivated.
if
let
Some
(
model
)
=
self
.get_model
(
model_name
)
&&
let
Some
(
ws
)
=
model
.get_worker_set
(
namespace
)
&&
let
Some
(
ref
pr
)
=
ws
.prefill_router
&&
pr
.is_deactivated
()
{
pr
.reactivate
();
// Store the endpoint so that if the decode WorkerSet is rebuilt
// (removed and re-added), a subsequent register_prefill_router call
// finds PrefillReady instead of falling back to DecodeWaiting and
// stalling.
let
(
tx
,
rx
)
=
oneshot
::
channel
();
tx
.send
(
endpoint
)
.map_err
(|
_
|
{
anyhow
::
anyhow!
(
"Failed to send endpoint for prefill model {}:{}"
,
model_name
,
namespace
)
})
?
;
self
.prefill_router_activators
.insert
(
key
,
PrefillActivationState
::
PrefillReady
(
rx
));
tracing
::
info!
(
model_name
=
%
model_name
,
namespace
=
%
namespace
,
"Reactivated existing prefill router for decode WorkerSet (prefill rejoin)"
);
return
Ok
(());
}
// No existing deactivated router -- store endpoint for a future decode
// registration.
let
(
tx
,
rx
)
=
oneshot
::
channel
();
tx
.send
(
endpoint
)
.map_err
(|
_
|
{
anyhow
::
anyhow!
(
...
...
@@ -723,6 +759,18 @@ impl ModelManager {
}
}
/// Deactivate the prefill router on the decode WorkerSet for the given model/namespace.
/// Called by the watcher when all prefill workers in a namespace are removed.
/// After deactivation, requests fall back to aggregated mode (or fail if enforce_disagg).
pub
fn
deactivate_prefill_router_for_decode
(
&
self
,
model_name
:
&
str
,
namespace
:
&
str
)
{
if
let
Some
(
model
)
=
self
.get_model
(
model_name
)
&&
let
Some
(
ws
)
=
model
.get_worker_set
(
namespace
)
&&
let
Some
(
ref
pr
)
=
ws
.prefill_router
{
pr
.deactivate
();
}
}
/// Remove the prefill router activator for a (model, namespace) pair.
/// Called when a WorkerSet is removed to prevent stale activators.
pub
fn
remove_prefill_activator
(
&
self
,
model_name
:
&
str
,
namespace
:
&
str
)
{
...
...
@@ -1090,4 +1138,162 @@ mod tests {
"gpt-4:default-abc"
);
}
// -- deactivate_prefill_router_for_decode tests --
use
crate
::
kv_router
::
PrefillRouter
;
/// Helper: make a WorkerSet with an activated PrefillRouter attached.
/// The router is marked as activated to simulate a real deployment where
/// the prefill endpoint has already rendezvoused with the decode side.
fn
make_worker_set_with_prefill_router
(
namespace
:
&
str
,
mdcsum
:
&
str
,
enforce_disagg
:
bool
,
)
->
WorkerSet
{
let
mut
ws
=
make_worker_set
(
namespace
,
mdcsum
);
let
pr
=
PrefillRouter
::
disabled
(
std
::
sync
::
Arc
::
new
(
ModelManager
::
new
()),
dynamo_runtime
::
pipeline
::
RouterMode
::
RoundRobin
,
enforce_disagg
,
);
pr
.mark_activated_for_test
();
ws
.prefill_router
=
Some
(
pr
);
ws
}
/// Calling deactivate on a non-existent model must not panic.
#[test]
fn
test_deactivate_prefill_router_for_decode_noop_missing_model
()
{
let
mm
=
ModelManager
::
new
();
mm
.deactivate_prefill_router_for_decode
(
"nonexistent"
,
"ns1"
);
}
/// Calling deactivate on a WorkerSet without a prefill_router must not panic.
#[test]
fn
test_deactivate_prefill_router_for_decode_noop_no_router
()
{
let
mm
=
ModelManager
::
new
();
mm
.add_worker_set
(
"llama"
,
"ns1"
,
make_worker_set
(
"ns1"
,
"abc"
));
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"ns1"
);
}
/// Full pipeline test: deactivate finds the WorkerSet, calls deactivate() on its
/// PrefillRouter, and the model is hidden from model_display_names() when
/// enforce_disagg=true.
#[test]
fn
test_deactivate_prefill_router_for_decode_hides_model
()
{
let
mm
=
ModelManager
::
new
();
mm
.add_worker_set
(
"llama"
,
"ns1"
,
make_worker_set_with_prefill_router
(
"ns1"
,
"abc"
,
true
),
);
// Model is visible before deactivation.
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
));
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"ns1"
);
// Model must be hidden after deactivation with enforce_disagg=true.
assert
!
(
!
mm
.model_display_names
()
.contains
(
"llama"
),
"model must be hidden after prefill deactivation with enforce_disagg=true"
);
// Idempotent: calling again must not panic.
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"ns1"
);
assert
!
(
!
mm
.model_display_names
()
.contains
(
"llama"
));
}
/// Full disagg lifecycle with enforce_disagg=true:
/// decode registers -> prefill registers -> prefill dies -> model hidden.
#[test]
fn
test_disagg_lifecycle_prefill_death_hides_model
()
{
let
mm
=
ModelManager
::
new
();
// Step 1: Decode WorkerSet with a PrefillRouter (not yet deactivated).
mm
.add_worker_set
(
"llama"
,
"decode-ns"
,
make_worker_set_with_prefill_router
(
"decode-ns"
,
"abc"
,
true
),
);
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
),
"step 1: model must be visible with active prefill router"
);
// Step 2: Prefill WorkerSet registers (same model, different namespace key).
mm
.add_worker_set
(
"llama"
,
"prefill-ns"
,
make_worker_set
(
"prefill-ns"
,
"abc"
));
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
),
"step 2: model must be visible with both decode and prefill"
);
// Step 3: Prefill WorkerSet removed (engine dies).
mm
.remove_worker_set
(
"llama"
,
"prefill-ns"
);
// Step 4: Deactivate the prefill router on the decode side.
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"decode-ns"
);
assert
!
(
!
mm
.model_display_names
()
.contains
(
"llama"
),
"step 4: model must be hidden after prefill death with enforce_disagg=true"
);
}
/// Full disagg lifecycle with enforce_disagg=false (fallback allowed).
#[test]
fn
test_disagg_lifecycle_prefill_death_keeps_model_no_enforce
()
{
let
mm
=
ModelManager
::
new
();
mm
.add_worker_set
(
"llama"
,
"decode-ns"
,
make_worker_set_with_prefill_router
(
"decode-ns"
,
"abc"
,
false
),
);
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
));
// Deactivate -- model stays visible (enforce_disagg=false, fallback allowed).
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"decode-ns"
);
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
),
"model must remain visible (enforce_disagg=false, fallback allowed)"
);
}
/// Full disagg lifecycle including prefill rejoin after transient failure.
/// decode registers -> prefill dies -> model hidden -> prefill rejoins -> model visible.
#[test]
fn
test_disagg_lifecycle_prefill_rejoin_restores_model
()
{
let
mm
=
ModelManager
::
new
();
// Decode WorkerSet with enforce_disagg=true.
mm
.add_worker_set
(
"llama"
,
"decode-ns"
,
make_worker_set_with_prefill_router
(
"decode-ns"
,
"abc"
,
true
),
);
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
));
// Prefill dies -> deactivate.
mm
.deactivate_prefill_router_for_decode
(
"llama"
,
"decode-ns"
);
assert
!
(
!
mm
.model_display_names
()
.contains
(
"llama"
),
"model must be hidden after prefill death"
);
// Prefill rejoins -> reactivate via the WorkerSet's PrefillRouter.
if
let
Some
(
model
)
=
mm
.get_model
(
"llama"
)
&&
let
Some
(
ws
)
=
model
.get_worker_set
(
"decode-ns"
)
&&
let
Some
(
ref
pr
)
=
ws
.prefill_router
{
pr
.reactivate
();
}
else
{
panic!
(
"decode WorkerSet or prefill_router not found"
);
}
assert
!
(
mm
.model_display_names
()
.contains
(
"llama"
),
"model must be visible again after prefill rejoin"
);
}
}
lib/llm/src/discovery/watcher.rs
View file @
d2aad651
...
...
@@ -336,6 +336,15 @@ impl ModelWatcher {
"Removed WorkerSet (no remaining instances in namespace)"
);
}
// If the removed component was a prefill worker, deactivate the decode-side
// prefill router so requests fall back to aggregated mode (or fail cleanly
// with enforce_disagg). The decode WorkerSet's namespace matches the
// deployment namespace, not the ws_key.
if
card
.model_type
.supports_prefill
()
{
self
.manager
.deactivate_prefill_router_for_decode
(
&
model_name
,
worker_namespace
);
}
}
// Check if the Model still has instances in any namespace
...
...
@@ -542,9 +551,12 @@ impl ModelWatcher {
self
.router_config.load_threshold_config
.clone
(),
));
// Store KV router and worker monitor on the WorkerSet
// Store KV router, worker monitor, and prefill router on the WorkerSet.
// The prefill router is stored so the watcher can deactivate/reactivate it
// when prefill workers die or rejoin.
worker_set
.kv_router
=
kv_chooser
.clone
();
worker_set
.worker_monitor
=
worker_monitor
.clone
();
worker_set
.prefill_router
=
prefill_chooser
.clone
();
// Add chat engine only if the model supports chat
if
card
.model_type
.supports_chat
()
{
...
...
lib/llm/src/discovery/worker_set.rs
View file @
d2aad651
...
...
@@ -11,7 +11,7 @@ use tokio::sync::watch;
use
crate
::{
discovery
::
KvWorkerMonitor
,
kv_router
::
KvRouter
,
kv_router
::
{
KvRouter
,
PrefillRouter
},
model_card
::
ModelDeploymentCard
,
types
::{
generic
::
tensor
::
TensorStreamingEngine
,
...
...
@@ -51,6 +51,10 @@ pub struct WorkerSet {
/// Worker monitor for load-based rejection
pub
(
crate
)
worker_monitor
:
Option
<
KvWorkerMonitor
>
,
/// Prefill router for disaggregated serving. Stored here so the watcher can
/// deactivate it when all prefill workers die, and reactivate when they rejoin.
pub
(
crate
)
prefill_router
:
Option
<
Arc
<
PrefillRouter
>>
,
/// Watcher for available instance IDs (from the Client's discovery watch).
/// None for in-process models (http/grpc) which don't have a discovery client.
instance_count_rx
:
Option
<
watch
::
Receiver
<
Vec
<
u64
>>>
,
...
...
@@ -71,6 +75,7 @@ impl WorkerSet {
tensor_engine
:
None
,
kv_router
:
None
,
worker_monitor
:
None
,
prefill_router
:
None
,
instance_count_rx
:
None
,
}
}
...
...
@@ -152,6 +157,16 @@ impl WorkerSet {
pub
fn
set_instance_watcher
(
&
mut
self
,
rx
:
watch
::
Receiver
<
Vec
<
u64
>>
)
{
self
.instance_count_rx
=
Some
(
rx
);
}
/// Whether this WorkerSet can serve requests. Delegates to the prefill router
/// if one exists; otherwise always returns true.
/// When the prefill router is deactivated and enforce_disagg is set, this returns
/// false, causing the model to be hidden from /v1/models and requests to be rejected.
pub
fn
can_serve_requests
(
&
self
)
->
bool
{
self
.prefill_router
.as_ref
()
.is_none_or
(|
pr
|
pr
.can_serve_requests
())
}
}
#[cfg(test)]
...
...
lib/llm/src/entrypoint/input/common.rs
View file @
d2aad651
...
...
@@ -136,7 +136,26 @@ pub async fn prepare_engine(
let
model_service_name
=
watch_obj
.wait_for_chat_model
()
.await
;
tracing
::
info!
(
"Connected to {model_service_name}"
);
let
engine
=
model_manager
.get_chat_completions_engine
(
&
model_service_name
)
?
;
// In disaggregated deployments the model may be listed before the prefill
// router is fully activated, causing a transient ModelUnavailable. Retry
// with a timeout so the startup path doesn't fail during this cold-start
// window, but also doesn't hang indefinitely on misconfiguration.
let
deadline
=
tokio
::
time
::
Instant
::
now
()
+
Duration
::
from_secs
(
120
);
let
engine
=
loop
{
match
model_manager
.get_chat_completions_engine
(
&
model_service_name
)
{
Ok
(
engine
)
=>
break
engine
,
Err
(
crate
::
discovery
::
ModelManagerError
::
ModelUnavailable
(
_
))
if
tokio
::
time
::
Instant
::
now
()
<
deadline
=>
{
tracing
::
debug!
(
model
=
%
model_service_name
,
"Model listed but not yet servable, waiting for prefill activation"
);
tokio
::
time
::
sleep
(
Duration
::
from_millis
(
500
))
.await
;
}
Err
(
e
)
=>
return
Err
(
e
.into
()),
}
};
Ok
(
PreparedEngine
{
service_name
:
model_service_name
,
engine
,
...
...
lib/llm/src/grpc/service/openai.rs
View file @
d2aad651
...
...
@@ -85,7 +85,12 @@ pub async fn completion_response_stream(
let
(
engine
,
parsing_options
)
=
state
.manager
()
.get_completions_engine_with_parsing
(
model
)
.map_err
(|
_
|
Status
::
not_found
(
"model not found"
))
?
;
.map_err
(|
e
|
match
e
{
crate
::
discovery
::
ModelManagerError
::
ModelUnavailable
(
_
)
=>
{
Status
::
unavailable
(
"model temporarily unavailable"
)
}
_
=>
Status
::
not_found
(
"model not found"
),
})
?
;
let
http_queue_guard
=
state
.metrics_clone
()
.create_http_queue_guard
(
model
);
...
...
lib/llm/src/grpc/service/tensor.rs
View file @
d2aad651
...
...
@@ -86,7 +86,12 @@ pub async fn tensor_response_stream(
let
engine
=
state
.manager
()
.get_tensor_engine
(
model
)
.map_err
(|
_
|
Status
::
not_found
(
"model not found"
))
?
;
.map_err
(|
e
|
match
e
{
crate
::
discovery
::
ModelManagerError
::
ModelUnavailable
(
_
)
=>
{
Status
::
unavailable
(
"model temporarily unavailable"
)
}
_
=>
Status
::
not_found
(
"model not found"
),
})
?
;
let
http_queue_guard
=
state
.metrics_clone
()
.create_http_queue_guard
(
model
);
...
...
lib/llm/src/http/service/openai.rs
View file @
d2aad651
...
...
@@ -143,6 +143,29 @@ impl ErrorMessage {
)
}
/// Model exists but is temporarily unable to serve (e.g., prefill not activated,
/// no available workers). Returns 503 so clients can retry.
pub
fn
model_unavailable
()
->
ErrorResponse
{
let
code
=
StatusCode
::
SERVICE_UNAVAILABLE
;
let
error_type
=
map_error_code_to_error_type
(
code
);
(
code
,
Json
(
ErrorMessage
{
message
:
"Model temporarily unavailable"
.to_string
(),
error_type
,
code
:
code
.as_u16
(),
}),
)
}
/// Convert a ModelManagerError to the appropriate HTTP response.
pub
fn
from_model_error
(
e
:
&
crate
::
discovery
::
ModelManagerError
)
->
ErrorResponse
{
match
e
{
crate
::
discovery
::
ModelManagerError
::
ModelUnavailable
(
_
)
=>
Self
::
model_unavailable
(),
_
=>
Self
::
model_not_found
(),
}
}
/// Service Unavailable
/// This is returned when the service is live, but not ready.
pub
fn
_
service_unavailable
()
->
ErrorResponse
{
...
...
@@ -469,8 +492,8 @@ async fn completions_single(
let
(
engine
,
parsing_options
)
=
state
.manager
()
.get_completions_engine_with_parsing
(
&
model
)
.map_err
(|
_
|
{
let
err_response
=
ErrorMessage
::
model_
not_found
(
);
.map_err
(|
e
|
{
let
err_response
=
ErrorMessage
::
from_
model_
error
(
&
e
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
})
?
;
...
...
@@ -609,8 +632,8 @@ async fn completions_batch(
let
(
engine
,
parsing_options
)
=
state
.manager
()
.get_completions_engine_with_parsing
(
&
model
)
.map_err
(|
_
|
{
let
err_response
=
ErrorMessage
::
model_
not_found
(
);
.map_err
(|
e
|
{
let
err_response
=
ErrorMessage
::
from_
model_
error
(
&
e
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
})
?
;
...
...
@@ -790,8 +813,8 @@ async fn embeddings(
let
http_queue_guard
=
state
.metrics_clone
()
.create_http_queue_guard
(
model
);
// todo - error handling should be more robust
let
engine
=
state
.manager
()
.get_embeddings_engine
(
model
)
.map_err
(|
_
|
{
let
err_response
=
ErrorMessage
::
model_
not_found
(
);
let
engine
=
state
.manager
()
.get_embeddings_engine
(
model
)
.map_err
(|
e
|
{
let
err_response
=
ErrorMessage
::
from_
model_
error
(
&
e
);
inflight
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
})
?
;
...
...
@@ -1200,8 +1223,8 @@ async fn chat_completions(
let
(
engine
,
parsing_options
)
=
state
.manager
()
.get_chat_completions_engine_with_parsing
(
&
model
)
.map_err
(|
_
|
{
let
err_response
=
ErrorMessage
::
model_
not_found
(
);
.map_err
(|
e
|
{
let
err_response
=
ErrorMessage
::
from_
model_
error
(
&
e
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
})
?
;
...
...
@@ -1612,8 +1635,8 @@ async fn responses(
let
(
engine
,
parsing_options
)
=
state
.manager
()
.get_chat_completions_engine_with_parsing
(
&
model
)
.map_err
(|
_
|
{
let
err_response
=
ErrorMessage
::
model_
not_found
(
);
.map_err
(|
e
|
{
let
err_response
=
ErrorMessage
::
from_
model_
error
(
&
e
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
})
?
;
...
...
@@ -2065,7 +2088,7 @@ async fn images(
let
engine
=
state
.manager
()
.get_images_engine
(
&
model
)
.map_err
(|
_
|
ErrorMessage
::
model_
not_found
(
))
?
;
.map_err
(|
e
|
ErrorMessage
::
from_
model_
error
(
&
e
))
?
;
// this will increment the inflight gauge for the model
let
mut
inflight
=
state
.metrics_clone
()
.create_inflight_guard
(
...
...
@@ -2183,7 +2206,7 @@ async fn videos(
let
engine
=
state
.manager
()
.get_videos_engine
(
&
model
)
.map_err
(|
_
|
ErrorMessage
::
model_
not_found
(
))
?
;
.map_err
(|
e
|
ErrorMessage
::
from_
model_
error
(
&
e
))
?
;
// this will increment the inflight gauge for the model
let
mut
inflight
=
state
.metrics_clone
()
.create_inflight_guard
(
...
...
@@ -2256,7 +2279,7 @@ async fn video_stream(
let
engine
=
state
.manager
()
.get_videos_engine
(
&
model
)
.map_err
(|
_
|
ErrorMessage
::
model_
not_found
(
))
?
;
.map_err
(|
e
|
ErrorMessage
::
from_
model_
error
(
&
e
))
?
;
let
mut
inflight
=
state
...
...
@@ -2432,7 +2455,7 @@ async fn audio_speech(
let
engine
=
state
.manager
()
.get_audios_engine
(
&
model
)
.map_err
(|
_
|
ErrorMessage
::
model_
not_found
(
))
?
;
.map_err
(|
e
|
ErrorMessage
::
from_
model_
error
(
&
e
))
?
;
let
mut
inflight
=
state
.metrics_clone
()
.create_inflight_guard
(
&
model
,
...
...
@@ -3397,6 +3420,30 @@ mod tests {
);
}
#[test]
fn
test_extract_error_type_from_response_unavailable
()
{
let
response
=
ErrorMessage
::
model_unavailable
();
assert_eq!
(
extract_error_type_from_response
(
&
response
),
ErrorType
::
Overload
);
}
#[test]
fn
test_from_model_error_maps_correctly
()
{
let
not_found
=
ModelManagerError
::
ModelNotFound
(
"x"
.to_string
());
assert_eq!
(
ErrorMessage
::
from_model_error
(
&
not_found
)
.0
,
StatusCode
::
NOT_FOUND
);
let
unavailable
=
ModelManagerError
::
ModelUnavailable
(
"x"
.to_string
());
assert_eq!
(
ErrorMessage
::
from_model_error
(
&
unavailable
)
.0
,
StatusCode
::
SERVICE_UNAVAILABLE
);
}
#[test]
fn
test_extract_error_type_from_response_internal
()
{
let
response
=
ErrorMessage
::
internal_server_error
(
"Something went wrong"
);
...
...
lib/llm/src/kv_router/prefill_router/activation.rs
View file @
d2aad651
...
...
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
use
std
::
sync
::
Arc
;
use
std
::
sync
::
atomic
::
Ordering
;
use
anyhow
::
Result
;
use
tokio
::
sync
::
oneshot
;
...
...
@@ -41,6 +42,8 @@ impl PrefillRouter {
model_name
:
String
::
new
(),
// Not used for disabled router
namespace
:
String
::
new
(),
// Not used for disabled router
is_eagle
:
false
,
deactivated
:
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
),
activated
:
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
),
})
}
...
...
@@ -71,6 +74,8 @@ impl PrefillRouter {
model_name
,
namespace
,
is_eagle
,
deactivated
:
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
),
activated
:
std
::
sync
::
atomic
::
AtomicBool
::
new
(
false
),
});
// Spawn background task to wait for activation
...
...
@@ -175,6 +180,7 @@ impl PrefillRouter {
// Set the router (ignore error if already set)
let
_
=
self
.prefill_router
.set
(
inner_router
);
self
.activated
.store
(
true
,
Ordering
::
Release
);
tracing
::
info!
(
router_mode
=
?
self
.router_mode
,
...
...
@@ -191,4 +197,70 @@ impl PrefillRouter {
monitor
.set_prefill_client
(
client
.clone
());
}
}
// -- Prefill death handling --
/// Deactivate the prefill router. Called when all prefill workers are removed.
/// After deactivation, requests fall back to aggregated mode (or fail if enforce_disagg).
/// The inner router is preserved so that when workers rejoin (same endpoint/discovery),
/// the Client's discovery subscription picks them up automatically.
pub
fn
deactivate
(
&
self
)
{
self
.deactivated
.store
(
true
,
Ordering
::
Release
);
tracing
::
info!
(
model_name
=
%
self
.model_name
,
namespace
=
%
self
.namespace
,
enforce_disagg
=
self
.enforce_disagg
,
"Prefill router deactivated (all prefill workers removed)"
);
}
/// Reactivate a deactivated router. Called when prefill workers rejoin.
/// The inner router's Client re-discovers workers via its discovery subscription.
///
/// Note: there is a brief race between flipping `deactivated=false` (making
/// `can_serve_requests()` return true) and the Client actually rediscovering
/// workers. Requests arriving in this window may fail at prefill resolution.
/// This is bounded by discovery propagation time (typically sub-second).
///
/// Also note: reactivation reuses the existing inner router built from the
/// original endpoint. If prefill rejoins under a different endpoint identity
/// (e.g., reconfigured deployment), the stale Client would not discover the
/// new workers. This is acceptable for normal restart scenarios where the
/// endpoint identity is stable.
pub
fn
reactivate
(
&
self
)
{
self
.deactivated
.store
(
false
,
Ordering
::
Release
);
tracing
::
info!
(
model_name
=
%
self
.model_name
,
namespace
=
%
self
.namespace
,
"Prefill router reactivated (prefill workers rejoined)"
);
}
/// Whether this router is currently deactivated (prefill workers died).
pub
fn
is_deactivated
(
&
self
)
->
bool
{
self
.deactivated
.load
(
Ordering
::
Acquire
)
}
/// Whether this router can serve requests in its current state.
/// - !enforce_disagg (aggregated passthrough): always servable unless deactivated
/// - enforce_disagg: only servable when prefill has activated AND is not deactivated,
/// so a cold-started strict-disagg model isn't listed before prefill rendezvoused.
pub
fn
can_serve_requests
(
&
self
)
->
bool
{
if
self
.is_deactivated
()
{
return
!
self
.enforce_disagg
;
}
if
!
self
.enforce_disagg
{
return
true
;
}
self
.activated
.load
(
Ordering
::
Acquire
)
}
/// Mark this router as activated for testing purposes.
/// In production, `activate()` sets this flag when the inner router is populated.
#[cfg(test)]
pub
(
crate
)
fn
mark_activated_for_test
(
&
self
)
{
self
.activated
.store
(
true
,
Ordering
::
Release
);
}
}
lib/llm/src/kv_router/prefill_router/execution.rs
View file @
d2aad651
...
...
@@ -312,9 +312,10 @@ impl PrefillRouter {
}
}
/// Check if disaggregated mode is currently active (prefill router activated)
/// Check if disaggregated mode is currently active (prefill router activated).
/// Uses the same `activated` flag as `can_serve_requests()` for consistency.
pub
fn
is_activated
(
&
self
)
->
bool
{
self
.
prefill_router
.get
()
.is_some
(
)
self
.
activated
.load
(
std
::
sync
::
atomic
::
Ordering
::
Acquire
)
}
/// Whether disaggregated mode is strictly enforced (fail if no prefill workers).
...
...
lib/llm/src/kv_router/prefill_router/mod.rs
View file @
d2aad651
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
std
::
sync
::
atomic
::{
AtomicBool
,
Ordering
};
use
std
::
sync
::{
Arc
,
OnceLock
};
use
anyhow
::
Result
;
...
...
@@ -53,6 +54,13 @@ pub struct PrefillRouter {
/// Namespace used to look up the correct WorkerSet's worker monitor
namespace
:
String
,
is_eagle
:
bool
,
/// Set to true when all prefill workers die. Checked in generate() to prevent
/// routing to dead workers. Cleared on reactivation when workers rejoin.
deactivated
:
AtomicBool
,
/// Set to true when the prefill router has been activated (inner router populated).
/// Used by `can_serve_requests()` to gate enforce_disagg readiness so a cold-started
/// strict-disagg model isn't listed before the prefill has rendezvoused.
activated
:
AtomicBool
,
}
impl
Drop
for
PrefillRouter
{
...
...
@@ -84,10 +92,10 @@ impl
// Save original max_tokens for decode
let
original_max_tokens
=
req
.stop_conditions.max_tokens
;
// If prefill router is not activated (no prefill workers discovered)
,
// this is aggregated mode
—
route directly
to decode.
// With --enforce-disagg, fail instead of falling back.
if
self
.prefill_router
.get
()
.is_none
()
{
// If prefill router is not activated (no prefill workers discovered)
or has been
//
deactivated (all prefill workers died),
this is aggregated mode
--
route directly
//
to decode.
With --enforce-disagg, fail instead of falling back.
if
self
.prefill_router
.get
()
.is_none
()
||
self
.deactivated
.load
(
Ordering
::
Relaxed
)
{
if
self
.enforce_disagg
{
return
Err
(
anyhow
::
anyhow!
(
PrefillError
::
NotActivated
));
}
...
...
@@ -269,4 +277,102 @@ mod tests {
assert_eq!
(
override_config
.track_prefill_tokens
,
Some
(
false
));
assert_eq!
(
override_config
.router_temperature
,
Some
(
0.7
));
}
// -- Prefill death handling tests --
/// Helper: create a disabled PrefillRouter for testing deactivation behavior.
fn
make_test_router
(
enforce_disagg
:
bool
)
->
Arc
<
PrefillRouter
>
{
PrefillRouter
::
disabled
(
Arc
::
new
(
crate
::
discovery
::
ModelManager
::
new
()),
RouterMode
::
RoundRobin
,
enforce_disagg
,
)
}
#[test]
fn
test_deactivated_flag_blocks_when_enforce_disagg
()
{
let
router
=
make_test_router
(
true
);
// Not activated, so enforce_disagg blocks even before deactivation
assert
!
(
!
router
.can_serve_requests
(),
"enforce_disagg must block before prefill activation"
);
router
.deactivate
();
assert
!
(
router
.is_deactivated
());
assert
!
(
!
router
.can_serve_requests
(),
"deactivated + enforce_disagg must block"
);
}
#[test]
fn
test_deactivated_flag_allows_fallback_no_enforce
()
{
let
router
=
make_test_router
(
false
);
router
.deactivate
();
assert
!
(
router
.is_deactivated
());
assert
!
(
router
.can_serve_requests
(),
"deactivated + !enforce_disagg must allow fallback"
);
}
#[test]
fn
test_reactivate_clears_deactivated_no_enforce
()
{
let
router
=
make_test_router
(
false
);
router
.deactivate
();
// !enforce_disagg allows fallback even while deactivated
assert
!
(
router
.can_serve_requests
());
router
.reactivate
();
assert
!
(
!
router
.is_deactivated
());
assert
!
(
router
.can_serve_requests
(),
"reactivated non-enforce router must serve requests"
);
}
#[test]
fn
test_reactivate_clears_deactivated_enforce_needs_activation
()
{
// disabled() never sets the activated flag, so enforce_disagg stays blocked.
// In a real deployment, activate() sets the flag before the first
// deactivate/reactivate cycle, so this only exercises the flag reset.
let
router
=
make_test_router
(
true
);
router
.deactivate
();
assert
!
(
!
router
.can_serve_requests
());
router
.reactivate
();
assert
!
(
!
router
.is_deactivated
());
assert
!
(
!
router
.can_serve_requests
(),
"enforce_disagg without activation still can't serve"
);
}
#[test]
fn
test_fresh_router_not_deactivated
()
{
let
router
=
make_test_router
(
true
);
assert
!
(
!
router
.is_deactivated
());
// enforce_disagg + no prefill activation => not servable
assert
!
(
!
router
.can_serve_requests
());
}
#[test]
fn
test_fresh_router_no_enforce_disagg_can_serve
()
{
let
router
=
make_test_router
(
false
);
assert
!
(
!
router
.is_deactivated
());
assert
!
(
router
.can_serve_requests
(),
"non-enforce_disagg router must be servable even without prefill activation"
);
}
#[test]
fn
test_deactivate_is_idempotent
()
{
let
router
=
make_test_router
(
true
);
router
.deactivate
();
router
.deactivate
();
assert
!
(
router
.is_deactivated
());
assert
!
(
!
router
.can_serve_requests
());
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment