Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b579a772
Unverified
Commit
b579a772
authored
Apr 09, 2026
by
Yan Ru Pei
Committed by
GitHub
Apr 09, 2026
Browse files
fix(llm): preserve unresolved dp rank routing (#8000)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
f3d3a8b3
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
259 additions
and
138 deletions
+259
-138
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
...src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+3
-2
deploy/inference-gateway/epp/pkg/plugins/disagg/prefill_scorer.go
...nference-gateway/epp/pkg/plugins/disagg/prefill_scorer.go
+5
-2
deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
...erence-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
+4
-0
lib/llm/src/discovery/worker_monitor.rs
lib/llm/src/discovery/worker_monitor.rs
+6
-0
lib/llm/src/http/service/metrics.rs
lib/llm/src/http/service/metrics.rs
+3
-2
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+9
-0
lib/llm/src/kv_router/prefill_router/execution.rs
lib/llm/src/kv_router/prefill_router/execution.rs
+4
-5
lib/llm/src/kv_router/prefill_router/mod.rs
lib/llm/src/kv_router/prefill_router/mod.rs
+1
-1
lib/llm/src/kv_router/push_router.rs
lib/llm/src/kv_router/push_router.rs
+102
-63
lib/llm/src/protocols/common/timing.rs
lib/llm/src/protocols/common/timing.rs
+120
-63
lib/llm/src/protocols/openai/nvext.rs
lib/llm/src/protocols/openai/nvext.rs
+2
-0
No files found.
components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
View file @
b579a772
...
@@ -13,8 +13,9 @@ from dynamo.sglang.args import Config
...
@@ -13,8 +13,9 @@ from dynamo.sglang.args import Config
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
from
dynamo.sglang.request_handlers.handler_base
import
BaseWorkerHandler
# Sentinel value matching u32::MAX from prefill_router.rs SimpleRouter path,
# Sentinel value matching u32::MAX from the C/Go prefill-routing ABI.
# indicating no specific data-parallel rank was selected.
# This remains as a compatibility fallback for older callers that still encode
# an unresolved data-parallel rank in-band instead of omitting the field.
_DP_RANK_UNSET
=
2
**
32
-
1
_DP_RANK_UNSET
=
2
**
32
-
1
...
...
deploy/inference-gateway/epp/pkg/plugins/disagg/prefill_scorer.go
View file @
b579a772
...
@@ -120,7 +120,6 @@ func (s *DynPrefillScorer) Score(ctx context.Context, cycleState *schedtypes.Cyc
...
@@ -120,7 +120,6 @@ func (s *DynPrefillScorer) Score(ctx context.Context, cycleState *schedtypes.Cyc
}
}
prefillWorkerID
:=
strconv
.
FormatUint
(
result
.
WorkerID
,
10
)
prefillWorkerID
:=
strconv
.
FormatUint
(
result
.
WorkerID
,
10
)
prefillDpRank
:=
strconv
.
FormatUint
(
uint64
(
result
.
DpRank
),
10
)
logger
.
V
(
logutil
.
DEFAULT
)
.
Info
(
"DynPrefillScorer: prefill worker selected"
,
logger
.
V
(
logutil
.
DEFAULT
)
.
Info
(
"DynPrefillScorer: prefill worker selected"
,
"prefillWorkerID"
,
prefillWorkerID
,
"prefillWorkerID"
,
prefillWorkerID
,
"prefillDpRank"
,
result
.
DpRank
,
"prefillDpRank"
,
result
.
DpRank
,
...
@@ -134,7 +133,11 @@ func (s *DynPrefillScorer) Score(ctx context.Context, cycleState *schedtypes.Cyc
...
@@ -134,7 +133,11 @@ func (s *DynPrefillScorer) Score(ctx context.Context, cycleState *schedtypes.Cyc
req
.
Headers
=
map
[
string
]
string
{}
req
.
Headers
=
map
[
string
]
string
{}
}
}
req
.
Headers
[
PrefillWorkerIDHeader
]
=
prefillWorkerID
req
.
Headers
[
PrefillWorkerIDHeader
]
=
prefillWorkerID
req
.
Headers
[
PrefillDpRankHeader
]
=
prefillDpRank
if
result
.
DpRank
!=
dynscorer
.
UnsetDpRank
{
req
.
Headers
[
PrefillDpRankHeader
]
=
strconv
.
FormatUint
(
uint64
(
result
.
DpRank
),
10
)
}
else
{
delete
(
req
.
Headers
,
PrefillDpRankHeader
)
}
// Score: 1.0 for all pods. The label-filter has already restricted to prefill workers,
// Score: 1.0 for all pods. The label-filter has already restricted to prefill workers,
// and the FFI router's internal selection is authoritative.
// and the FFI router's internal selection is authoritative.
...
...
deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
View file @
b579a772
...
@@ -124,6 +124,10 @@ var (
...
@@ -124,6 +124,10 @@ var (
routerHandlesMutex
sync
.
RWMutex
routerHandlesMutex
sync
.
RWMutex
)
)
// UnsetDpRank is the ABI sentinel used by the Rust C bindings when a prefill
// route selected a worker but left the DP rank unresolved.
const
UnsetDpRank
=
^
uint32
(
0
)
func
loadDynamoConfig
()
{
func
loadDynamoConfig
()
{
ffiNamespace
=
getEnvOrDefault
(
"DYN_NAMESPACE_PREFIX"
,
getEnvOrDefault
(
"DYN_NAMESPACE"
,
"vllm-agg"
))
ffiNamespace
=
getEnvOrDefault
(
"DYN_NAMESPACE_PREFIX"
,
getEnvOrDefault
(
"DYN_NAMESPACE"
,
"vllm-agg"
))
ffiComponent
=
"backend"
// This is not the same as DYN_COMPONENT=epp (in this case)
ffiComponent
=
"backend"
// This is not the same as DYN_COMPONENT=epp (in this case)
...
...
lib/llm/src/discovery/worker_monitor.rs
View file @
b579a772
...
@@ -27,6 +27,7 @@ use dynamo_runtime::transports::event_plane::EventSubscriber;
...
@@ -27,6 +27,7 @@ use dynamo_runtime::transports::event_plane::EventSubscriber;
// Re-export worker type constants from timing.rs (single source of truth)
// Re-export worker type constants from timing.rs (single source of truth)
pub
use
crate
::
protocols
::
common
::
timing
::{
WORKER_TYPE_DECODE
,
WORKER_TYPE_PREFILL
};
pub
use
crate
::
protocols
::
common
::
timing
::{
WORKER_TYPE_DECODE
,
WORKER_TYPE_PREFILL
};
const
UNSET_DP_RANK_LABEL
:
&
str
=
"none"
;
/// Clean up all Prometheus metrics for a worker across the specified dp_ranks.
/// Clean up all Prometheus metrics for a worker across the specified dp_ranks.
///
///
...
@@ -44,6 +45,11 @@ fn cleanup_worker_metrics(worker_id: u64, dp_ranks: &[u32], worker_type: &str) {
...
@@ -44,6 +45,11 @@ fn cleanup_worker_metrics(worker_id: u64, dp_ranks: &[u32], worker_type: &str) {
let
_
=
WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
.remove_label_values
(
labels
);
let
_
=
WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
.remove_label_values
(
labels
);
let
_
=
WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
.remove_label_values
(
labels
);
let
_
=
WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
.remove_label_values
(
labels
);
}
}
let
unset_labels
=
&
[
worker_id_str
.as_str
(),
UNSET_DP_RANK_LABEL
,
worker_type
];
let
_
=
WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE
.remove_label_values
(
unset_labels
);
let
_
=
WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
.remove_label_values
(
unset_labels
);
let
_
=
WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
.remove_label_values
(
unset_labels
);
}
}
/// Scale factor for storing f64 thresholds as u32 (10000 = 4 decimal places)
/// Scale factor for storing f64 thresholds as u32 (10000 = 4 decimal places)
...
...
lib/llm/src/http/service/metrics.rs
View file @
b579a772
...
@@ -42,6 +42,7 @@ use super::RouteDoc;
...
@@ -42,6 +42,7 @@ use super::RouteDoc;
/// Worker type label values for Prometheus timing metrics
/// Worker type label values for Prometheus timing metrics
pub
use
crate
::
discovery
::{
WORKER_TYPE_DECODE
,
WORKER_TYPE_PREFILL
};
pub
use
crate
::
discovery
::{
WORKER_TYPE_DECODE
,
WORKER_TYPE_PREFILL
};
const
UNSET_DP_RANK_LABEL
:
&
str
=
"none"
;
/// Global Prometheus gauge for last observed TTFT per worker (in seconds)
/// Global Prometheus gauge for last observed TTFT per worker (in seconds)
/// Labels: worker_id, dp_rank, worker_type
/// Labels: worker_id, dp_rank, worker_type
...
@@ -1342,7 +1343,7 @@ impl ResponseMetricCollector {
...
@@ -1342,7 +1343,7 @@ impl ResponseMetricCollector {
let
worker_id_str
=
worker_id
.to_string
();
let
worker_id_str
=
worker_id
.to_string
();
let
dp_rank_str
=
self
let
dp_rank_str
=
self
.prefill_dp_rank
.prefill_dp_rank
.map_or
(
"0"
.to_string
(),
|
r
|
r
.to_string
());
.map_or
(
UNSET_DP_RANK_LABEL
.to_string
(),
|
r
|
r
.to_string
());
let
worker_type
=
self
let
worker_type
=
self
.prefill_worker_type
.prefill_worker_type
.as_deref
()
.as_deref
()
...
@@ -1385,7 +1386,7 @@ impl ResponseMetricCollector {
...
@@ -1385,7 +1386,7 @@ impl ResponseMetricCollector {
let
worker_id_str
=
worker_id
.to_string
();
let
worker_id_str
=
worker_id
.to_string
();
let
dp_rank_str
=
self
let
dp_rank_str
=
self
.decode_dp_rank
.decode_dp_rank
.map_or
(
"0"
.to_string
(),
|
r
|
r
.to_string
());
.map_or
(
UNSET_DP_RANK_LABEL
.to_string
(),
|
r
|
r
.to_string
());
let
worker_type
=
self
let
worker_type
=
self
.decode_worker_type
.decode_worker_type
.as_deref
()
.as_deref
()
...
...
lib/llm/src/kv_router.rs
View file @
b579a772
...
@@ -131,6 +131,7 @@ where
...
@@ -131,6 +131,7 @@ where
{
{
indexer
:
Indexer
,
indexer
:
Indexer
,
scheduler
:
KvScheduler
<
Sel
>
,
scheduler
:
KvScheduler
<
Sel
>
,
workers_with_configs
:
RuntimeConfigWatch
,
block_size
:
u32
,
block_size
:
u32
,
kv_router_config
:
KvRouterConfig
,
kv_router_config
:
KvRouterConfig
,
prefill_load_estimator
:
Option
<
Arc
<
dyn
PrefillLoadEstimator
>>
,
prefill_load_estimator
:
Option
<
Arc
<
dyn
PrefillLoadEstimator
>>
,
...
@@ -230,6 +231,7 @@ where
...
@@ -230,6 +231,7 @@ where
Ok
(
Self
{
Ok
(
Self
{
indexer
,
indexer
,
scheduler
,
scheduler
,
workers_with_configs
,
block_size
,
block_size
,
kv_router_config
,
kv_router_config
,
prefill_load_estimator
,
prefill_load_estimator
,
...
@@ -473,6 +475,13 @@ where
...
@@ -473,6 +475,13 @@ where
self
.scheduler
.worker_type
()
self
.scheduler
.worker_type
()
}
}
/// Return the worker's unique global DP rank when it owns exactly one rank.
pub
fn
unique_dp_rank_for_worker
(
&
self
,
worker_id
:
WorkerId
)
->
Option
<
u32
>
{
let
configs
=
self
.workers_with_configs
.borrow
();
let
config
=
configs
.get
(
&
worker_id
)
?
;
(
config
.data_parallel_size
==
1
)
.then_some
(
config
.data_parallel_start_rank
)
}
pub
fn
add_output_block
(
pub
fn
add_output_block
(
&
self
,
&
self
,
request_id
:
&
str
,
request_id
:
&
str
,
...
...
lib/llm/src/kv_router/prefill_router/execution.rs
View file @
b579a772
...
@@ -123,7 +123,7 @@ impl PrefillRouter {
...
@@ -123,7 +123,7 @@ impl PrefillRouter {
///
///
/// If `phase_transition_permit` is provided, it is dropped immediately after routing completes,
/// If `phase_transition_permit` is provided, it is dropped immediately after routing completes,
/// allowing subsequent `set_phase` calls to proceed. This preserves the current synchronization:
/// allowing subsequent `set_phase` calls to proceed. This preserves the current synchronization:
/// the prefill route must finish
`record_worker_full`
before the phase can change to Decode.
/// the prefill route must finish
worker recording
before the phase can change to Decode.
///
///
/// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
/// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
pub
(
super
)
async
fn
execute_prefill
(
pub
(
super
)
async
fn
execute_prefill
(
...
@@ -131,7 +131,7 @@ impl PrefillRouter {
...
@@ -131,7 +131,7 @@ impl PrefillRouter {
request
:
SingleIn
<
PreprocessedRequest
>
,
request
:
SingleIn
<
PreprocessedRequest
>
,
target_worker
:
Option
<
u64
>
,
target_worker
:
Option
<
u64
>
,
phase_transition_permit
:
Option
<
OwnedSemaphorePermit
>
,
phase_transition_permit
:
Option
<
OwnedSemaphorePermit
>
,
)
->
Result
<
(
PrefillResult
,
Option
<
(
u64
,
u32
)
>
),
PrefillError
>
{
)
->
Result
<
(
PrefillResult
,
Option
<
(
u64
,
Option
<
u32
>
)
>
),
PrefillError
>
{
let
router
=
router
.ok_or
(
PrefillError
::
NotActivated
)
?
;
let
router
=
router
.ok_or
(
PrefillError
::
NotActivated
)
?
;
let
mut
prefill_response
=
router
let
mut
prefill_response
=
router
.generate_to_worker
(
request
,
target_worker
)
.generate_to_worker
(
request
,
target_worker
)
...
@@ -143,7 +143,7 @@ impl PrefillRouter {
...
@@ -143,7 +143,7 @@ impl PrefillRouter {
)
)
})
?
;
})
?
;
// Release the phase barrier now that routing completed and
record_worker_full
already ran.
// Release the phase barrier now that routing completed and
worker recording
already ran.
// Decode may proceed without waiting for prefill output streaming to finish.
// Decode may proceed without waiting for prefill output streaming to finish.
drop
(
phase_transition_permit
);
drop
(
phase_transition_permit
);
...
@@ -201,8 +201,7 @@ impl PrefillRouter {
...
@@ -201,8 +201,7 @@ impl PrefillRouter {
let
dp_rank
=
worker_id_json
let
dp_rank
=
worker_id_json
.get
(
"prefill_dp_rank"
)
.get
(
"prefill_dp_rank"
)
.and_then
(|
v
|
v
.as_u64
())
.and_then
(|
v
|
v
.as_u64
())
.map
(|
r
|
r
as
u32
)
.map
(|
r
|
r
as
u32
);
.unwrap_or
(
0
);
Some
((
worker_id
,
dp_rank
))
Some
((
worker_id
,
dp_rank
))
});
});
Ok
((
Ok
((
...
...
lib/llm/src/kv_router/prefill_router/mod.rs
View file @
b579a772
...
@@ -148,7 +148,7 @@ impl
...
@@ -148,7 +148,7 @@ impl
link_child_context
(
&
engine_ctx
,
prefill_req
,
request_id
.as_str
());
link_child_context
(
&
engine_ctx
,
prefill_req
,
request_id
.as_str
());
// Pass the phase barrier to the spawned task. It is released after routing
// Pass the phase barrier to the spawned task. It is released after routing
// completes so
`record_worker_full`
finishes before phase changes to Decode.
// completes so
worker recording
finishes before phase changes to Decode.
self
.spawn_prefill_task
(
prefill_context
,
Some
(
worker_id
),
prefill_phase_barrier
);
self
.spawn_prefill_task
(
prefill_context
,
Some
(
worker_id
),
prefill_phase_barrier
);
Ok
(
PrefillOutcome
::
Bootstrap
(
bootstrap_info
))
Ok
(
PrefillOutcome
::
Bootstrap
(
bootstrap_info
))
...
...
lib/llm/src/kv_router/push_router.rs
View file @
b579a772
...
@@ -34,8 +34,9 @@ pub struct KvPushRouter {
...
@@ -34,8 +34,9 @@ pub struct KvPushRouter {
/// Result of worker selection containing instance ID, dp_rank, and overlap amount.
/// Result of worker selection containing instance ID, dp_rank, and overlap amount.
struct
WorkerSelection
{
struct
WorkerSelection
{
instance_id
:
u64
,
instance_id
:
u64
,
dp_rank
:
u32
,
backend_dp_rank
:
Option
<
u32
>
,
overlap_amount
:
u32
,
bookkeeping_dp_rank
:
Option
<
u32
>
,
overlap_amount
:
Option
<
u32
>
,
}
}
/// Drop guard that manages the full lifecycle of a routed request:
/// Drop guard that manages the full lifecycle of a routed request:
...
@@ -46,6 +47,7 @@ struct WorkerSelection {
...
@@ -46,6 +47,7 @@ struct WorkerSelection {
/// `Drop` impl fires and spawns a task to call `free()`.
/// `Drop` impl fires and spawns a task to call `free()`.
struct
RequestGuard
{
struct
RequestGuard
{
chooser
:
Arc
<
KvRouter
>
,
chooser
:
Arc
<
KvRouter
>
,
scheduler_tracked
:
bool
,
context_id
:
String
,
context_id
:
String
,
tracker
:
Option
<
Arc
<
RequestTracker
>>
,
tracker
:
Option
<
Arc
<
RequestTracker
>>
,
request_metrics
:
Arc
<
RouterRequestMetrics
>
,
request_metrics
:
Arc
<
RouterRequestMetrics
>
,
...
@@ -70,7 +72,9 @@ impl RequestGuard {
...
@@ -70,7 +72,9 @@ impl RequestGuard {
.map
(|
d
|
!
d
.token_ids
.is_empty
())
.map
(|
d
|
!
d
.token_ids
.is_empty
())
.unwrap_or
(
false
);
.unwrap_or
(
false
);
if
has_tokens
{
if
has_tokens
{
if
let
Err
(
e
)
=
self
.chooser
.mark_prefill_completed
(
&
self
.context_id
)
.await
{
if
self
.scheduler_tracked
&&
let
Err
(
e
)
=
self
.chooser
.mark_prefill_completed
(
&
self
.context_id
)
.await
{
tracing
::
warn!
(
tracing
::
warn!
(
"Failed to mark prefill completed for request {}: {e}"
,
"Failed to mark prefill completed for request {}: {e}"
,
self
.context_id
self
.context_id
...
@@ -130,7 +134,9 @@ impl RequestGuard {
...
@@ -130,7 +134,9 @@ impl RequestGuard {
async
fn
finish
(
&
mut
self
)
{
async
fn
finish
(
&
mut
self
)
{
self
.record_metrics
();
self
.record_metrics
();
if
let
Err
(
e
)
=
self
.chooser
.free
(
&
self
.context_id
)
.await
{
if
self
.scheduler_tracked
&&
let
Err
(
e
)
=
self
.chooser
.free
(
&
self
.context_id
)
.await
{
tracing
::
warn!
(
"Failed to free request {}: {e}"
,
self
.context_id
);
tracing
::
warn!
(
"Failed to free request {}: {e}"
,
self
.context_id
);
}
}
self
.freed
=
true
;
self
.freed
=
true
;
...
@@ -155,7 +161,7 @@ impl RequestGuard {
...
@@ -155,7 +161,7 @@ impl RequestGuard {
impl
Drop
for
RequestGuard
{
impl
Drop
for
RequestGuard
{
fn
drop
(
&
mut
self
)
{
fn
drop
(
&
mut
self
)
{
self
.record_metrics
();
self
.record_metrics
();
if
!
self
.freed
{
if
!
self
.freed
&&
self
.scheduler_tracked
{
let
chooser
=
self
.chooser
.clone
();
let
chooser
=
self
.chooser
.clone
();
let
context_id
=
self
.context_id
.clone
();
let
context_id
=
self
.context_id
.clone
();
let
Ok
(
handle
)
=
tokio
::
runtime
::
Handle
::
try_current
()
else
{
let
Ok
(
handle
)
=
tokio
::
runtime
::
Handle
::
try_current
()
else
{
...
@@ -198,7 +204,6 @@ impl KvPushRouter {
...
@@ -198,7 +204,6 @@ impl KvPushRouter {
let
routing
=
request
.routing
.as_ref
();
let
routing
=
request
.routing
.as_ref
();
let
lora_name
=
routing
.and_then
(|
r
|
r
.lora_name
.clone
());
let
lora_name
=
routing
.and_then
(|
r
|
r
.lora_name
.clone
());
let
priority_jump
=
routing
.and_then
(|
r
|
r
.priority_jump
)
.unwrap_or
(
0.0
);
let
priority_jump
=
routing
.and_then
(|
r
|
r
.priority_jump
)
.unwrap_or
(
0.0
);
let
dp_rank
=
routing
.and_then
(|
r
|
r
.dp_rank
)
.unwrap_or
(
0
);
let
expected_output_tokens
=
routing
.and_then
(|
r
|
r
.expected_output_tokens
);
let
expected_output_tokens
=
routing
.and_then
(|
r
|
r
.expected_output_tokens
);
let
allowed_worker_ids
=
routing
.and_then
(|
r
|
r
.allowed_worker_ids
.clone
());
let
allowed_worker_ids
=
routing
.and_then
(|
r
|
r
.allowed_worker_ids
.clone
());
let
(
routing_token_ids
,
block_mm_infos
)
=
request
.block_mm_routing_info
();
let
(
routing_token_ids
,
block_mm_infos
)
=
request
.block_mm_routing_info
();
...
@@ -213,6 +218,10 @@ impl KvPushRouter {
...
@@ -213,6 +218,10 @@ impl KvPushRouter {
}
}
RequestPhase
::
Aggregated
=>
routing
.and_then
(|
r
|
r
.backend_instance_id
),
RequestPhase
::
Aggregated
=>
routing
.and_then
(|
r
|
r
.backend_instance_id
),
};
};
let
requested_dp_rank
=
match
phase
{
RequestPhase
::
Prefill
=>
routing
.and_then
(|
r
|
r
.prefill_dp_rank
.or
(
r
.dp_rank
)),
RequestPhase
::
Decode
|
RequestPhase
::
Aggregated
=>
routing
.and_then
(|
r
|
r
.dp_rank
),
};
let
Some
(
id
)
=
preselected_id
else
{
let
Some
(
id
)
=
preselected_id
else
{
let
_
nvtx_kv
=
dynamo_nvtx_range!
(
"route.kv_match"
);
let
_
nvtx_kv
=
dynamo_nvtx_range!
(
"route.kv_match"
);
...
@@ -254,55 +263,72 @@ impl KvPushRouter {
...
@@ -254,55 +263,72 @@ impl KvPushRouter {
return
Ok
(
WorkerSelection
{
return
Ok
(
WorkerSelection
{
instance_id
:
best_worker
.worker_id
,
instance_id
:
best_worker
.worker_id
,
dp_rank
:
best_worker
.dp_rank
,
backend_dp_rank
:
Some
(
best_worker
.dp_rank
),
overlap_amount
,
bookkeeping_dp_rank
:
Some
(
best_worker
.dp_rank
),
overlap_amount
:
Some
(
overlap_amount
),
});
});
};
};
let
backend_dp_rank
=
requested_dp_rank
.or_else
(||
self
.chooser
.unique_dp_rank_for_worker
(
id
));
tracing
::
debug!
(
tracing
::
debug!
(
worker_id
=
id
,
worker_id
=
id
,
dp_rank
=
dp_rank
,
dp_rank
=
?
backend_
dp_rank
,
?
phase
,
?
phase
,
"Routing to specified worker"
"Routing to specified worker"
);
);
let
worker
=
WorkerWithDpRank
::
new
(
id
,
dp_rank
);
let
(
bookkeeping_dp_rank
,
overlap_amount
)
=
if
let
Some
(
dp_rank
)
=
backend_dp_rank
{
let
overlap_blocks
=
self
let
worker
=
WorkerWithDpRank
::
new
(
id
,
dp_rank
);
.chooser
let
overlap_blocks
=
self
.get_overlap_blocks
(
.chooser
routing_token_ids
,
.get_overlap_blocks
(
block_mm_infos
,
worker
,
lora_name
.as_deref
(),
)
.await
?
;
if
!
is_query_only
{
self
.chooser
.add_request
(
context_id
.to_string
(),
routing_token_ids
,
routing_token_ids
,
block_mm_infos
,
block_mm_infos
,
overlap_blocks
,
expected_output_tokens
,
worker
,
worker
,
lora_name
,
lora_name
.as_deref
(),
request
.router_config_override
.as_ref
(),
)
)
.await
;
.await
?
;
if
!
is_query_only
{
self
.chooser
.add_request
(
context_id
.to_string
(),
routing_token_ids
,
block_mm_infos
,
overlap_blocks
,
expected_output_tokens
,
worker
,
lora_name
,
request
.router_config_override
.as_ref
(),
)
.await
;
}
else
{
tracing
::
debug!
(
request_id
=
%
context_id
,
worker_id
=
id
,
dp_rank
=
dp_rank
,
"Skipping add_request - query-only request"
);
}
(
Some
(
dp_rank
),
Some
(
overlap_blocks
))
}
else
{
}
else
{
tracing
::
debug!
(
tracing
::
debug!
(
request_id
=
%
context_id
,
request_id
=
%
context_id
,
worker_id
=
id
,
worker_id
=
id
,
dp_rank
=
dp_rank
,
?
phase
,
"
Skipping add_request - query or handled externally
"
"
Routing to specified worker without resolved dp_rank; skipping scheduler bookkeeping
"
);
);
}
(
None
,
None
)
};
Ok
(
WorkerSelection
{
Ok
(
WorkerSelection
{
instance_id
:
id
,
instance_id
:
id
,
dp_rank
,
backend_dp_rank
,
overlap_amount
:
overlap_blocks
,
bookkeeping_dp_rank
,
overlap_amount
,
})
})
}
}
}
}
...
@@ -354,37 +380,47 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -354,37 +380,47 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
.await
?
;
.await
?
;
let
WorkerSelection
{
let
WorkerSelection
{
instance_id
,
instance_id
,
dp_rank
,
backend_dp_rank
,
bookkeeping_dp_rank
,
overlap_amount
,
overlap_amount
,
}
=
selection
;
}
=
selection
;
let
scheduler_tracked
=
!
is_query_only
&&
bookkeeping_dp_rank
.is_some
();
// In approximate mode (use_kv_events=false), record the routing decision
// In approximate mode (use_kv_events=false), record the routing decision
// so the indexer can track cache state based on routing decisions.
// so the indexer can track cache state based on routing decisions.
// This covers both pre-selected workers and find_best_match selections.
// This covers both pre-selected workers and find_best_match selections.
if
!
is_query_only
&&
!
self
.chooser
.kv_router_config
()
.use_kv_events
{
if
!
is_query_only
&&
!
self
.chooser
.kv_router_config
()
.use_kv_events
{
let
lora_name
=
request
.routing
.as_ref
()
.and_then
(|
r
|
r
.lora_name
.clone
());
if
let
Some
(
dp_rank
)
=
bookkeeping_dp_rank
{
let
(
routing_token_ids
,
block_mm_infos
)
=
request
.block_mm_routing_info
();
let
lora_name
=
request
.routing
.as_ref
()
.and_then
(|
r
|
r
.lora_name
.clone
());
let
worker
=
WorkerWithDpRank
::
new
(
instance_id
,
dp_rank
);
let
(
routing_token_ids
,
block_mm_infos
)
=
request
.block_mm_routing_info
();
let
mut
tokens_with_hashes
=
let
worker
=
WorkerWithDpRank
::
new
(
instance_id
,
dp_rank
);
TokensWithHashes
::
new
(
routing_token_ids
.to_vec
(),
self
.chooser
.block_size
())
let
mut
tokens_with_hashes
=
.with_is_eagle
(
self
.chooser
.is_eagle
());
TokensWithHashes
::
new
(
routing_token_ids
.to_vec
(),
self
.chooser
.block_size
())
if
let
Some
(
infos
)
=
block_mm_infos
{
.with_is_eagle
(
self
.chooser
.is_eagle
());
tokens_with_hashes
=
tokens_with_hashes
.with_mm_infos
(
infos
.to_vec
());
if
let
Some
(
infos
)
=
block_mm_infos
{
}
tokens_with_hashes
=
tokens_with_hashes
.with_mm_infos
(
infos
.to_vec
());
if
let
Some
(
lora_name
)
=
lora_name
{
}
tokens_with_hashes
=
tokens_with_hashes
.with_lora_name
(
lora_name
);
if
let
Some
(
lora_name
)
=
lora_name
{
}
tokens_with_hashes
=
tokens_with_hashes
.with_lora_name
(
lora_name
);
if
let
Err
(
e
)
=
self
}
.chooser
if
let
Err
(
e
)
=
self
.record_routing_decision
(
tokens_with_hashes
,
worker
)
.chooser
.await
.record_routing_decision
(
tokens_with_hashes
,
worker
)
{
.await
tracing
::
warn!
(
{
tracing
::
warn!
(
request_id
=
%
context_id
,
worker_id
=
instance_id
,
dp_rank
=
dp_rank
,
error
=
%
e
,
"Failed to record routing decision in approximate mode"
);
}
}
else
{
tracing
::
debug!
(
request_id
=
%
context_id
,
request_id
=
%
context_id
,
worker_id
=
instance_id
,
worker_id
=
instance_id
,
dp_rank
=
dp_rank
,
"Skipping approximate-mode routing decision for unresolved dp_rank"
error
=
%
e
,
"Failed to record routing decision in approximate mode"
);
);
}
}
}
}
...
@@ -395,12 +431,14 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -395,12 +431,14 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
if
let
Some
(
ref
tracker
)
=
request
.tracker
{
if
let
Some
(
ref
tracker
)
=
request
.tracker
{
let
(
routing_token_ids
,
_
)
=
request
.block_mm_routing_info
();
let
(
routing_token_ids
,
_
)
=
request
.block_mm_routing_info
();
let
isl_blocks
=
routing_token_ids
.len
()
.div_ceil
(
block_size
);
let
isl_blocks
=
routing_token_ids
.len
()
.div_ceil
(
block_size
);
tracker
.record_kv_hit
(
overlap_amount
,
isl_blocks
);
if
let
Some
(
overlap_amount
)
=
overlap_amount
{
tracker
.record_kv_hit
(
overlap_amount
,
isl_blocks
);
}
tracker
.record_isl
(
tracker
.record_isl
(
routing_token_ids
.len
(),
routing_token_ids
.len
(),
overlap_amount
as
usize
*
block_size
,
overlap_amount
.map
(|
overlap
|
overlap
as
usize
*
block_size
)
,
);
);
tracker
.record_worker
_full
(
instance_id
,
dp_rank
,
self
.chooser
.worker_type
());
tracker
.record_worker
(
instance_id
,
backend_
dp_rank
,
self
.chooser
.worker_type
());
tracker
.record_router_queue_depth
(
self
.chooser
.pending_count
());
tracker
.record_router_queue_depth
(
self
.chooser
.pending_count
());
if
let
Some
(
hit_rate
)
=
tracker
.kv_hit_rate
()
{
if
let
Some
(
hit_rate
)
=
tracker
.kv_hit_rate
()
{
request_metrics
.kv_hit_rate
.observe
(
hit_rate
);
request_metrics
.kv_hit_rate
.observe
(
hit_rate
);
...
@@ -444,7 +482,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -444,7 +482,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
let
tracker
=
request
.tracker
.clone
();
let
tracker
=
request
.tracker
.clone
();
let
(
mut
backend_input
,
context
)
=
request
.into_parts
();
let
(
mut
backend_input
,
context
)
=
request
.into_parts
();
backend_input
.routing_mut
()
.dp_rank
=
Some
(
dp_rank
)
;
backend_input
.routing_mut
()
.dp_rank
=
backend_
dp_rank
;
let
updated_request
=
context
.map
(|
_
|
backend_input
);
let
updated_request
=
context
.map
(|
_
|
backend_input
);
// Record prefill start right before pushing to backend (OnceLock: first call wins).
// Record prefill start right before pushing to backend (OnceLock: first call wins).
...
@@ -460,8 +498,8 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -460,8 +498,8 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
"kv_router.route_request"
,
"kv_router.route_request"
,
request_id
=
%
context_id
,
request_id
=
%
context_id
,
worker_id
=
instance_id
,
worker_id
=
instance_id
,
dp_rank
=
dp_rank
,
dp_rank
=
?
backend_
dp_rank
,
overlap_blocks
=
overlap_amount
,
overlap_blocks
=
?
overlap_amount
,
phase
=
?
phase
,
phase
=
?
phase
,
))
))
.await
?
;
.await
?
;
...
@@ -471,6 +509,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -471,6 +509,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
mut
guard
=
RequestGuard
{
let
mut
guard
=
RequestGuard
{
chooser
:
chooser
.clone
(),
chooser
:
chooser
.clone
(),
scheduler_tracked
,
context_id
:
context_id
.clone
(),
context_id
:
context_id
.clone
(),
tracker
:
tracker
.clone
(),
tracker
:
tracker
.clone
(),
request_metrics
:
request_metrics
.clone
(),
request_metrics
:
request_metrics
.clone
(),
...
@@ -479,7 +518,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -479,7 +518,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
freed
:
false
,
freed
:
false
,
prefill_marked
:
false
,
prefill_marked
:
false
,
first_token_recorded
:
false
,
first_token_recorded
:
false
,
track_output_blocks
,
track_output_blocks
:
scheduler_tracked
&&
track_output_blocks
,
current_total_blocks
:
isl_tokens
.div_ceil
(
block_size
),
current_total_blocks
:
isl_tokens
.div_ceil
(
block_size
),
isl_tokens
,
isl_tokens
,
block_size
,
block_size
,
...
...
lib/llm/src/protocols/common/timing.rs
View file @
b579a772
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
use
std
::
sync
::
Arc
;
use
std
::
sync
::
Arc
;
use
std
::
sync
::
OnceLock
;
use
std
::
sync
::
OnceLock
;
use
std
::
sync
::
atomic
::{
AtomicU32
,
AtomicU64
,
Ordering
};
use
std
::
sync
::
atomic
::{
AtomicU64
,
Ordering
};
use
std
::
time
::{
Duration
,
Instant
,
SystemTime
,
UNIX_EPOCH
};
use
std
::
time
::{
Duration
,
Instant
,
SystemTime
,
UNIX_EPOCH
};
use
parking_lot
::
Mutex
;
use
parking_lot
::
Mutex
;
...
@@ -22,16 +22,12 @@ use crate::http::service::metrics::{
...
@@ -22,16 +22,12 @@ use crate::http::service::metrics::{
};
};
use
crate
::
protocols
::
openai
::
nvext
::
WorkerIdInfo
;
use
crate
::
protocols
::
openai
::
nvext
::
WorkerIdInfo
;
/// Sentinel value indicating no worker ID has been set.
/// We use 0 as the sentinel since valid worker IDs are non-zero lease IDs from etcd.
const
NO_WORKER_ID
:
u64
=
0
;
const
NO_DP_RANK
:
u32
=
u32
::
MAX
;
/// Worker type constants for Prometheus metric labels.
/// Worker type constants for Prometheus metric labels.
/// These are stored in RequestTracker at routing time to avoid costly MDC lookups
/// These are stored in RequestTracker at routing time to avoid costly MDC lookups
/// when updating per-worker metrics (TTFT, ITL).
/// when updating per-worker metrics (TTFT, ITL).
pub
const
WORKER_TYPE_PREFILL
:
&
str
=
"prefill"
;
pub
const
WORKER_TYPE_PREFILL
:
&
str
=
"prefill"
;
pub
const
WORKER_TYPE_DECODE
:
&
str
=
"decode"
;
pub
const
WORKER_TYPE_DECODE
:
&
str
=
"decode"
;
const
UNSET_DP_RANK_LABEL
:
&
str
=
"none"
;
/// Phase of the request in disaggregated serving.
/// Phase of the request in disaggregated serving.
///
///
...
@@ -81,8 +77,8 @@ impl std::fmt::Display for RequestPhase {
...
@@ -81,8 +77,8 @@ impl std::fmt::Display for RequestPhase {
/// phase's final finish naturally overwrites the prefill phase's earlier finish.
/// phase's final finish naturally overwrites the prefill phase's earlier finish.
/// `phase` also uses a Mutex since it transitions across phases.
/// `phase` also uses a Mutex since it transitions across phases.
///
///
/// **`AtomicU64`
/`AtomicU32`
:** Used for frequently updated counters (`osl_tokens`)
/// **`AtomicU64`:** Used for frequently updated counters (`osl_tokens`)
and
/// a
nd worker IDs/ranks where `OnceLock`'s heap overhead is unnecessary
.
/// a
ccumulated detokenize timing, where lock-free updates are beneficial
.
#[derive(Debug)]
#[derive(Debug)]
pub
struct
RequestTracker
{
pub
struct
RequestTracker
{
/// When the request was received (monotonic clock for duration calculations)
/// When the request was received (monotonic clock for duration calculations)
...
@@ -118,19 +114,17 @@ pub struct RequestTracker {
...
@@ -118,19 +114,17 @@ pub struct RequestTracker {
/// Output sequence length in tokens - updated atomically as tokens stream back
/// Output sequence length in tokens - updated atomically as tokens stream back
osl_tokens
:
AtomicU64
,
osl_tokens
:
AtomicU64
,
/// Prefill worker ID (for disaggregated serving).
/// Prefill worker ID (for disaggregated serving) - set once when known.
/// Uses atomic with compare-exchange for set-once semantics.
prefill_worker_id
:
OnceLock
<
u64
>
,
/// Value of 0 (NO_WORKER_ID) means not yet set.
prefill_worker_id
:
AtomicU64
,
/// Prefill DP rank
. Value of u32::MAX (NO_DP_RANK) means not yet set
.
/// Prefill DP rank
- set once when known
.
prefill_dp_rank
:
AtomicU
32
,
prefill_dp_rank
:
OnceLock
<
u
32
>
,
/// Decode worker ID
. Value of 0 (NO_WORKER_ID) means not yet set
.
/// Decode worker ID
- set once when known
.
decode_worker_id
:
AtomicU
64
,
decode_worker_id
:
OnceLock
<
u
64
>
,
/// Decode DP rank
. Value of u32::MAX (NO_DP_RANK) means not yet set
.
/// Decode DP rank
- set once when known
.
decode_dp_rank
:
AtomicU
32
,
decode_dp_rank
:
OnceLock
<
u
32
>
,
/// Worker type for the prefill worker ("prefill" or "decode").
/// Worker type for the prefill worker ("prefill" or "decode").
/// Stored at routing time to avoid MDC lookup when updating Prometheus metrics.
/// Stored at routing time to avoid MDC lookup when updating Prometheus metrics.
...
@@ -149,7 +143,7 @@ pub struct RequestTracker {
...
@@ -149,7 +143,7 @@ pub struct RequestTracker {
/// Semaphore for coordinating phase transitions.
/// Semaphore for coordinating phase transitions.
/// Acquiring a permit blocks subsequent set_phase calls until the permit is dropped.
/// Acquiring a permit blocks subsequent set_phase calls until the permit is dropped.
/// This prevents race conditions in the bootstrap optimization path where prefill
/// This prevents race conditions in the bootstrap optimization path where prefill
/// runs in background and needs to complete
record_worker_full
before phase changes.
/// runs in background and needs to complete
worker recording
before phase changes.
phase_semaphore
:
Arc
<
Semaphore
>
,
phase_semaphore
:
Arc
<
Semaphore
>
,
/// How long it took to tokenize the input
/// How long it took to tokenize the input
...
@@ -185,10 +179,10 @@ impl RequestTracker {
...
@@ -185,10 +179,10 @@ impl RequestTracker {
isl_tokens
:
OnceLock
::
new
(),
isl_tokens
:
OnceLock
::
new
(),
cached_tokens
:
OnceLock
::
new
(),
cached_tokens
:
OnceLock
::
new
(),
osl_tokens
:
AtomicU64
::
new
(
0
),
osl_tokens
:
AtomicU64
::
new
(
0
),
prefill_worker_id
:
AtomicU64
::
new
(
NO_WORKER_ID
),
prefill_worker_id
:
OnceLock
::
new
(
),
prefill_dp_rank
:
AtomicU32
::
new
(
NO_DP_RANK
),
prefill_dp_rank
:
OnceLock
::
new
(
),
decode_worker_id
:
AtomicU64
::
new
(
NO_WORKER_ID
),
decode_worker_id
:
OnceLock
::
new
(
),
decode_dp_rank
:
AtomicU32
::
new
(
NO_DP_RANK
),
decode_dp_rank
:
OnceLock
::
new
(
),
prefill_worker_type
:
OnceLock
::
new
(),
prefill_worker_type
:
OnceLock
::
new
(),
decode_worker_type
:
OnceLock
::
new
(),
decode_worker_type
:
OnceLock
::
new
(),
phase
:
Mutex
::
new
(
RequestPhase
::
Aggregated
),
phase
:
Mutex
::
new
(
RequestPhase
::
Aggregated
),
...
@@ -220,10 +214,12 @@ impl RequestTracker {
...
@@ -220,10 +214,12 @@ impl RequestTracker {
overlap_set
&&
isl_set
overlap_set
&&
isl_set
}
}
/// Record input sequence length in tokens and cached token count.
/// Record input sequence length in tokens and cached token count
when known
.
pub
fn
record_isl
(
&
self
,
isl_tokens
:
usize
,
cached_tokens
:
usize
)
{
pub
fn
record_isl
(
&
self
,
isl_tokens
:
usize
,
cached_tokens
:
Option
<
usize
>
)
{
let
_
=
self
.isl_tokens
.set
(
isl_tokens
);
let
_
=
self
.isl_tokens
.set
(
isl_tokens
);
let
_
=
self
.cached_tokens
.set
(
cached_tokens
);
if
let
Some
(
cached_tokens
)
=
cached_tokens
{
let
_
=
self
.cached_tokens
.set
(
cached_tokens
);
}
}
}
pub
fn
isl_tokens
(
&
self
)
->
Option
<
usize
>
{
pub
fn
isl_tokens
(
&
self
)
->
Option
<
usize
>
{
...
@@ -321,31 +317,96 @@ impl RequestTracker {
...
@@ -321,31 +317,96 @@ impl RequestTracker {
*
self
.phase
.lock
()
*
self
.phase
.lock
()
}
}
/// Record worker ID, DP rank, and worker type based on the current phase.
fn
record_once_u64
(
slot
:
&
OnceLock
<
u64
>
,
value
:
u64
,
field_name
:
&
'static
str
)
{
///
if
let
Some
(
existing
)
=
slot
.get
()
{
/// Each slot is written exactly once by `KvPushRouter::generate()`:
if
*
existing
!=
value
{
/// - Prefill phase: stores as prefill worker
tracing
::
error!
(
/// - Decode phase: stores as decode worker
field
=
field_name
,
/// - Aggregated phase: stores as both prefill and decode worker
existing
=
*
existing
,
pub
fn
record_worker_full
(
&
self
,
instance_id
:
u64
,
dp_rank
:
u32
,
worker_type
:
&
'static
str
)
{
new
=
value
,
match
self
.phase
()
{
"Conflicting request tracker write"
RequestPhase
::
Prefill
=>
{
);
self
.prefill_worker_id
.store
(
instance_id
,
Ordering
::
Relaxed
);
}
self
.prefill_dp_rank
.store
(
dp_rank
,
Ordering
::
Relaxed
);
return
;
let
_
=
self
.prefill_worker_type
.set
(
worker_type
);
}
let
_
=
slot
.set
(
value
);
}
fn
record_once_u32
(
slot
:
&
OnceLock
<
u32
>
,
value
:
u32
,
field_name
:
&
'static
str
)
{
if
let
Some
(
existing
)
=
slot
.get
()
{
if
*
existing
!=
value
{
tracing
::
error!
(
field
=
field_name
,
existing
=
*
existing
,
new
=
value
,
"Conflicting request tracker write"
);
}
}
RequestPhase
::
Decode
=>
{
return
;
self
.decode_worker_id
.store
(
instance_id
,
Ordering
::
Relaxed
);
}
self
.decode_dp_rank
.store
(
dp_rank
,
Ordering
::
Relaxed
);
let
_
=
slot
.set
(
value
);
let
_
=
self
.decode_worker_type
.set
(
worker_type
);
}
fn
record_once_worker_type
(
slot
:
&
OnceLock
<&
'static
str
>
,
value
:
&
'static
str
,
field_name
:
&
'static
str
,
)
{
if
let
Some
(
existing
)
=
slot
.get
()
{
if
*
existing
!=
value
{
tracing
::
error!
(
field
=
field_name
,
existing
=
*
existing
,
new
=
value
,
"Conflicting request tracker write"
);
}
}
return
;
}
let
_
=
slot
.set
(
value
);
}
/// Store the prefill-side worker identity in the tracker's set-once slots.
///
/// The worker ID and worker type are always offered to their slots. The DP
/// rank is offered only when the caller supplies a concrete value, so an
/// unresolved rank leaves `prefill_dp_rank` untouched. Each `record_once_*`
/// helper keeps the first value written and logs an error when a later write
/// disagrees with it.
fn record_prefill_worker(&self, instance_id: u64, dp_rank: Option<u32>, worker_type: &'static str) {
    let id_slot = &self.prefill_worker_id;
    Self::record_once_u64(id_slot, instance_id, "prefill_worker_id");

    // `None` means the rank is not known yet; leave the slot empty.
    if let Some(concrete_rank) = dp_rank {
        let rank_slot = &self.prefill_dp_rank;
        Self::record_once_u32(rank_slot, concrete_rank, "prefill_dp_rank");
    }

    let type_slot = &self.prefill_worker_type;
    Self::record_once_worker_type(type_slot, worker_type, "prefill_worker_type");
}
/// Store the decode-side worker identity in the tracker's set-once slots.
///
/// The worker ID and worker type are always offered to their slots. The DP
/// rank is offered only when the caller supplies a concrete value, so an
/// unresolved rank leaves `decode_dp_rank` untouched. Each `record_once_*`
/// helper keeps the first value written and logs an error when a later write
/// disagrees with it.
fn record_decode_worker(&self, instance_id: u64, dp_rank: Option<u32>, worker_type: &'static str) {
    let id_slot = &self.decode_worker_id;
    Self::record_once_u64(id_slot, instance_id, "decode_worker_id");

    // `None` means the rank is not known yet; leave the slot empty.
    if let Some(concrete_rank) = dp_rank {
        let rank_slot = &self.decode_dp_rank;
        Self::record_once_u32(rank_slot, concrete_rank, "decode_dp_rank");
    }

    let type_slot = &self.decode_worker_type;
    Self::record_once_worker_type(type_slot, worker_type, "decode_worker_type");
}
/// Record worker ID, optional DP rank, and worker type based on the current phase.
///
/// Worker ID and type are recorded as soon as they are known. DP rank is recorded only
/// when it is concrete, allowing the unresolved rank to remain unset until later.
pub
fn
record_worker
(
&
self
,
instance_id
:
u64
,
dp_rank
:
Option
<
u32
>
,
worker_type
:
&
'static
str
)
{
match
self
.phase
()
{
RequestPhase
::
Prefill
=>
self
.record_prefill_worker
(
instance_id
,
dp_rank
,
worker_type
),
RequestPhase
::
Decode
=>
self
.record_decode_worker
(
instance_id
,
dp_rank
,
worker_type
),
RequestPhase
::
Aggregated
=>
{
RequestPhase
::
Aggregated
=>
{
self
.prefill_worker_id
.store
(
instance_id
,
Ordering
::
Relaxed
);
self
.record_prefill_worker
(
instance_id
,
dp_rank
,
worker_type
);
self
.prefill_dp_rank
.store
(
dp_rank
,
Ordering
::
Relaxed
);
self
.record_decode_worker
(
instance_id
,
dp_rank
,
worker_type
);
let
_
=
self
.prefill_worker_type
.set
(
worker_type
);
self
.decode_worker_id
.store
(
instance_id
,
Ordering
::
Relaxed
);
self
.decode_dp_rank
.store
(
dp_rank
,
Ordering
::
Relaxed
);
let
_
=
self
.decode_worker_type
.set
(
worker_type
);
}
}
}
}
}
}
...
@@ -415,26 +476,22 @@ impl RequestTracker {
...
@@ -415,26 +476,22 @@ impl RequestTracker {
/// Get the decode worker ID if recorded.
/// Get the decode worker ID if recorded.
pub
fn
decode_worker_id
(
&
self
)
->
Option
<
u64
>
{
pub
fn
decode_worker_id
(
&
self
)
->
Option
<
u64
>
{
let
id
=
self
.decode_worker_id
.load
(
Ordering
::
SeqCst
);
self
.decode_worker_id
.get
()
.copied
()
if
id
==
NO_WORKER_ID
{
None
}
else
{
Some
(
id
)
}
}
}
/// Get the decode DP rank if recorded.
/// Get the decode DP rank if recorded.
pub
fn
decode_dp_rank
(
&
self
)
->
Option
<
u32
>
{
pub
fn
decode_dp_rank
(
&
self
)
->
Option
<
u32
>
{
let
rank
=
self
.decode_dp_rank
.load
(
Ordering
::
SeqCst
);
self
.decode_dp_rank
.get
()
.copied
()
if
rank
==
NO_DP_RANK
{
None
}
else
{
Some
(
rank
)
}
}
}
/// Get the prefill worker ID if recorded.
/// Get the prefill worker ID if recorded.
pub
fn
prefill_worker_id
(
&
self
)
->
Option
<
u64
>
{
pub
fn
prefill_worker_id
(
&
self
)
->
Option
<
u64
>
{
let
id
=
self
.prefill_worker_id
.load
(
Ordering
::
SeqCst
);
self
.prefill_worker_id
.get
()
.copied
()
if
id
==
NO_WORKER_ID
{
None
}
else
{
Some
(
id
)
}
}
}
/// Get the prefill DP rank if recorded.
/// Get the prefill DP rank if recorded.
pub
fn
prefill_dp_rank
(
&
self
)
->
Option
<
u32
>
{
pub
fn
prefill_dp_rank
(
&
self
)
->
Option
<
u32
>
{
let
rank
=
self
.prefill_dp_rank
.load
(
Ordering
::
SeqCst
);
self
.prefill_dp_rank
.get
()
.copied
()
if
rank
==
NO_DP_RANK
{
None
}
else
{
Some
(
rank
)
}
}
}
/// Get the prefill worker type if recorded.
/// Get the prefill worker type if recorded.
...
@@ -456,7 +513,7 @@ impl RequestTracker {
...
@@ -456,7 +513,7 @@ impl RequestTracker {
let
worker_id_str
=
worker_id
.to_string
();
let
worker_id_str
=
worker_id
.to_string
();
let
dp_rank_str
=
self
let
dp_rank_str
=
self
.prefill_dp_rank
()
.prefill_dp_rank
()
.map_or
(
"0"
.to_string
(),
|
r
|
r
.to_string
());
.map_or
(
UNSET_DP_RANK_LABEL
.to_string
(),
|
r
|
r
.to_string
());
let
worker_type
=
self
.prefill_worker_type
()
.unwrap_or
(
WORKER_TYPE_PREFILL
);
let
worker_type
=
self
.prefill_worker_type
()
.unwrap_or
(
WORKER_TYPE_PREFILL
);
let
labels
=
&
[
worker_id_str
.as_str
(),
dp_rank_str
.as_str
(),
worker_type
];
let
labels
=
&
[
worker_id_str
.as_str
(),
dp_rank_str
.as_str
(),
worker_type
];
...
@@ -481,7 +538,7 @@ impl RequestTracker {
...
@@ -481,7 +538,7 @@ impl RequestTracker {
let
worker_id_str
=
worker_id
.to_string
();
let
worker_id_str
=
worker_id
.to_string
();
let
dp_rank_str
=
self
let
dp_rank_str
=
self
.decode_dp_rank
()
.decode_dp_rank
()
.map_or
(
"0"
.to_string
(),
|
r
|
r
.to_string
());
.map_or
(
UNSET_DP_RANK_LABEL
.to_string
(),
|
r
|
r
.to_string
());
let
worker_type
=
self
.decode_worker_type
()
.unwrap_or
(
WORKER_TYPE_DECODE
);
let
worker_type
=
self
.decode_worker_type
()
.unwrap_or
(
WORKER_TYPE_DECODE
);
let
labels
=
&
[
worker_id_str
.as_str
(),
dp_rank_str
.as_str
(),
worker_type
];
let
labels
=
&
[
worker_id_str
.as_str
(),
dp_rank_str
.as_str
(),
worker_type
];
...
@@ -555,7 +612,7 @@ mod tests {
...
@@ -555,7 +612,7 @@ mod tests {
fn
test_record_isl_osl
()
{
fn
test_record_isl_osl
()
{
let
tracker
=
RequestTracker
::
new
();
let
tracker
=
RequestTracker
::
new
();
tracker
.record_isl
(
512
,
256
);
tracker
.record_isl
(
512
,
Some
(
256
)
)
;
assert_eq!
(
tracker
.isl_tokens
(),
Some
(
512
));
assert_eq!
(
tracker
.isl_tokens
(),
Some
(
512
));
assert_eq!
(
tracker
.cached_tokens
(),
Some
(
256
));
assert_eq!
(
tracker
.cached_tokens
(),
Some
(
256
));
...
@@ -659,7 +716,7 @@ mod tests {
...
@@ -659,7 +716,7 @@ mod tests {
fn
test_observe_first_token_gauges_no_panic_without_worker
()
{
fn
test_observe_first_token_gauges_no_panic_without_worker
()
{
let
tracker
=
RequestTracker
::
new
();
let
tracker
=
RequestTracker
::
new
();
tracker
.record_first_token
();
tracker
.record_first_token
();
tracker
.record_isl
(
100
,
50
);
tracker
.record_isl
(
100
,
Some
(
50
)
)
;
// No worker recorded — should return early without panic
// No worker recorded — should return early without panic
tracker
.observe_first_token_gauges
();
tracker
.observe_first_token_gauges
();
}
}
...
@@ -677,10 +734,10 @@ mod tests {
...
@@ -677,10 +734,10 @@ mod tests {
#[test]
#[test]
fn
test_observe_first_token_gauges_with_worker
()
{
fn
test_observe_first_token_gauges_with_worker
()
{
let
tracker
=
RequestTracker
::
new
();
let
tracker
=
RequestTracker
::
new
();
tracker
.record_worker
_full
(
42
,
0
,
WORKER_TYPE_PREFILL
);
tracker
.record_worker
(
42
,
Some
(
0
)
,
WORKER_TYPE_PREFILL
);
thread
::
sleep
(
Duration
::
from_millis
(
5
));
thread
::
sleep
(
Duration
::
from_millis
(
5
));
tracker
.record_first_token
();
tracker
.record_first_token
();
tracker
.record_isl
(
256
,
128
);
tracker
.record_isl
(
256
,
Some
(
128
)
)
;
tracker
.observe_first_token_gauges
();
tracker
.observe_first_token_gauges
();
...
@@ -702,7 +759,7 @@ mod tests {
...
@@ -702,7 +759,7 @@ mod tests {
#[test]
#[test]
fn
test_observe_finish_gauges_with_worker
()
{
fn
test_observe_finish_gauges_with_worker
()
{
let
tracker
=
RequestTracker
::
new
();
let
tracker
=
RequestTracker
::
new
();
tracker
.record_worker
_full
(
99
,
1
,
WORKER_TYPE_DECODE
);
tracker
.record_worker
(
99
,
Some
(
1
)
,
WORKER_TYPE_DECODE
);
tracker
.record_first_token
();
tracker
.record_first_token
();
thread
::
sleep
(
Duration
::
from_millis
(
10
));
thread
::
sleep
(
Duration
::
from_millis
(
10
));
tracker
.record_osl
(
5
);
tracker
.record_osl
(
5
);
...
...
lib/llm/src/protocols/openai/nvext.rs
View file @
b579a772
...
@@ -13,6 +13,7 @@ pub const HEADER_WORKER_INSTANCE_ID: &str = "x-worker-instance-id";
...
@@ -13,6 +13,7 @@ pub const HEADER_WORKER_INSTANCE_ID: &str = "x-worker-instance-id";
pub
const
HEADER_PREFILL_INSTANCE_ID
:
&
str
=
"x-prefill-instance-id"
;
pub
const
HEADER_PREFILL_INSTANCE_ID
:
&
str
=
"x-prefill-instance-id"
;
pub
const
HEADER_DP_RANK
:
&
str
=
"x-dp-rank"
;
pub
const
HEADER_DP_RANK
:
&
str
=
"x-dp-rank"
;
pub
const
HEADER_PREFILL_DP_RANK
:
&
str
=
"x-prefill-dp-rank"
;
pub
const
HEADER_PREFILL_DP_RANK
:
&
str
=
"x-prefill-dp-rank"
;
/// Sentinel DP-rank value (`u32::MAX`). A parsed `x-prefill-dp-rank` header equal to this
/// value is filtered out in `apply_header_routing_overrides`, so it is handled the same as
/// an absent prefill DP rank.
const
UNSET_DP_RANK_SENTINEL
:
u32
=
u32
::
MAX
;
/// Apply routing overrides from HTTP headers to nvext.
/// Apply routing overrides from HTTP headers to nvext.
///
///
...
@@ -44,6 +45,7 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
...
@@ -44,6 +45,7 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
.get
(
HEADER_PREFILL_DP_RANK
)
.get
(
HEADER_PREFILL_DP_RANK
)
.and_then
(|
v
|
v
.to_str
()
.ok
())
.and_then
(|
v
|
v
.to_str
()
.ok
())
.and_then
(|
s
|
s
.parse
::
<
u32
>
()
.ok
());
.and_then
(|
s
|
s
.parse
::
<
u32
>
()
.ok
());
let
prefill_dp_rank
=
prefill_dp_rank
.filter
(|
rank
|
*
rank
!=
UNSET_DP_RANK_SENTINEL
);
if
worker_id
.is_none
()
&&
prefill_id
.is_none
()
&&
dp_rank
.is_none
()
&&
prefill_dp_rank
.is_none
()
if
worker_id
.is_none
()
&&
prefill_id
.is_none
()
&&
dp_rank
.is_none
()
&&
prefill_dp_rank
.is_none
()
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment