Unverified commit d94b350d, authored by Yan Ru Pei, committed by GitHub
Browse files

chore(kv-router): make sequences stop doing token math (#8260)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent c2ec3359
......@@ -7,7 +7,7 @@ use common::*;
use clap::Parser;
use common::NoopSequencePublisher;
use dynamo_kv_router::protocols::WorkerWithDpRank;
use dynamo_kv_router::protocols::{PrefillLoadHint, WorkerWithDpRank};
use dynamo_kv_router::{ActiveSequencesMultiWorker, OverlapScores, SequenceRequest};
use dynamo_mocker::loadgen::Trace;
use dynamo_tokens::SequenceHash;
......@@ -389,11 +389,12 @@ async fn apply_entry(
SequenceRequest {
request_id,
token_sequence: Some(block_hashes),
isl,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: Some(output_length as u32),
prefill_load_hint: None,
prefill_load_hint: Some(PrefillLoadHint {
initial_effective_prefill_tokens: isl,
expected_prefill_duration: None,
}),
worker,
lora_name: None,
},
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Shared stale-child cleanup machinery for rooted tree structures.
//!
//! Provides a throttled, one-in-flight sweep that unlinks empty child nodes
//! from their parent. It is used by [`ConcurrentRadixTree`](super::concurrent_radix_tree),
//! [`ConcurrentRadixTreeCompressed`](super::concurrent_radix_tree_compressed)
//! and the sequence-side
//! [`PromptMembershipTrie`](super::sequences::prompt_membership_trie::PromptMembershipTrie),
//! each of which embeds a [`CleanupState`] and implements [`CleanableNode`]
//! for its node type.
//!
//! # Sweep semantics
//!
//! [`sweep_stale_children`] is a reverse-BFS prune:
//! - BFS from the root under read locks, collecting `(parent_weak, key, child_weak)` edges.
//! - Iterate edges deepest-first so children are swept before parents.
//! - For each edge: upgrade weaks, take the parent write lock, verify the
//! child pointer still matches, `try_write` the child, and unlink only when
//! the child has no workers, no children, and `Arc::strong_count == 2`
//! (parent map ref + our local upgrade). The strong-count gate is what
//! prevents reclaiming a node that a concurrent `find_matches` is currently
//! traversing — such edges are skipped and retried on the next sweep.
use std::collections::VecDeque;
use std::hash::Hash;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Weak};
use std::time::Instant;
use parking_lot::RwLock;
use rustc_hash::FxHashMap;
/// Minimum time, in milliseconds, that must have passed since the last
/// completed sweep (or since the [`CleanupState`] was created) before
/// [`CleanupState::try_schedule`] admits another one: 5 minutes.
pub const CLEANUP_INTERVAL_MS: u64 = 5 * 60 * 1000;
/// Node type that participates in the reverse-BFS cleanup sweep.
///
/// [`sweep_stale_children`] only needs three things from a node: whether it
/// is still pinned by worker state, a view of its children, and a way to
/// unlink one child. Nodes are shared as `Arc<RwLock<Self>>`.
pub trait CleanableNode: Sized + Send + Sync + 'static {
    /// Key type used in this node's children map (e.g. `LocalBlockHash`,
    /// `SequenceHash`).
    type ChildKey: Copy + Eq + Hash + Send + Sync + 'static;
    /// True if this node still carries worker state that pins it in the tree.
    ///
    /// The sweep never unlinks a node for which this returns `true`.
    fn has_any_workers(&self) -> bool;
    /// Read-only view of this node's children keyed by the first edge element.
    ///
    /// The sweep uses this both to enumerate edges and to check that a child
    /// is a leaf (its own map is empty) before unlinking it.
    fn children(&self) -> &FxHashMap<Self::ChildKey, Arc<RwLock<Self>>>;
    /// Unlink a child edge.
    ///
    /// Called by the sweep while it holds this node's write lock.
    fn remove_child(&mut self, key: &Self::ChildKey);
}
/// Throttle and single-flight bookkeeping for the stale-child sweep.
pub struct CleanupState {
    // Reference point for all millisecond timestamps stored below.
    clock_origin: Instant,
    // Milliseconds since `clock_origin` at which the last sweep completed;
    // starts at 0 and is written by `CleanupGuard::drop`.
    last_cleanup_elapsed_ms: AtomicU64,
    // True while a sweep is scheduled or running; set by `try_schedule`,
    // cleared by `cancel` or when a `CleanupGuard` is dropped.
    scheduled: AtomicBool,
}
impl CleanupState {
    /// Creates a state whose clock starts now, with no sweep recorded and
    /// none scheduled.
    pub fn new() -> Self {
        Self {
            clock_origin: Instant::now(),
            last_cleanup_elapsed_ms: AtomicU64::new(0),
            scheduled: AtomicBool::new(false),
        }
    }

    /// Milliseconds elapsed since this state was created.
    pub fn elapsed_ms(&self) -> u64 {
        // u128 -> u64 narrowing; only truncates after ~584 million years.
        self.clock_origin.elapsed().as_millis() as u64
    }

    /// Tries to claim the next sweep.
    ///
    /// Returns `true` only when at least [`CLEANUP_INTERVAL_MS`] has passed
    /// since the last completed sweep and no other sweep is scheduled or
    /// running. On `true` the caller owns the slot and must release it,
    /// either by running the sweep under a [`CleanupGuard`] or by calling
    /// [`CleanupState::cancel`].
    pub fn try_schedule(&self) -> bool {
        // Cheap pre-check so the common "too soon" case never touches the flag.
        if !self.interval_elapsed() {
            return false;
        }
        if self
            .scheduled
            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
            .is_err()
        {
            return false;
        }
        // Re-check now that we hold the flag: the pre-check may have read a
        // timestamp from before a sweep that finished in the meantime. The
        // Acquire above pairs with the Release store in `CleanupGuard::drop`,
        // so that sweep's completion time is visible here.
        if self.interval_elapsed() {
            return true;
        }
        self.scheduled.store(false, Ordering::Release);
        false
    }

    /// Releases a slot claimed by [`CleanupState::try_schedule`] without
    /// recording a completed sweep.
    pub fn cancel(&self) {
        self.scheduled.store(false, Ordering::Release);
    }

    /// True when the throttle interval has passed since the last recorded
    /// sweep completion.
    fn interval_elapsed(&self) -> bool {
        let last_ms = self.last_cleanup_elapsed_ms.load(Ordering::Relaxed);
        self.elapsed_ms().saturating_sub(last_ms) >= CLEANUP_INTERVAL_MS
    }
}
impl Default for CleanupState {
fn default() -> Self {
Self::new()
}
}
/// Scope guard for one scheduled sweep.
///
/// Dropping the guard always clears the `scheduled` flag on the borrowed
/// [`CleanupState`]; the last-cleanup timestamp is only updated if
/// [`CleanupGuard::mark_completed`] was called first.
pub struct CleanupGuard<'a> {
    // State whose `scheduled` flag this guard releases on drop.
    state: &'a CleanupState,
    // `Some(ms)` once the sweep has been marked complete; `None` means the
    // sweep did not finish and the previous timestamp is kept.
    completed_elapsed_ms: Option<u64>,
}
impl<'a> CleanupGuard<'a> {
pub fn new(state: &'a CleanupState) -> Self {
Self {
state,
completed_elapsed_ms: None,
}
}
pub fn mark_completed(&mut self) {
self.completed_elapsed_ms = Some(self.state.elapsed_ms());
}
}
impl Drop for CleanupGuard<'_> {
    /// Publishes the completion time (if one was recorded) and then releases
    /// the `scheduled` flag.
    fn drop(&mut self) {
        let shared = self.state;
        // Timestamp first, flag second: the Release store on the flag is what
        // makes the timestamp visible to the next scheduler.
        if let Some(finished_at_ms) = self.completed_elapsed_ms {
            shared
                .last_cleanup_elapsed_ms
                .store(finished_at_ms, Ordering::Relaxed);
        }
        shared.scheduled.store(false, Ordering::Release);
    }
}
/// One parent→child link recorded during the BFS phase of the sweep.
///
/// Both ends are held weakly so the recorded edges do not themselves keep
/// nodes alive or inflate `Arc::strong_count`.
struct CleanupEdge<N: CleanableNode> {
    // Node whose children map contained the link.
    parent: Weak<RwLock<N>>,
    // Key under which the child was stored in the parent's map.
    key: N::ChildKey,
    // The child node the link pointed at when it was recorded.
    child: Weak<RwLock<N>>,
}
/// Reverse-BFS sweep that unlinks empty, unreferenced leaf nodes from the tree.
///
/// Phase 1 walks the tree breadth-first from `root`, read-locking one node at
/// a time and recording every parent→child link as a [`CleanupEdge`].
/// Phase 2 replays those edges in reverse discovery order, so a child's own
/// children are considered before the child itself, and unlinks a child only
/// when every check below passes. Any edge that fails a check is simply
/// skipped; nothing is retried within this call. `root` has no incoming edge
/// and is therefore never unlinked.
pub fn sweep_stale_children<N: CleanableNode>(root: &Arc<RwLock<N>>) {
    // Phase 1: collect edges. The queue holds strong refs only while the BFS
    // runs; it is empty again before phase 2 starts.
    let mut queue: VecDeque<Arc<RwLock<N>>> = VecDeque::from([root.clone()]);
    let mut edges: Vec<CleanupEdge<N>> = Vec::new();
    while let Some(parent) = queue.pop_front() {
        let guard = parent.read();
        for (&key, child) in guard.children() {
            queue.push_back(child.clone());
            edges.push(CleanupEdge {
                parent: Arc::downgrade(&parent),
                key,
                child: Arc::downgrade(child),
            });
        }
    }
    // Phase 2: reverse order, so edges discovered last are handled first.
    for edge in edges.into_iter().rev() {
        // Either end may already have been freed since phase 1.
        let (Some(parent), Some(child)) = (edge.parent.upgrade(), edge.child.upgrade()) else {
            continue;
        };
        let mut parent_guard = parent.write();
        // The slot may have been removed or re-pointed at a different node
        // since it was recorded; only act on the exact same child.
        let still_attached = parent_guard
            .children()
            .get(&edge.key)
            .is_some_and(|current| Arc::ptr_eq(current, &child));
        if !still_attached {
            continue;
        }
        // Non-blocking: if the child is locked elsewhere, skip it rather than
        // wait while holding the parent's write lock.
        let Some(child_guard) = child.try_write() else {
            continue;
        };
        // Only empty leaves are candidates.
        if child_guard.has_any_workers() || !child_guard.children().is_empty() {
            continue;
        }
        // Expected holders are the parent's map entry and the local `child`.
        // Any other count means something else still holds the node (the
        // module docs cite a concurrent `find_matches` traversal).
        if Arc::strong_count(&child) != 2 {
            continue;
        }
        parent_guard.remove_child(&edge.key);
        drop(child_guard);
    }
}
......@@ -34,6 +34,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use super::{EventKind, KvIndexerMetrics, SyncIndexer, WorkerTask};
use crate::active_set::reconcile_active_workers;
use crate::cleanup::{self, CleanableNode, CleanupGuard, CleanupState};
use crate::protocols::*;
/// Thread-safe shared reference to a Block.
......@@ -83,6 +84,22 @@ impl Block {
}
}
/// Lets the shared sweep in `crate::cleanup` prune empty `Block` leaves.
impl CleanableNode for Block {
    type ChildKey = LocalBlockHash;
    /// True while the block's `workers` collection is non-empty.
    fn has_any_workers(&self) -> bool {
        !self.workers.is_empty()
    }
    /// Borrows the block's `children` map.
    fn children(&self) -> &FxHashMap<LocalBlockHash, SharedBlock> {
        &self.children
    }
    /// Removes the entry for `key` from `children`; a missing key is a no-op.
    fn remove_child(&mut self, key: &LocalBlockHash) {
        self.children.remove(key);
    }
}
/// Thread-safe radix tree for concurrent KV cache lookups.
///
/// Unlike `RadixTree` which uses `Rc<RefCell<>>` and requires single-threaded access,
......@@ -109,6 +126,7 @@ pub struct ConcurrentRadixTree {
root: SharedBlock,
tree_sizes: DashMap<WorkerWithDpRank, AtomicUsize, FxBuildHasher>,
cleanup: CleanupState,
}
impl Default for ConcurrentRadixTree {
......@@ -147,6 +165,7 @@ impl ConcurrentRadixTree {
Self {
root: Arc::new(RwLock::new(Block::new())),
tree_sizes: DashMap::with_hasher(FxBuildHasher),
cleanup: CleanupState::new(),
}
}
......@@ -665,6 +684,20 @@ impl SyncIndexer for ConcurrentRadixTree {
self.find_matches_impl(sequence, early_exit)
}
/// Tries to claim the next cleanup sweep for this tree.
///
/// Delegates to the tree's `CleanupState::try_schedule`; returns `true` when
/// the caller now owns the cleanup slot.
fn try_schedule_cleanup(&self) -> bool {
    self.cleanup.try_schedule()
}
/// Releases a claimed cleanup slot without running the sweep
/// (delegates to `CleanupState::cancel`).
fn cancel_scheduled_cleanup(&self) {
    self.cleanup.cancel();
}
/// Runs one stale-child sweep over the tree.
///
/// The guard clears the scheduled flag when it goes out of scope; the
/// completion time is recorded only if the sweep returns and
/// `mark_completed` is reached.
fn run_cleanup_task(&self) {
    let mut cleanup_guard = CleanupGuard::new(&self.cleanup);
    cleanup::sweep_stale_children(&self.root);
    cleanup_guard.mark_completed();
}
fn dump_events(&self) -> Option<Vec<RouterEvent>> {
Some(self.dump_tree_as_events())
}
......
......@@ -59,16 +59,16 @@
//! - `new_with_frequency()` is not provided
//! - `find_matches` does not populate `OverlapScores.frequencies`
use std::sync::{Arc, Weak};
use std::time::Instant;
use std::sync::Arc;
use dashmap::DashMap;
use parking_lot::RwLock;
use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
use std::collections::VecDeque;
use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
use std::sync::atomic::{AtomicUsize, Ordering};
use super::{EventKind, KvIndexerMetrics, SyncIndexer, WorkerTask};
use crate::cleanup::{self, CleanableNode, CleanupGuard, CleanupState};
use crate::protocols::*;
macro_rules! read_lock {
......@@ -87,8 +87,6 @@ type SharedNode = Arc<RwLock<Node>>;
/// stored here, keeping the map compact and correct across concurrent splits.
type WorkerLookup = FxHashMap<ExternalSequenceBlockHash, SharedNode>;
const CLEANUP_INTERVAL_MS: u64 = 5 * 60 * 1000;
/// A node in the concurrent radix tree.
///
/// Stores a compressed edge with per-worker match indices. Workers with full coverage
......@@ -122,10 +120,6 @@ impl Node {
}
}
/// True while either `full_edge_workers` or `worker_cutoffs` is non-empty.
fn has_any_workers(&self) -> bool {
    !self.full_edge_workers.is_empty() || !self.worker_cutoffs.is_empty()
}
#[inline]
fn current_cutoff(&self, worker: WorkerWithDpRank) -> usize {
if self.full_edge_workers.contains(&worker) {
......@@ -226,6 +220,22 @@ impl Node {
}
}
/// Lets the shared sweep in `crate::cleanup` prune empty `Node` leaves.
impl CleanableNode for Node {
    type ChildKey = LocalBlockHash;
    /// True while either `full_edge_workers` or `worker_cutoffs` is non-empty.
    fn has_any_workers(&self) -> bool {
        !self.full_edge_workers.is_empty() || !self.worker_cutoffs.is_empty()
    }
    /// Borrows the node's `children` map.
    fn children(&self) -> &FxHashMap<LocalBlockHash, SharedNode> {
        &self.children
    }
    /// Removes the entry for `key` from `children`; a missing key is a no-op.
    fn remove_child(&mut self, key: &LocalBlockHash) {
        self.children.remove(key);
    }
}
/// Data returned by [`ConcurrentRadixTreeCompressed::split_node`] for deferred lookup updates.
///
/// Callers must call [`ConcurrentRadixTreeCompressed::apply_split_lookup`] **after**
......@@ -240,64 +250,6 @@ struct RemoveOutcome {
stale_hashes: Vec<ExternalSequenceBlockHash>,
}
/// One parent→child link recorded while walking the tree for cleanup.
/// Both ends are weak references, so recording an edge does not keep either
/// node alive.
struct CleanupEdge {
    // Node whose children map contained the link.
    parent: Weak<RwLock<Node>>,
    // Key under which the child was stored in the parent's map.
    key: LocalBlockHash,
    // The child node the link pointed at when it was recorded.
    child: Weak<RwLock<Node>>,
}
/// Throttle and single-flight bookkeeping for the stale-child cleanup.
struct CleanupState {
    // Reference point for the millisecond timestamps stored below.
    clock_origin: Instant,
    // Milliseconds since `clock_origin` at which the last cleanup completed;
    // starts at 0 and is written when a `CleanupGuard` is dropped.
    last_cleanup_elapsed_ms: AtomicU64,
    // True while a cleanup is scheduled or running.
    scheduled: AtomicBool,
}
impl CleanupState {
    /// Creates a state whose clock starts now, with no cleanup recorded and
    /// none scheduled.
    fn new() -> Self {
        Self {
            clock_origin: Instant::now(),
            last_cleanup_elapsed_ms: AtomicU64::new(0),
            scheduled: AtomicBool::new(false),
        }
    }

    /// Milliseconds elapsed since this state was created.
    fn elapsed_ms(&self) -> u64 {
        // u128 -> u64 narrowing; only truncates after ~584 million years.
        self.clock_origin.elapsed().as_millis() as u64
    }

    /// Tries to claim the next cleanup.
    ///
    /// Returns `true` only when at least `CLEANUP_INTERVAL_MS` has passed
    /// since the last completed cleanup and no other cleanup is scheduled or
    /// running. On `true` the caller owns the slot and must release it, via a
    /// `CleanupGuard` or `cancel`.
    fn try_schedule(&self) -> bool {
        // Cheap pre-check so the common "too soon" case never touches the flag.
        if !self.interval_elapsed() {
            return false;
        }
        if self
            .scheduled
            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
            .is_err()
        {
            return false;
        }
        // Re-check while holding the flag: the pre-check may have read a
        // timestamp from before a cleanup that finished in the meantime. The
        // Acquire above pairs with the Release store in `CleanupGuard::drop`.
        if self.interval_elapsed() {
            return true;
        }
        self.scheduled.store(false, Ordering::Release);
        false
    }

    /// Releases a claimed slot without recording a completed cleanup.
    fn cancel(&self) {
        self.scheduled.store(false, Ordering::Release);
    }

    /// True when the throttle interval has passed since the last recorded
    /// cleanup completion.
    fn interval_elapsed(&self) -> bool {
        let last_ms = self.last_cleanup_elapsed_ms.load(Ordering::Relaxed);
        self.elapsed_ms().saturating_sub(last_ms) >= CLEANUP_INTERVAL_MS
    }
}
/// Scope guard for one scheduled cleanup; its `Drop` impl clears the
/// `scheduled` flag and, if a completion time was set, publishes it.
struct CleanupGuard<'a> {
    // State whose `scheduled` flag this guard releases on drop.
    state: &'a CleanupState,
    // `Some(ms)` once the cleanup finished; `None` keeps the old timestamp.
    completed_elapsed_ms: Option<u64>,
}
impl Drop for CleanupGuard<'_> {
    /// Publishes the completion time (if one was recorded) and then releases
    /// the `scheduled` flag.
    fn drop(&mut self) {
        let shared = self.state;
        // Timestamp first, flag second, so the Release store on the flag
        // orders the timestamp before it.
        if let Some(finished_at_ms) = self.completed_elapsed_ms {
            shared
                .last_cleanup_elapsed_ms
                .store(finished_at_ms, Ordering::Relaxed);
        }
        shared.scheduled.store(false, Ordering::Release);
    }
}
/// Thread-safe radix tree (compressed trie) for concurrent KV cache lookups.
pub struct ConcurrentRadixTreeCompressed {
/// The root of the radix tree. Has an empty edge and only contains children.
......@@ -340,50 +292,6 @@ impl ConcurrentRadixTreeCompressed {
}
}
/// Unlinks empty, unreferenced leaf nodes from this tree.
///
/// First walks the tree breadth-first from `self.root`, read-locking one node
/// at a time and recording each parent→child link weakly. Then replays the
/// links in reverse discovery order and removes a child only when every check
/// below passes; a link that fails a check is skipped.
fn cleanup_stale_children(&self) {
    // Phase 1: collect edges. The queue is empty again before phase 2.
    let mut queue = VecDeque::from([self.root.clone()]);
    let mut edges = Vec::new();
    while let Some(parent) = queue.pop_front() {
        let guard = parent.read();
        for (&key, child) in &guard.children {
            queue.push_back(child.clone());
            edges.push(CleanupEdge {
                parent: Arc::downgrade(&parent),
                key,
                child: Arc::downgrade(child),
            });
        }
    }
    // Phase 2: reverse order, so edges discovered last are handled first.
    for edge in edges.into_iter().rev() {
        // Either end may already have been freed since phase 1.
        let (Some(parent), Some(child)) = (edge.parent.upgrade(), edge.child.upgrade()) else {
            continue;
        };
        let mut parent_guard = parent.write();
        // The slot may have been removed since it was recorded...
        let Some(current) = parent_guard.children.get(&edge.key) else {
            continue;
        };
        // ...or re-pointed at a different node; only act on the same child.
        if !Arc::ptr_eq(current, &child) {
            continue;
        }
        // Non-blocking: skip the child if it is locked elsewhere.
        let Some(child_guard) = child.try_write() else {
            continue;
        };
        // Only empty leaves are candidates.
        if child_guard.has_any_workers() || !child_guard.children.is_empty() {
            continue;
        }
        // Expected holders are the parent's map entry and the local `child`;
        // any other count means something else still holds the node.
        if Arc::strong_count(&child) != 2 {
            continue;
        }
        parent_guard.children.remove(&edge.key);
        drop(child_guard);
    }
}
#[cfg(test)]
pub(crate) fn raw_child_edge_count(&self) -> usize {
let mut queue = VecDeque::from([self.root.clone()]);
......@@ -400,7 +308,7 @@ impl ConcurrentRadixTreeCompressed {
#[cfg(test)]
pub(crate) fn run_cleanup_for_test(&self) {
self.cleanup_stale_children();
cleanup::sweep_stale_children(&self.root);
}
// ------------------------------------------------------------------
......@@ -1378,13 +1286,9 @@ impl SyncIndexer for ConcurrentRadixTreeCompressed {
}
fn run_cleanup_task(&self) {
let mut cleanup_guard = CleanupGuard {
state: &self.cleanup,
completed_elapsed_ms: None,
};
self.cleanup_stale_children();
cleanup_guard.completed_elapsed_ms = Some(self.cleanup.elapsed_ms());
let mut cleanup_guard = CleanupGuard::new(&self.cleanup);
cleanup::sweep_stale_children(&self.root);
cleanup_guard.mark_completed();
}
fn dump_events(&self) -> Option<Vec<RouterEvent>> {
......
......@@ -7,6 +7,7 @@
//! efficient KV cache lookup and routing in distributed LLM inference systems.
mod active_set;
pub(crate) mod cleanup;
pub mod indexer;
pub mod protocols;
......
......@@ -446,8 +446,6 @@ pub struct PrefillLoadHint {
pub enum ActiveSequenceEventData {
AddRequest {
token_sequence: Option<Vec<SequenceHash>>,
isl: usize,
overlap: u32,
#[serde(default = "default_track_prefill_tokens")]
track_prefill_tokens: bool,
expected_output_tokens: Option<u32>,
......
......@@ -285,8 +285,6 @@ impl<
SequenceRequest {
request_id: request_id.clone(),
token_sequence: request.token_seq,
isl: request.isl_tokens,
overlap: selection.overlap_blocks,
track_prefill_tokens: request.track_prefill_tokens,
expected_output_tokens: request.expected_output_tokens,
prefill_load_hint,
......@@ -315,24 +313,25 @@ impl<
return None;
}
let Some(estimator) = &self.prefill_load_estimator else {
return None;
let expected_prefill_duration = match &self.prefill_load_estimator {
Some(estimator) => match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(expected_prefill_duration),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict prefill duration for active load tracking: {error}"
);
None
}
},
None => None,
};
match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration: Some(expected_prefill_duration),
}),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict prefill duration for active load tracking: {error}"
);
None
}
}
Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration,
})
}
/// Number of requests currently parked in the pending queue (lock-free).
......
# Sequence State Model
This directory implements the router's active-sequence state for local request routing and replica sync.
For the local, non-remote path, the model is intentionally organized as a one-way write pipeline:
```mermaid
flowchart TD
A["Routing event<br/>AddRequest / MarkPrefillCompleted / Free"]
B["WorkerTable + RequestIndex<br/>lookup authoritative worker-local state"]
C["ActiveSequences<br/>authoritative per-worker write model"]
D["PromptRegistry<br/>derived read model"]
E["Scheduler reads projected load"]
A --> B
B --> C
C --> D
D -. read .-> E
```
## Source of truth
- `topology.rs` owns `WorkerTable`, which maps a worker identity to its slot.
- `request_maps.rs` owns `RequestIndex`, which maps `request_id -> worker`.
- `single.rs` owns `ActiveSequences`, the authoritative per-worker request, prefill, and block state.
- `prompt_registry.rs` owns `PromptRegistry`, which is not a source of truth. It is a derived routing view.
The local orchestrator in `multi_worker.rs` reads `WorkerTable` and `RequestIndex`, mutates the chosen worker's `ActiveSequences`, then projects the resulting membership/load delta into `PromptRegistry`.
## Why this is a DAG
Within a single local mutation, data moves in one direction:
`event -> authoritative state -> derived read model -> scheduler`
`PromptRegistry` does not write back into `ActiveSequences`, so there is no write-back loop inside the local mutation path.
At runtime there is still a control loop over time, because the scheduler reads the derived view and later emits the next `AddRequest`. That is a system feedback loop, not cyclic state ownership.
## Torn reads are intentional
`PromptRegistry` is allowed to be only eventually consistent with `ActiveSequences`.
That means a reader may temporarily observe:
- a worker-load snapshot from one moment
- prompt membership from another moment
- a combined view that never existed atomically
This is an intentional tradeoff. The derived read model is optimized for lower contention and higher concurrency, not perfect snapshot consistency.
The important safety boundary is:
- lifecycle and ownership invariants live in the write model (`WorkerTable`, `RequestIndex`, `ActiveSequences`)
- scheduling quality lives in the read model (`PromptRegistry`)
So a stale or torn read can lead to a suboptimal routing choice, but it should not cause catastrophic invariant breakage such as losing request ownership or corrupting block membership.
## Eventual consistency contract
- Local writes update `ActiveSequences` first.
- `PromptRegistry` is projected from that authoritative state afterward.
- Replica sync and scheduler decisions may temporarily lag behind the authoritative state.
- The system accepts this lag because the read side is advisory.
This is the core design: a strict local write DAG with an eventually consistent read projection.
......@@ -19,6 +19,8 @@ use tokio::sync::watch;
use tokio::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
#[cfg(any(test, feature = "bench"))]
use super::prompt_membership_trie::lookup_live_hashes;
use super::prompt_registry::{PromptRegistry, WorkerLoadSnapshot};
use super::request_maps::RequestIndex;
use super::single::{ActiveSequences, PromptMembershipDelta, RequestId};
......@@ -94,8 +96,6 @@ pub enum SequenceError {
pub struct SequenceRequest {
pub request_id: RequestId,
pub token_sequence: Option<Vec<SequenceHash>>,
pub isl: usize,
pub overlap: u32,
pub track_prefill_tokens: bool,
pub expected_output_tokens: Option<u32>,
pub prefill_load_hint: Option<PrefillLoadHint>,
......@@ -182,6 +182,23 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
self.prompt_registry.is_block_index_empty(),
"expected reverse block index to be empty after drain",
);
let trie_lookup_live_hashes: Vec<_> = {
let table = self.workers.read();
table
.slots
.iter()
.filter_map(|slot| {
let live_hashes = lookup_live_hashes(&slot.trie_lookup);
(!live_hashes.is_empty()).then_some((slot.worker, live_hashes))
})
.collect()
};
assert!(
trie_lookup_live_hashes.is_empty(),
"expected all worker trie lookups to reference only dead nodes after drain, found {:?}",
trie_lookup_live_hashes,
);
}
fn publish_worker_load_snapshot(
......@@ -280,8 +297,6 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
match &event.data {
ActiveSequenceEventData::AddRequest {
token_sequence,
isl,
overlap,
track_prefill_tokens,
expected_output_tokens,
prefill_load_hint,
......@@ -300,8 +315,6 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
let outcome = seq.add_request_with_prefill_tracking(
event.request_id.clone(),
token_sequence.clone(),
*isl,
*overlap,
*expected_output_tokens,
*track_prefill_tokens,
*prefill_load_hint,
......@@ -434,8 +447,6 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
worker: req.worker,
data: ActiveSequenceEventData::AddRequest {
token_sequence: req.token_sequence.clone(),
isl: req.isl,
overlap: req.overlap,
track_prefill_tokens: req.track_prefill_tokens,
expected_output_tokens: req.expected_output_tokens,
prefill_load_hint: req.prefill_load_hint,
......@@ -727,8 +738,6 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
let SequenceRequest {
request_id,
token_sequence,
isl,
overlap,
track_prefill_tokens,
expected_output_tokens,
prefill_load_hint,
......@@ -758,8 +767,6 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
let outcome = seq.add_request_with_prefill_tracking(
request_id,
token_sequence,
isl,
overlap,
expected_output_tokens,
track_prefill_tokens,
prefill_load_hint,
......@@ -938,6 +945,7 @@ mod tests {
use rustc_hash::FxHashMap;
use super::super::prefill_tracker::added_prefill_tokens;
use super::*;
use crate::protocols::{
ActiveSequenceEvent, ActiveSequenceEventData, BlockHashOptions, OverlapScores,
......@@ -994,7 +1002,7 @@ mod tests {
token_sequence.map_or(0, |query| query.len().saturating_sub(overlap_depth));
let overlap = *overlaps.scores.get(&slot.worker).unwrap_or(&0);
let added_tokens = if track_prefill_tokens {
seq.new_tokens(isl, overlap)
added_prefill_tokens(sequences.block_size, isl, overlap)
} else {
0
};
......@@ -1016,6 +1024,13 @@ mod tests {
compute_seq_hash_for_block(&block_hashes)
}
/// Test helper: builds a prefill hint carrying `tokens` effective prefill
/// tokens and no expected duration.
fn tracking_hint(tokens: usize) -> Option<PrefillLoadHint> {
    let hint = PrefillLoadHint {
        expected_prefill_duration: None,
        initial_effective_prefill_tokens: tokens,
    };
    Some(hint)
}
struct VecSubscriber {
events: VecDeque<anyhow::Result<ActiveSequenceEvent>>,
}
......@@ -1039,8 +1054,6 @@ mod tests {
SequenceRequest {
request_id: "req-1".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: false,
expected_output_tokens: None,
prefill_load_hint: None,
......@@ -1069,11 +1082,9 @@ mod tests {
SequenceRequest {
request_id: "req-a".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker: worker_a,
lora_name: None,
},
......@@ -1092,11 +1103,9 @@ mod tests {
SequenceRequest {
request_id: "req-b".to_string(),
token_sequence: Some(vec![1, 2, 4]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker: worker_b,
lora_name: None,
},
......@@ -1149,8 +1158,6 @@ mod tests {
SequenceRequest {
request_id: "base".to_string(),
token_sequence: Some(base_prompt.clone()),
isl: 8,
overlap: 0,
track_prefill_tokens: false,
expected_output_tokens: None,
prefill_load_hint: None,
......@@ -1165,8 +1172,6 @@ mod tests {
SequenceRequest {
request_id: "lora".to_string(),
token_sequence: Some(lora_prompt),
isl: 8,
overlap: 0,
track_prefill_tokens: false,
expected_output_tokens: None,
prefill_load_hint: None,
......@@ -1213,11 +1218,9 @@ mod tests {
SequenceRequest {
request_id: "req-1".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker,
lora_name: None,
},
......@@ -1243,11 +1246,9 @@ mod tests {
SequenceRequest {
request_id: "req-1".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker,
lora_name: None,
},
......@@ -1262,11 +1263,9 @@ mod tests {
SequenceRequest {
request_id: "req-2".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker,
lora_name: None,
},
......@@ -1313,11 +1312,9 @@ mod tests {
worker,
data: ActiveSequenceEventData::AddRequest {
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
},
router_id: 99,
lora_name: None,
......@@ -1359,11 +1356,9 @@ mod tests {
worker,
data: ActiveSequenceEventData::AddRequest {
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
},
router_id: 99,
lora_name: None,
......@@ -1395,8 +1390,6 @@ mod tests {
SequenceRequest {
request_id: "req-1".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: false,
expected_output_tokens: None,
prefill_load_hint: None,
......@@ -1429,8 +1422,6 @@ mod tests {
SequenceRequest {
request_id: request_id.clone(),
token_sequence: Some(vec![1, 2, 3]),
isl: 12,
overlap: 0,
track_prefill_tokens: false,
expected_output_tokens: None,
prefill_load_hint: None,
......@@ -1481,8 +1472,6 @@ mod tests {
SequenceRequest {
request_id: "req-1".to_string(),
token_sequence: Some(vec![1, 2, 3]),
isl: 100,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: Some(PrefillLoadHint {
......
......@@ -7,6 +7,7 @@ use dynamo_tokens::SequenceHash;
use parking_lot::RwLock;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::cleanup::{self, CleanableNode, CleanupGuard, CleanupState};
use crate::protocols::WorkerWithDpRank;
type SharedNode = Arc<RwLock<PromptTrieNode>>;
......@@ -32,11 +33,6 @@ impl PromptTrieNode {
}
}
/// True while either `full_edge_workers` or `worker_cutoffs` is non-empty.
/// Only compiled for tests and the `bench` feature.
#[cfg(any(test, feature = "bench"))]
fn has_any_workers(&self) -> bool {
    !self.full_edge_workers.is_empty() || !self.worker_cutoffs.is_empty()
}
fn current_cutoff(&self, worker: WorkerWithDpRank) -> usize {
if self.full_edge_workers.contains(&worker) {
self.edge.len()
......@@ -114,12 +110,29 @@ impl PromptTrieNode {
}
}
/// Lets the shared sweep in `crate::cleanup` prune empty trie leaves.
impl CleanableNode for PromptTrieNode {
    type ChildKey = SequenceHash;
    /// True while either `full_edge_workers` or `worker_cutoffs` is non-empty.
    fn has_any_workers(&self) -> bool {
        !self.full_edge_workers.is_empty() || !self.worker_cutoffs.is_empty()
    }
    /// Borrows the node's `children` map.
    fn children(&self) -> &FxHashMap<SequenceHash, SharedNode> {
        &self.children
    }
    /// Removes the entry for `key` from `children`; a missing key is a no-op.
    fn remove_child(&mut self, key: &SequenceHash) {
        self.children.remove(key);
    }
}
struct RemoveOutcome {
stale_hashes: Vec<SequenceHash>,
}
/// Trie of prompt membership rooted at a single shared node.
pub(super) struct PromptMembershipTrie {
    // Root node; every other node is reached through its children.
    root: SharedNode,
    // Throttle / single-flight state for the stale-child sweep run by
    // `maybe_cleanup`.
    cleanup: CleanupState,
}
impl Default for PromptMembershipTrie {
......@@ -149,7 +162,22 @@ impl PromptMembershipTrie {
/// Creates an empty trie: a fresh root node and a cleanup throttle whose
/// clock starts now.
pub(super) fn new() -> Self {
    let root = Arc::new(RwLock::new(PromptTrieNode::new()));
    let cleanup = CleanupState::new();
    Self { root, cleanup }
}
/// Sweep stale children, but only when the throttle admits it.
///
/// Cheap to call from any write path: nothing happens unless
/// [`CLEANUP_INTERVAL_MS`](crate::cleanup::CLEANUP_INTERVAL_MS) has elapsed
/// since the last completed sweep and no other sweep holds the slot.
pub(super) fn maybe_cleanup(&self) {
    if self.cleanup.try_schedule() {
        // The guard releases the slot when this scope ends, whether or not
        // the completion below is reached.
        let mut sweep_guard = CleanupGuard::new(&self.cleanup);
        cleanup::sweep_stale_children(&self.root);
        sweep_guard.mark_completed();
    }
}
fn find_in_subtree(start: &SharedNode, hash: SequenceHash) -> Option<SharedNode> {
......@@ -656,6 +684,15 @@ impl PromptMembershipTrie {
}
}
/// Returns the hashes in `lookup` whose node still reports worker state.
///
/// Holds the lookup's read lock for the whole scan and briefly read-locks
/// each node to query it. Test/bench only.
#[cfg(any(test, feature = "bench"))]
pub(super) fn lookup_live_hashes(lookup: &Arc<RwLock<WorkerLookup>>) -> Vec<SequenceHash> {
    let entries = lookup.read();
    let mut live = Vec::new();
    for (&hash, node) in entries.iter() {
        if node.read().has_any_workers() {
            live.push(hash);
        }
    }
    live
}
#[cfg(test)]
mod tests {
use super::*;
......
......@@ -80,6 +80,7 @@ impl PromptRegistry {
.store_chain(worker, lookup, store.parent, &store.hashes);
}
self.loads.insert(worker, load);
self.membership.maybe_cleanup();
}
pub(super) fn apply_topology_change(&self, change: WorkerTopologyChange) {
......@@ -92,6 +93,7 @@ impl PromptRegistry {
for worker in change.added {
self.loads.entry(worker).or_default();
}
self.membership.maybe_cleanup();
}
#[expect(clippy::too_many_arguments)]
......
......@@ -29,7 +29,9 @@ use uuid::Uuid;
use rustc_hash::FxHashSet;
use super::block_tracker::BlockTracker;
use super::prefill_tracker::{PrefillLoadState, PrefillLoadTracker, added_prefill_tokens};
#[cfg(test)]
use super::prefill_tracker::added_prefill_tokens;
use super::prefill_tracker::{PrefillLoadState, PrefillLoadTracker};
use super::prompt_registry::WorkerLoadSnapshot;
use crate::protocols::PrefillLoadHint;
......@@ -107,6 +109,7 @@ pub struct ActiveSequences {
requests: HashMap<RequestId, RequestState>,
prefill: PrefillLoadTracker,
blocks: BlockTracker,
#[cfg(test)]
block_size: usize,
last_expiry_check_time: Instant,
}
......@@ -120,6 +123,7 @@ impl ActiveSequences {
requests: HashMap::new(),
prefill: PrefillLoadTracker::default(),
blocks: BlockTracker::default(),
#[cfg(test)]
block_size,
last_expiry_check_time: Instant::now(),
}
......@@ -158,30 +162,6 @@ impl ActiveSequences {
self.prefill.snapshot().active_tokens_at(decay_now)
}
/// Add a new request with its initial tokens.
/// Returns block membership transitions plus any expired request IDs removed during cleanup.
///
/// Test-only convenience wrapper around
/// `add_request_with_prefill_tracking` that always enables prefill-token
/// tracking and supplies no prefill load hint.
#[cfg(test)]
pub(super) fn add_request(
    &mut self,
    request_id: RequestId,
    token_sequence: Option<Vec<SequenceHash>>,
    isl: usize,
    overlap: u32,
    expected_output_tokens: Option<u32>,
    decay_now: Instant,
) -> SequenceMutationOutcome {
    self.add_request_with_prefill_tracking(
        request_id,
        token_sequence,
        isl,
        overlap,
        expected_output_tokens,
        // track_prefill_tokens
        true,
        // prefill_load_hint
        None,
        decay_now,
    )
}
/// Add a new request with optional prompt-token load accounting.
/// Returns block membership transitions plus any expired request IDs removed during cleanup.
#[allow(clippy::too_many_arguments)]
......@@ -189,8 +169,6 @@ impl ActiveSequences {
&mut self,
request_id: RequestId,
token_sequence: Option<Vec<SequenceHash>>,
isl: usize,
overlap: u32,
expected_output_tokens: Option<u32>,
track_prefill_tokens: bool,
prefill_load_hint: Option<PrefillLoadHint>,
......@@ -241,15 +219,11 @@ impl ActiveSequences {
};
let prefill = if track_prefill_tokens {
let default_tokens = self.new_tokens(isl, overlap);
let hint = prefill_load_hint.unwrap_or(PrefillLoadHint {
initial_effective_prefill_tokens: default_tokens,
expected_prefill_duration: None,
});
(hint.initial_effective_prefill_tokens > 0).then_some(PrefillLoadState {
initial_effective_prefill_tokens: hint.initial_effective_prefill_tokens,
expected_prefill_duration: hint.expected_prefill_duration,
prefill_load_hint.and_then(|hint| {
(hint.initial_effective_prefill_tokens > 0).then_some(PrefillLoadState {
initial_effective_prefill_tokens: hint.initial_effective_prefill_tokens,
expected_prefill_duration: hint.expected_prefill_duration,
})
})
} else {
None
......@@ -356,10 +330,6 @@ impl ActiveSequences {
acquire.became_present_on_worker.then_some(random_hash)
}
/// Prefill tokens a request with input length `isl` and `overlap` would add,
/// computed by `added_prefill_tokens` with this instance's `block_size`.
pub(super) fn new_tokens(&self, isl: usize, overlap: u32) -> usize {
    added_prefill_tokens(self.block_size, isl, overlap)
}
#[cfg(test)]
fn potential_blocks_and_tokens_with_prefill_tracking(
&self,
......@@ -376,7 +346,7 @@ impl ActiveSequences {
};
let active_tokens = self.active_tokens(decay_now);
let potential_tokens = if track_prefill_tokens {
self.new_tokens(isl, overlap) + active_tokens
added_prefill_tokens(self.block_size, isl, overlap) + active_tokens
} else {
active_tokens
};
......@@ -479,6 +449,14 @@ mod tests {
}
}
/// Test helper: derives a prefill hint from `added_prefill_tokens`.
/// Yields `None` when that count is zero, otherwise a hint carrying the count
/// and no expected duration.
fn tracking_hint(block_size: usize, isl: usize, overlap: u32) -> Option<PrefillLoadHint> {
    match added_prefill_tokens(block_size, isl, overlap) {
        0 => None,
        tokens => Some(PrefillLoadHint {
            initial_effective_prefill_tokens: tokens,
            expected_prefill_duration: None,
        }),
    }
}
#[test]
fn test_prompt_membership_delta_only_reports_first_add_and_last_remove() {
let mut seq_manager = ActiveSequences::new(4);
......@@ -487,11 +465,9 @@ mod tests {
let first = seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2]),
8,
0,
None,
true,
None,
tracking_hint(4, 8, 0),
decay_now,
);
assert_eq!(
......@@ -509,11 +485,9 @@ mod tests {
let second = seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
None,
tracking_hint(4, 12, 0),
decay_now,
);
assert_eq!(
......@@ -549,11 +523,9 @@ mod tests {
let outcome = seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
None,
tracking_hint(4, 12, 0),
decay_now,
);
assert_eq!(
......@@ -598,34 +570,34 @@ mod tests {
let mut seq_manager = ActiveSequences::new(block_size);
let decay_now = Instant::now();
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"request_1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
tracking_hint(block_size, 12, 0),
decay_now,
);
assert_eq!(seq_manager.active_blocks(), 3);
assert_eq!(seq_manager.active_tokens(decay_now), 12);
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"request_2".to_string(),
Some(vec![4]),
4,
0,
None,
true,
tracking_hint(block_size, 4, 0),
decay_now,
);
assert_eq!(seq_manager.active_blocks(), 4);
assert_eq!(seq_manager.active_tokens(decay_now), 16);
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"request_3".to_string(),
Some(vec![1, 2, 3, 4]),
16,
4,
None,
true,
tracking_hint(block_size, 16, 4),
decay_now,
);
assert_eq!(seq_manager.active_blocks(), 4);
......@@ -650,12 +622,12 @@ mod tests {
let mut seq_manager = ActiveSequences::new(block_size);
let decay_now = Instant::now();
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
tracking_hint(block_size, 12, 0),
decay_now,
);
assert_eq!(seq_manager.active_blocks(), 3);
......@@ -667,7 +639,14 @@ mod tests {
);
assert_eq!(seq_manager.active_blocks(), 2);
seq_manager.add_request("r2".to_string(), Some(vec![1, 2]), 8, 0, None, decay_now);
seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![1, 2]),
None,
true,
tracking_hint(block_size, 8, 0),
decay_now,
);
assert_eq!(seq_manager.active_blocks(), 2);
assert!(
......@@ -689,12 +668,12 @@ mod tests {
let mut seq_manager = ActiveSequences::new(block_size);
let decay_now = Instant::now();
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
tracking_hint(block_size, 12, 0),
decay_now,
);
assert_eq!(seq_manager.active_tokens(decay_now), 12);
......@@ -705,7 +684,14 @@ mod tests {
seq_manager.mark_prefill_completed(&"r1".to_string(), decay_now);
assert_eq!(seq_manager.active_tokens(decay_now), 0);
seq_manager.add_request("r2".to_string(), Some(vec![4, 5]), 8, 0, None, decay_now);
seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![4, 5]),
None,
true,
tracking_hint(block_size, 8, 0),
decay_now,
);
assert_eq!(seq_manager.active_tokens(decay_now), 8);
seq_manager.free(&"r2".to_string(), decay_now);
......@@ -720,8 +706,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
false,
None,
......@@ -745,8 +729,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
false,
None,
......@@ -772,8 +754,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1]),
50,
0,
None,
true,
Some(prefill_hint(50, 10)),
......@@ -782,8 +762,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![2]),
30,
0,
None,
true,
Some(prefill_hint(30, 10)),
......@@ -823,20 +801,20 @@ mod tests {
let block_size = 4;
let mut seq_manager = ActiveSequences::new(block_size);
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1, 2]),
8,
0,
None,
true,
tracking_hint(block_size, 8, 0),
Instant::now(),
);
seq_manager.add_request(
seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![3, 4]),
8,
0,
None,
true,
tracking_hint(block_size, 8, 0),
Instant::now(),
);
assert_eq!(seq_manager.active_blocks(), 4);
......@@ -869,8 +847,14 @@ mod tests {
seq_manager.assert_consistent();
tokio::time::advance(Duration::from_secs(31)).await;
let expired =
seq_manager.add_request("r3".to_string(), Some(vec![5]), 4, 0, None, Instant::now());
let expired = seq_manager.add_request_with_prefill_tracking(
"r3".to_string(),
Some(vec![5]),
None,
true,
tracking_hint(block_size, 4, 0),
Instant::now(),
);
assert!(expired.expired_request_ids.is_empty());
assert_eq!(seq_manager.active_blocks(), 1);
assert_eq!(seq_manager.active_tokens(Instant::now()), 4);
......@@ -885,8 +869,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r1".to_string(),
Some(vec![1]),
40,
0,
None,
true,
Some(prefill_hint(40, 100)),
......@@ -896,8 +878,6 @@ mod tests {
seq_manager.add_request_with_prefill_tracking(
"r2".to_string(),
Some(vec![2]),
30,
0,
None,
true,
Some(prefill_hint(30, 100)),
......
......@@ -219,12 +219,15 @@ mod tests {
{
let idx = table.index[&existing];
let mut seq = table.slots[idx].sequences.write();
let outcome = seq.add_request(
let outcome = seq.add_request_with_prefill_tracking(
"req-1".to_string(),
Some(vec![1, 2, 3]),
12,
0,
None,
true,
Some(crate::protocols::PrefillLoadHint {
initial_effective_prefill_tokens: 12,
expected_prefill_duration: None,
}),
Instant::now(),
);
assert_eq!(outcome.membership_delta.stores[0].hashes, vec![1, 2, 3],);
......
......@@ -421,8 +421,6 @@ where
.add_request(SequenceRequest {
request_id: request_id.clone(),
token_sequence: maybe_seq_hashes,
isl: isl_tokens,
overlap: overlap_blocks,
track_prefill_tokens,
expected_output_tokens,
prefill_load_hint,
......@@ -464,24 +462,25 @@ where
return None;
}
let Some(estimator) = &self.prefill_load_estimator else {
return None;
let expected_prefill_duration = match &self.prefill_load_estimator {
Some(estimator) => match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(expected_prefill_duration),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict prefill duration for direct add_request path: {error}"
);
None
}
},
None => None,
};
match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration: Some(expected_prefill_duration),
}),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict prefill duration for direct add_request path: {error}"
);
None
}
}
Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration,
})
}
/// Get the worker type for this router ("prefill" or "decode").
......
......@@ -24,6 +24,8 @@ use std::sync::Arc;
use super::metrics::WORKER_LOAD_METRICS;
use crate::kv_router::{ACTIVE_SEQUENCES_SUBJECT, KV_METRICS_SUBJECT};
use crate::local_model::runtime_config::ModelRuntimeConfig;
#[cfg(test)]
use dynamo_kv_router::protocols::PrefillLoadHint;
/// Concrete [`SequencePublisher`] backed by NATS [`EventPublisher`] and Prometheus gauges.
pub struct RuntimeSequencePublisher {
......@@ -145,6 +147,13 @@ mod tests {
use dynamo_runtime::{DistributedRuntime, Runtime};
use tokio::time::Instant;
/// Test helper that wraps `tokens` in a [`PrefillLoadHint`] with no expected
/// prefill duration. Always returns `Some`, including when `tokens` is zero.
fn tracking_hint(tokens: usize) -> Option<PrefillLoadHint> {
    let hint = PrefillLoadHint {
        initial_effective_prefill_tokens: tokens,
        expected_prefill_duration: None,
    };
    Some(hint)
}
#[tokio::test]
#[ignore]
async fn test_multi_worker_cross_instance_sync() -> Result<()> {
......@@ -192,11 +201,9 @@ mod tests {
SequenceRequest {
request_id: "request_0".to_string(),
token_sequence: Some(vec![0, 1, 2]),
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker: WorkerWithDpRank::new(0, 0),
lora_name: None,
},
......@@ -207,11 +214,9 @@ mod tests {
SequenceRequest {
request_id: "request_1".to_string(),
token_sequence: Some(vec![3, 4]),
isl: 8,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(8),
worker: WorkerWithDpRank::new(0, 1),
lora_name: None,
},
......@@ -222,11 +227,9 @@ mod tests {
SequenceRequest {
request_id: "request_2".to_string(),
token_sequence: Some(vec![0, 1, 2, 3]),
isl: 16,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(16),
worker: WorkerWithDpRank::new(1, 0),
lora_name: None,
},
......@@ -351,11 +354,9 @@ mod tests {
SequenceRequest {
request_id: "request_0".to_string(),
token_sequence: None,
isl: 12,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(12),
worker: WorkerWithDpRank::from_worker_id(0),
lora_name: None,
},
......@@ -366,11 +367,9 @@ mod tests {
SequenceRequest {
request_id: "request_1".to_string(),
token_sequence: None,
isl: 8,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(8),
worker: WorkerWithDpRank::from_worker_id(1),
lora_name: None,
},
......@@ -381,11 +380,9 @@ mod tests {
SequenceRequest {
request_id: "request_2".to_string(),
token_sequence: None,
isl: 16,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: None,
prefill_load_hint: None,
prefill_load_hint: tracking_hint(16),
worker: WorkerWithDpRank::from_worker_id(2),
lora_name: None,
},
......
......@@ -516,8 +516,6 @@ impl OfflineReplayRouter {
SequenceRequest {
request_id,
token_sequence: request.token_seq,
isl: request.isl_tokens,
overlap: selection.overlap_blocks,
track_prefill_tokens: request.track_prefill_tokens,
expected_output_tokens: request.expected_output_tokens,
prefill_load_hint,
......@@ -583,24 +581,25 @@ impl OfflineReplayRouter {
return None;
}
let Some(estimator) = &self.prefill_load_estimator else {
return None;
let expected_prefill_duration = match &self.prefill_load_estimator {
Some(estimator) => match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(expected_prefill_duration),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict replay prefill duration for active load tracking: {error}"
);
None
}
},
None => None,
};
match estimator.predict_prefill_duration(1, effective_isl, prefix) {
Ok(expected_prefill_duration) => Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration: Some(expected_prefill_duration),
}),
Err(error) => {
tracing::warn!(
effective_isl,
prefix,
"failed to predict replay prefill duration for active load tracking: {error}"
);
None
}
}
Some(PrefillLoadHint {
initial_effective_prefill_tokens: effective_isl,
expected_prefill_duration,
})
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment