Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
803bfa81
"components/vscode:/vscode.git/clone" did not exist on "4a73b3c661877fc19c8d253c97178f22b43142c8"
Unverified
Commit
803bfa81
authored
Jul 28, 2025
by
Yan Ru Pei
Committed by
GitHub
Jul 28, 2025
Browse files
feat: proper local hashes for mockers + router watches endpoints (#2132)
parent
e82bc4ec
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
147 additions
and
95 deletions
+147
-95
components/router/src/main.rs
components/router/src/main.rs
+2
-2
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+64
-56
lib/llm/src/kv_router/scheduler.rs
lib/llm/src/kv_router/scheduler.rs
+26
-19
lib/llm/src/mocker/protocols.rs
lib/llm/src/mocker/protocols.rs
+28
-11
lib/llm/src/mocker/scheduler.rs
lib/llm/src/mocker/scheduler.rs
+14
-5
lib/llm/src/mocker/sequence.rs
lib/llm/src/mocker/sequence.rs
+9
-1
tests/router/test_router_e2e_with_mockers.py
tests/router/test_router_e2e_with_mockers.py
+4
-1
No files found.
components/router/src/main.rs
View file @
803bfa81
...
...
@@ -27,9 +27,9 @@ use clap::Parser;
use
dynamo_llm
::
kv_router
::{
protocols
::
WorkerSelectionResult
,
scheduler
::{
DefaultWorkerSelector
,
KvSchedulerError
,
SchedulingRequest
},
scoring
::
ProcessedEndpoints
,
KvRouter
,
WorkerSelector
,
};
use
dynamo_runtime
::
component
::
Instance
;
use
dynamo_runtime
::{
logging
,
pipeline
::
network
::
Ingress
,
DistributedRuntime
,
Result
,
Runtime
,
Worker
,
};
...
...
@@ -86,7 +86,7 @@ pub struct CustomWorkerSelector(DefaultWorkerSelector);
impl
WorkerSelector
for
CustomWorkerSelector
{
fn
select_worker
(
&
self
,
workers
:
&
ProcessedEndpoints
,
workers
:
&
[
Instance
]
,
request
:
&
SchedulingRequest
,
block_size
:
u32
,
)
->
Result
<
WorkerSelectionResult
,
KvSchedulerError
>
{
...
...
lib/llm/src/kv_router.rs
View file @
803bfa81
...
...
@@ -34,7 +34,7 @@ use crate::{
compute_block_hash_for_seq
,
KvIndexer
,
KvIndexerInterface
,
KvRouterError
,
OverlapScores
,
RouterEvent
,
},
metrics_aggregator
::
EndpointCollector
,
//
metrics_aggregator::EndpointCollector,
protocols
::{
LocalBlockHash
,
RouterRequest
,
RouterResponse
,
WorkerSelectionResult
},
scheduler
::{
KvScheduler
,
KvSchedulerError
,
SchedulingRequest
},
scoring
::
ProcessedEndpoints
,
...
...
@@ -43,6 +43,7 @@ use crate::{
protocols
::
common
::
llm_backend
::
LLMEngineOutput
,
};
use
dynamo_runtime
::
component
::
Instance
;
use
dynamo_runtime
::
traits
::
events
::
EventSubscriber
;
// [gluo TODO] shouldn't need to be public
...
...
@@ -55,7 +56,7 @@ pub const KV_METRICS_ENDPOINT: &str = "load_metrics";
pub
trait
WorkerSelector
{
fn
select_worker
(
&
self
,
workers
:
&
ProcessedEndpoints
,
workers
:
&
[
Instance
]
,
request
:
&
SchedulingRequest
,
block_size
:
u32
,
)
->
Result
<
WorkerSelectionResult
,
KvSchedulerError
>
;
...
...
@@ -151,8 +152,16 @@ impl KvRouter {
.primary_lease
()
.expect
(
"Cannot KV route static workers"
)
.primary_token
();
let
metrics_aggregator
=
EndpointCollector
::
new
(
component
.clone
(),
cancellation_token
.clone
())
.await
;
let
generate_endpoint
=
component
.endpoint
(
"generate"
);
let
client
=
generate_endpoint
.client
()
.await
?
;
let
instances_rx
=
match
client
.instance_source
.as_ref
()
{
InstanceSource
::
Dynamic
(
rx
)
=>
rx
.clone
(),
InstanceSource
::
Static
=>
{
panic!
(
"Expected dynamic instance source for KV routing"
);
}
};
let
indexer
=
if
use_kv_events
{
Indexer
::
KvIndexer
(
KvIndexer
::
new
(
cancellation_token
.clone
(),
block_size
))
...
...
@@ -168,7 +177,7 @@ impl KvRouter {
let
scheduler
=
KvScheduler
::
start
(
component
.namespace
()
.clone
(),
block_size
,
metrics_aggregator
.endpoints_watcher
()
,
instances_rx
,
selector
,
)
.await
?
;
...
...
@@ -325,6 +334,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
let
isl
=
backend_input
.token_ids
.len
();
backend_input
.estimated_prefix_hit_num_blocks
=
Some
(
overlap_amount
);
let
updated_request
=
context
.map
(|
_
|
backend_input
);
// if request has the annotation "query_instance_id", for example
// curl -d '{... ,"nvext": { "annotations": ["query_instance_id"]}}'
// request will not be routed to worker immediately
...
...
@@ -333,61 +343,59 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
let
response
=
Annotated
::
from_annotation
(
"worker_instance_id"
,
&
instance_id_str
)
?
;
let
stream
=
stream
::
iter
(
vec!
[
response
]);
Ok
(
ResponseStream
::
new
(
Box
::
pin
(
stream
),
stream_context
))
}
else
{
// Get the response stream from the worker
let
mut
response_stream
=
self
.inner
.direct
(
updated_request
,
instance_id
)
.await
?
;
// Wrap the stream to track tokens
let
stream_context
=
response_stream
.context
();
let
chooser
=
self
.chooser
.clone
();
let
request_id
=
context_id
.clone
();
let
block_size
=
chooser
.block_size
()
as
usize
;
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
mut
accumulated_tokens
=
Vec
::
new
();
let
mut
total_output_length
=
0u
size
;
let
mut
last_block_index
=
(
isl
.saturating_sub
(
1
))
/
block_size
;
let
mut
first_push_done
=
false
;
while
let
Some
(
item
)
=
response_stream
.next
()
.await
{
// Track tokens if they exist in the response
let
Some
(
ref
output
)
=
item
.data
else
{
yield
item
;
continue
;
};
if
output
.token_ids
.is_empty
()
{
yield
item
;
continue
;
}
return
Ok
(
ResponseStream
::
new
(
Box
::
pin
(
stream
),
stream_context
));
}
// Get the response stream from the worker
let
mut
response_stream
=
self
.inner
.direct
(
updated_request
,
instance_id
)
.await
?
;
// Wrap the stream to track tokens
let
stream_context
=
response_stream
.context
();
let
chooser
=
self
.chooser
.clone
();
let
request_id
=
context_id
.clone
();
let
block_size
=
chooser
.block_size
()
as
usize
;
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
mut
accumulated_tokens
=
Vec
::
new
();
let
mut
total_output_length
=
0u
size
;
let
mut
last_block_index
=
(
isl
.saturating_sub
(
1
))
/
block_size
;
let
mut
first_push_done
=
false
;
while
let
Some
(
item
)
=
response_stream
.next
()
.await
{
// Track tokens if they exist in the response
let
Some
(
ref
output
)
=
item
.data
else
{
yield
item
;
continue
;
};
if
output
.token_ids
.is_empty
()
{
yield
item
;
continue
;
}
// Add tokens to accumulator
accumulated_tokens
.extend_from_slice
(
&
output
.token_ids
);
total_output_length
+=
output
.token_ids
.len
();
// Always push for the first generated token (to mark prefill done)
// or when we've moved to a new block
let
current_block_index
=
(
isl
+
total_output_length
)
.saturating_sub
(
1
)
/
block_size
;
let
should_push
=
(
!
first_push_done
&&
total_output_length
>=
1
)
||
(
first_push_done
&&
current_block_index
>
last_block_index
);
if
should_push
{
chooser
.push
(
&
request_id
,
&
accumulated_tokens
)
.await
;
accumulated_tokens
.clear
();
last_block_index
=
current_block_index
;
if
!
first_push_done
{
first_push_done
=
true
;
}
// Add tokens to accumulator
accumulated_tokens
.extend_from_slice
(
&
output
.token_ids
);
total_output_length
+=
output
.token_ids
.len
();
// Always push for the first generated token (to mark prefill done)
// or when we've moved to a new block
let
current_block_index
=
(
isl
+
total_output_length
)
.saturating_sub
(
1
)
/
block_size
;
let
should_push
=
(
!
first_push_done
&&
total_output_length
>=
1
)
||
(
first_push_done
&&
current_block_index
>
last_block_index
);
if
should_push
{
chooser
.push
(
&
request_id
,
&
accumulated_tokens
)
.await
;
accumulated_tokens
.clear
();
last_block_index
=
current_block_index
;
if
!
first_push_done
{
first_push_done
=
true
;
}
yield
item
;
}
chooser
.free
(
&
request_id
)
.await
;
});
Ok
(
ResponseStream
::
new
(
wrapped_stream
,
stream_context
))
}
yield
item
;
}
chooser
.free
(
&
request_id
)
.await
;
});
Ok
(
ResponseStream
::
new
(
wrapped_stream
,
stream_context
))
}
}
}
...
...
lib/llm/src/kv_router/scheduler.rs
View file @
803bfa81
...
...
@@ -26,11 +26,11 @@ use super::protocols::WorkerSelectionResult;
use
super
::
WorkerSelector
;
use
crate
::
kv_router
::
indexer
::
OverlapScores
;
use
crate
::
kv_router
::
protocols
::
LoadMetrics
;
use
crate
::
kv_router
::
scoring
::
ProcessedEndpoints
;
use
crate
::
kv_router
::
sequence
::
ActiveSequencesMultiWorker
;
use
crate
::
kv_router
::
KvRouterConfig
;
use
crate
::
kv_router
::
KV_HIT_RATE_SUBJECT
;
use
crate
::
tokens
::
TokenBlockSequence
;
use
dynamo_runtime
::
component
::
Instance
;
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
KVHitRateEvent
{
...
...
@@ -107,12 +107,14 @@ impl KvScheduler {
pub
async
fn
start
(
ns
:
Namespace
,
block_size
:
u32
,
endpoint
s_rx
:
tokio
::
sync
::
watch
::
Receiver
<
ProcessedEndpoints
>
,
mut
instance
s_rx
:
tokio
::
sync
::
watch
::
Receiver
<
Vec
<
Instance
>>
,
// Changed from
ProcessedEndpoints
selector
:
Option
<
Box
<
dyn
WorkerSelector
+
Send
+
Sync
>>
,
)
->
Result
<
Self
,
KvSchedulerError
>
{
let
selector
=
selector
.unwrap_or
(
Box
::
new
(
DefaultWorkerSelector
::
default
()));
let
mut
endpoints_rx
=
endpoints_rx
;
let
mut
endpoints
:
ProcessedEndpoints
=
endpoints_rx
.borrow_and_update
()
.clone
();
let
mut
instances
:
Vec
<
Instance
>
=
instances_rx
.borrow_and_update
()
.clone
();
// Get worker IDs from instances
let
worker_ids
:
Vec
<
i64
>
=
instances
.iter
()
.map
(|
i
|
i
.instance_id
)
.collect
();
let
(
event_tx
,
event_rx
)
=
tokio
::
sync
::
mpsc
::
unbounded_channel
::
<
KVHitRateEvent
>
();
tokio
::
spawn
(
async
move
{
...
...
@@ -126,7 +128,7 @@ impl KvScheduler {
let
sequences
=
Arc
::
new
(
Mutex
::
new
(
ActiveSequencesMultiWorker
::
new
(
block_size
as
usize
,
endpoints
.
worker_ids
()
,
worker_ids
,
)));
// Channel to accept new scheduling requests
...
...
@@ -142,9 +144,10 @@ impl KvScheduler {
request
=
tokio
::
select!
{
biased
;
_
=
endpoints_rx
.changed
()
=>
{
endpoints
=
endpoints_rx
.borrow_and_update
()
.clone
();
pending_endpoint_update
=
Some
(
endpoints
.worker_ids
());
_
=
instances_rx
.changed
()
=>
{
instances
=
instances_rx
.borrow_and_update
()
.clone
();
let
worker_ids
:
Vec
<
i64
>
=
instances
.iter
()
.map
(|
i
|
i
.instance_id
)
.collect
();
pending_endpoint_update
=
Some
(
worker_ids
);
continue
'outer
;
}
...
...
@@ -159,7 +162,8 @@ impl KvScheduler {
};
loop
{
match
selector
.select_worker
(
&
endpoints
,
&
request
,
block_size
)
{
// When calling selector.select_worker, we need to adapt
match
selector
.select_worker
(
&
instances
,
&
request
,
block_size
)
{
Ok
(
selection
)
=>
{
if
let
Err
(
e
)
=
event_tx
.send
(
KVHitRateEvent
{
worker_id
:
selection
.worker_id
,
...
...
@@ -179,9 +183,11 @@ impl KvScheduler {
}
Err
(
KvSchedulerError
::
NoEndpoints
)
=>
{
tracing
::
trace!
(
"no endpoints available; waiting for endpoints update"
);
endpoints_rx
.changed
()
.await
.ok
();
endpoints
=
endpoints_rx
.borrow_and_update
()
.clone
();
pending_endpoint_update
=
Some
(
endpoints
.worker_ids
());
instances_rx
.changed
()
.await
.ok
();
instances
=
instances_rx
.borrow_and_update
()
.clone
();
let
worker_ids
:
Vec
<
i64
>
=
instances
.iter
()
.map
(|
i
|
i
.instance_id
)
.collect
();
pending_endpoint_update
=
Some
(
worker_ids
);
continue
;
}
// TODO: this is not actually hooked up
...
...
@@ -353,13 +359,13 @@ impl DefaultWorkerSelector {
impl
WorkerSelector
for
DefaultWorkerSelector
{
fn
select_worker
(
&
self
,
workers
:
&
ProcessedEndpoints
,
workers
:
&
[
Instance
]
,
request
:
&
SchedulingRequest
,
block_size
:
u32
,
)
->
Result
<
WorkerSelectionResult
,
KvSchedulerError
>
{
assert
!
(
request
.isl_tokens
>
0
);
if
workers
.
endpoints
.
is_empty
()
{
if
workers
.is_empty
()
{
return
Err
(
KvSchedulerError
::
NoEndpoints
);
}
...
...
@@ -376,9 +382,10 @@ impl WorkerSelector for DefaultWorkerSelector {
let
mut
max_logit
=
f64
::
NEG_INFINITY
;
// Calculate logits for each worker
for
(
worker_id
,
_
)
in
workers
.endpoints
.iter
()
{
for
instance
in
workers
.iter
()
{
let
worker_id
=
instance
.instance_id
;
// this is the number of tokens each worker would have if the request were scheduled there
let
potential_tokens
=
*
potential_active_tokens
.get
(
worker_id
)
.unwrap_or_else
(||
{
let
potential_tokens
=
*
potential_active_tokens
.get
(
&
worker_id
)
.unwrap_or_else
(||
{
tracing
::
warn!
(
"assuming {isl} tokens for {worker_id}, as the endpoint does not exist yet"
);
...
...
@@ -386,7 +393,7 @@ impl WorkerSelector for DefaultWorkerSelector {
})
as
f64
;
// this is the number of blocks each worker would have if the request were scheduled there
let
potential_blocks
=
*
potential_active_blocks
.get
(
worker_id
)
.unwrap_or_else
(||
let
potential_blocks
=
*
potential_active_blocks
.get
(
&
worker_id
)
.unwrap_or_else
(||
{
tracing
::
warn!
(
"assuming {request_blocks} decoding blocks for {worker_id}, as the endpoint does not exist yet"
);
&
request_blocks
})
as
f64
;
...
...
@@ -398,12 +405,12 @@ impl WorkerSelector for DefaultWorkerSelector {
+
potential_blocks
;
max_logit
=
max_logit
.max
(
logit
);
worker_logits
.insert
(
*
worker_id
,
logit
);
worker_logits
.insert
(
worker_id
,
logit
);
tracing
::
info!
(
"Formula for {worker_id}: {logit:.3} = {:.1} * {potential_prefill_blocks:.3} + {potential_blocks:.3} (cached_blocks: {})"
,
self
.kv_router_config.overlap_score_weight
,
overlaps
.get
(
worker_id
)
.unwrap_or
(
&
0
),
overlaps
.get
(
&
worker_id
)
.unwrap_or
(
&
0
),
);
}
...
...
lib/llm/src/mocker/protocols.rs
View file @
803bfa81
...
...
@@ -24,9 +24,8 @@ use crate::kv_router::protocols::{
KvCacheStoredBlockData
,
LocalBlockHash
,
};
use
crate
::
tokens
::
blocks
::
UniqueBlock
;
use
crate
::
tokens
::{
BlockHash
,
SequenceHash
,
Token
};
pub
type
Token
=
u32
;
pub
type
GlobalHash
=
u64
;
pub
type
NumBlocks
=
usize
;
/// Represents different block movement operations in the cache
...
...
@@ -36,13 +35,13 @@ pub enum MoveBlock {
Use
(
Vec
<
UniqueBlock
>
),
Destroy
(
Vec
<
UniqueBlock
>
),
Deref
(
Vec
<
UniqueBlock
>
),
Promote
(
Uuid
,
Global
Hash
,
Option
<
u64
>
),
Promote
(
Uuid
,
Sequence
Hash
,
Option
<
u64
>
),
}
#[derive(Debug,
Clone,
PartialEq,
Serialize,
Deserialize)]
pub
enum
MoveBlockResponse
{
Store
(
Vec
<
Global
Hash
>
,
Option
<
u64
>
),
Remove
(
Vec
<
Global
Hash
>
),
Store
(
Vec
<
Sequence
Hash
>
,
Option
<
u64
>
),
Remove
(
Vec
<
Sequence
Hash
>
),
}
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
...
...
@@ -222,18 +221,36 @@ impl MockEngineArgs {
}
}
/// Note: This assumes block_hash and tokens_hash are the same, which is not correct in rare cases
/// where the sequence-aware hash differs from the token content hash.
pub
fn
block_response_to_kv_event
(
response
:
MoveBlockResponse
)
->
KvCacheEventData
{
/// Converts a MoveBlockResponse from the mocker backend into a KvCacheEventData.
///
/// This function assumes that the stored sequence hashes in the response always
/// correspond to the tail part of the local hashes array. This is the expected
/// behavior of KV block storage, where blocks are stored sequentially and the
/// response contains the most recent blocks that were stored.
///
/// # Panics
/// Panics if the number of blocks in the Store response exceeds the length
/// of local_hashes.
pub
fn
block_response_to_kv_event
(
response
:
MoveBlockResponse
,
local_hashes
:
&
[
BlockHash
],
)
->
KvCacheEventData
{
match
response
{
MoveBlockResponse
::
Store
(
full_blocks
,
parent_hash
)
=>
{
let
num_blocks
=
full_blocks
.len
();
let
local_hashes_slice
=
&
local_hashes
[
local_hashes
.len
()
.checked_sub
(
num_blocks
)
.expect
(
"local hashes fewer than block response signal"
)
..
];
KvCacheEventData
::
Stored
(
KvCacheStoreData
{
parent_hash
:
parent_hash
.map
(
ExternalSequenceBlockHash
),
blocks
:
full_blocks
.into_iter
()
.map
(|
block
|
KvCacheStoredBlockData
{
block_hash
:
ExternalSequenceBlockHash
(
block
),
tokens_hash
:
LocalBlockHash
(
block
),
.zip
(
local_hashes_slice
.iter
())
.map
(|(
global_hash
,
local_hash
)|
KvCacheStoredBlockData
{
block_hash
:
ExternalSequenceBlockHash
(
global_hash
),
tokens_hash
:
LocalBlockHash
(
*
local_hash
),
})
.collect
(),
})
...
...
lib/llm/src/mocker/scheduler.rs
View file @
803bfa81
...
...
@@ -47,6 +47,7 @@ use crate::mocker::protocols::{block_response_to_kv_event, MoveBlock, OutputSign
use
crate
::
mocker
::
protocols
::{
DirectRequest
,
MockEngineArgs
,
MoveBlockResponse
};
use
crate
::
mocker
::
sequence
::
ActiveSequence
;
use
crate
::
tokens
::
blocks
::
UniqueBlock
;
use
crate
::
tokens
::
BlockHash
;
use
std
::
collections
::
HashMap
;
use
std
::
collections
::
VecDeque
;
use
std
::
sync
::
Arc
;
...
...
@@ -123,8 +124,9 @@ impl SchedulerState {
/// Returns `Some((prefill_compute, creation_signal, is_full_prefill))` where:
/// - `prefill_compute`: The compute time in milliseconds for this prefill operation
/// - `creation_signal`: Optional MoveBlock signal for KV cache block creation
/// - `block_hashes`: Block hashes of the sequence beign prefilled
/// - `is_full_prefill`: true if the entire sequence was prefilled, false if chunked
fn
try_prefill
(
&
mut
self
)
->
Option
<
(
f64
,
Option
<
MoveBlock
>
,
bool
)
>
{
fn
try_prefill
(
&
mut
self
)
->
Option
<
(
f64
,
Option
<
MoveBlock
>
,
Vec
<
BlockHash
>
,
bool
)
>
{
let
uuid
=
self
.prefill
.pop_front
()
?
;
// Remove and extract prefill_compute from prefill_costs
...
...
@@ -179,6 +181,7 @@ impl SchedulerState {
Some
((
prefill_compute
,
sequence
.take_creation_signal
(),
sequence
.block_hashes
(),
is_full_prefill
,
))
}
...
...
@@ -401,8 +404,12 @@ impl Scheduler {
let
mut
total_time
=
Duration
::
from_secs_f64
(
decoding_time
/
1000.0
);
// Process prefilling
while
let
Some
((
prefill_compute
,
maybe_creation_signal
,
is_full_prefill
))
=
state_guard
.try_prefill
()
while
let
Some
((
prefill_compute
,
maybe_creation_signal
,
block_hashes
,
is_full_prefill
,
))
=
state_guard
.try_prefill
()
{
// NOTE: Prefill cost/time is always incremented for new blocks, even if they
// could be cached by other requests in the same batch. This matches vLLM behavior.
...
...
@@ -421,7 +428,8 @@ impl Scheduler {
(
&
kv_events_tx
,
&
mut
block_resp_rx
)
{
while
let
Ok
(
event
)
=
rx
.try_recv
()
{
let
_
=
relay_tx
.send
(
block_response_to_kv_event
(
event
));
let
_
=
relay_tx
.send
(
block_response_to_kv_event
(
event
,
&
block_hashes
));
}
}
};
...
...
@@ -460,7 +468,8 @@ impl Scheduler {
(
&
kv_events_tx
,
&
mut
block_resp_rx
)
{
while
let
Ok
(
event
)
=
rx
.try_recv
()
{
let
_
=
relay_tx
.send
(
block_response_to_kv_event
(
event
));
let
_
=
relay_tx
.send
(
block_response_to_kv_event
(
event
,
&
sequence
.block_hashes
()));
}
}
...
...
lib/llm/src/mocker/sequence.rs
View file @
803bfa81
...
...
@@ -90,7 +90,7 @@ impl ActiveSequence {
assert
!
(
block_size
>
1
,
"block_size must be greater than 1"
);
let
num_input_tokens
=
tokens
.len
();
let
tokens
=
Tokens
::
from
(
tokens
)
.into_sequence
(
block_size
as
u32
,
None
);
let
tokens
=
Tokens
::
from
(
tokens
)
.into_sequence
(
block_size
as
u32
,
Some
(
1337
)
);
let
unique_blocks
=
create_unique_blocks_from_sequence
(
&
tokens
,
None
,
block_size
,
enable_prefix_caching
);
let
creation_signal
=
Some
(
MoveBlock
::
Use
(
unique_blocks
.clone
()));
...
...
@@ -124,6 +124,14 @@ impl ActiveSequence {
self
.creation_signal
.take
()
}
pub
fn
block_hashes
(
&
self
)
->
Vec
<
u64
>
{
self
.tokens
.blocks
()
.iter
()
.map
(|
block
|
block
.block_hash
())
.collect
()
}
/// Create a new ActiveSequence instance and return the creation signal
pub
fn
new_with_signal
(
tokens
:
Vec
<
u32
>
,
...
...
tests/router/test_router_e2e_with_mockers.py
View file @
803bfa81
...
...
@@ -18,6 +18,7 @@ logger = logging.getLogger(__name__)
MODEL_NAME
=
"Qwen/Qwen3-0.6B"
NUM_MOCKERS
=
2
BLOCK_SIZE
=
16
SPEEDUP_RATIO
=
10.0
NUM_REQUESTS
=
100
PORT
=
8090
# Starting port for mocker instances
...
...
@@ -59,6 +60,8 @@ class KVRouterProcess(ManagedProcess):
"python"
,
"-m"
,
"dynamo.frontend"
,
"--kv-cache-block-size"
,
str
(
BLOCK_SIZE
),
"--router-mode"
,
"kv"
,
"--http-port"
,
...
...
@@ -100,7 +103,7 @@ def test_mocker_kv_router(request, runtime_services):
logger
.
info
(
"Starting mocker KV router test"
)
# Create mocker args file
mocker_args
=
{
"speedup_ratio"
:
SPEEDUP_RATIO
}
mocker_args
=
{
"speedup_ratio"
:
SPEEDUP_RATIO
,
"block_size"
:
BLOCK_SIZE
}
mocker_args_file
=
os
.
path
.
join
(
request
.
node
.
name
,
"mocker_args.json"
)
with
open
(
mocker_args_file
,
"w"
)
as
f
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment