Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
5e511e92
Unverified
Commit
5e511e92
authored
Jul 09, 2025
by
Yan Ru Pei
Committed by
GitHub
Jul 09, 2025
Browse files
feat: update active blocks in chunks only when necessary (#1848)
parent
7bc70bc9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
85 additions
and
44 deletions
+85
-44
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+30
-7
lib/llm/src/kv_router/scheduler.rs
lib/llm/src/kv_router/scheduler.rs
+7
-5
lib/llm/src/kv_router/sequence.rs
lib/llm/src/kv_router/sequence.rs
+48
-32
No files found.
lib/llm/src/kv_router.rs
View file @
5e511e92
...
@@ -206,9 +206,9 @@ impl KvRouter {
...
@@ -206,9 +206,9 @@ impl KvRouter {
Ok
((
best_worker_id
,
overlap_amount
))
Ok
((
best_worker_id
,
overlap_amount
))
}
}
/// Push
a
token to a specific request's sequence
/// Push token
s
to a specific request's sequence
pub
async
fn
push
(
&
self
,
request_id
:
&
String
,
token
:
u32
)
{
pub
async
fn
push
(
&
self
,
request_id
:
&
String
,
token
s
:
&
[
u32
]
)
{
self
.scheduler
.push
(
request_id
,
token
)
.await
self
.scheduler
.push
(
request_id
,
token
s
)
.await
}
}
/// Free all blocks associated with a request
/// Free all blocks associated with a request
...
@@ -273,6 +273,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -273,6 +273,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
.await
?
;
.await
?
;
// Update the request with the estimated prefix hit blocks
// Update the request with the estimated prefix hit blocks
let
(
mut
backend_input
,
context
)
=
request
.into_parts
();
let
(
mut
backend_input
,
context
)
=
request
.into_parts
();
let
isl
=
backend_input
.token_ids
.len
();
backend_input
.estimated_prefix_hit_num_blocks
=
Some
(
overlap_amount
);
backend_input
.estimated_prefix_hit_num_blocks
=
Some
(
overlap_amount
);
let
updated_request
=
context
.map
(|
_
|
backend_input
);
let
updated_request
=
context
.map
(|
_
|
backend_input
);
...
@@ -283,17 +284,39 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
...
@@ -283,17 +284,39 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
let
stream_context
=
response_stream
.context
();
let
stream_context
=
response_stream
.context
();
let
chooser
=
self
.chooser
.clone
();
let
chooser
=
self
.chooser
.clone
();
let
request_id
=
context_id
.clone
();
let
request_id
=
context_id
.clone
();
let
block_size
=
chooser
.block_size
()
as
usize
;
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
wrapped_stream
=
Box
::
pin
(
async_stream
::
stream!
{
let
mut
accumulated_tokens
=
Vec
::
new
();
let
mut
total_output_length
=
0u
size
;
let
mut
last_block_index
=
(
isl
.saturating_sub
(
1
))
/
block_size
;
while
let
Some
(
item
)
=
response_stream
.next
()
.await
{
while
let
Some
(
item
)
=
response_stream
.next
()
.await
{
// Track tokens if they exist in the response
// Track tokens if they exist in the response
if
let
Some
(
ref
output
)
=
item
.data
{
let
Some
(
ref
output
)
=
item
.data
else
{
for
token_id
in
&
output
.token_ids
{
yield
item
;
chooser
.push
(
&
request_id
,
*
token_id
)
.await
;
continue
;
}
};
if
output
.token_ids
.is_empty
()
{
yield
item
;
continue
;
}
// Add tokens to accumulator
accumulated_tokens
.extend_from_slice
(
&
output
.token_ids
);
total_output_length
+=
output
.token_ids
.len
();
// Check if we've moved to a new block
let
current_block_index
=
(
isl
+
total_output_length
)
.saturating_sub
(
1
)
/
block_size
;
if
current_block_index
>
last_block_index
{
chooser
.push
(
&
request_id
,
&
accumulated_tokens
)
.await
;
accumulated_tokens
.clear
();
last_block_index
=
current_block_index
;
}
}
yield
item
;
yield
item
;
}
}
chooser
.free
(
&
request_id
)
.await
;
chooser
.free
(
&
request_id
)
.await
;
});
});
...
...
lib/llm/src/kv_router/scheduler.rs
View file @
5e511e92
...
@@ -259,10 +259,10 @@ impl KvScheduler {
...
@@ -259,10 +259,10 @@ impl KvScheduler {
sequences
.add_request
(
request_id
,
token_sequence
,
worker_id
)
sequences
.add_request
(
request_id
,
token_sequence
,
worker_id
)
}
}
/// Push
a
token to a specific request's sequence
/// Push token
s
to a specific request's sequence
pub
async
fn
push
(
&
self
,
request_id
:
&
String
,
token
:
u32
)
{
pub
async
fn
push
(
&
self
,
request_id
:
&
String
,
token
s
:
&
[
u32
]
)
{
let
mut
sequences
=
self
.sequences
.lock
()
.await
;
let
mut
sequences
=
self
.sequences
.lock
()
.await
;
sequences
.push
(
request_id
,
token
)
sequences
.push
(
request_id
,
token
s
)
}
}
/// Free all blocks associated with a request
/// Free all blocks associated with a request
...
@@ -401,8 +401,10 @@ impl WorkerSelector for DefaultWorkerSelector {
...
@@ -401,8 +401,10 @@ impl WorkerSelector for DefaultWorkerSelector {
}
}
// Normalize by dividing by max value
// Normalize by dividing by max value
for
logit
in
worker_logits
.values_mut
()
{
if
max_logit
>
0.0
{
*
logit
/=
max_logit
;
for
logit
in
worker_logits
.values_mut
()
{
*
logit
/=
max_logit
;
}
}
}
// Use softmax sampling to select worker
// Use softmax sampling to select worker
...
...
lib/llm/src/kv_router/sequence.rs
View file @
5e511e92
...
@@ -185,32 +185,50 @@ impl ActiveSequences {
...
@@ -185,32 +185,50 @@ impl ActiveSequences {
self
.active_blocks
self
.active_blocks
}
}
/// Push a token to a specific request's sequence
/// Push tokens to a specific request's sequence
pub
fn
push
(
&
mut
self
,
request_id
:
&
RequestId
,
token
:
u32
)
->
usize
{
pub
fn
push
(
&
mut
self
,
request_id
:
&
RequestId
,
tokens
:
&
[
u32
])
->
usize
{
let
token_seq
=
self
// Collect operations to perform after releasing the borrow
.active_seqs
let
mut
blocks_to_remove
=
Vec
::
new
();
.get_mut
(
request_id
)
let
mut
blocks_to_add
=
Vec
::
new
();
.expect
(
"Request ID not found for token push"
);
token_seq
.append
(
token
)
.expect
(
"Token push failed."
);
{
let
token_seq
=
self
// No need to update anything
.active_seqs
if
token_seq
.total_tokens
()
%
self
.block_size
!=
1
{
.get_mut
(
request_id
)
return
self
.active_blocks
;
.expect
(
"Request ID not found for token push"
);
}
for
&
token
in
tokens
{
token_seq
.append
(
token
)
.expect
(
"Token push failed."
);
// Guard: skip if we didn't cross a block boundary
if
token_seq
.total_tokens
()
%
self
.block_size
!=
1
{
continue
;
}
let
last_seq_hash
=
token_seq
let
last_seq_hash
=
token_seq
.last_complete_block
()
.last_complete_block
()
.map
(|
block
|
block
.sequence_hash
());
.map
(|
block
|
block
.sequence_hash
());
// Promote a partial block into a full block if not already
// Queue operations for later
if
let
Some
(
partial_block
)
=
self
.partial_blocks
.get
(
request_id
)
.cloned
()
{
if
let
Some
(
partial_block
)
=
self
.partial_blocks
.get
(
request_id
)
.cloned
()
{
self
.remove_block
(
request_id
,
&
partial_block
);
blocks_to_remove
.push
(
partial_block
);
}
}
if
let
Some
(
full_block
)
=
last_seq_hash
{
if
let
Some
(
full_block
)
=
last_seq_hash
{
self
.add_block
(
request_id
.clone
(),
&
UniqueBlock
::
FullBlock
(
full_block
));
blocks_to_add
.push
(
UniqueBlock
::
FullBlock
(
full_block
));
}
blocks_to_add
.push
(
UniqueBlock
::
default
());
}
}
// token_seq borrow is dropped here
// Now perform all the queued operations
for
block
in
blocks_to_remove
{
self
.remove_block
(
request_id
,
&
block
);
}
}
self
.add_block
(
request_id
.clone
(),
&
UniqueBlock
::
default
());
for
block
in
blocks_to_add
{
self
.add_block
(
request_id
.clone
(),
&
block
);
}
self
.active_blocks
self
.active_blocks
}
}
...
@@ -227,7 +245,7 @@ enum UpdateSequences {
...
@@ -227,7 +245,7 @@ enum UpdateSequences {
},
},
Push
{
Push
{
request_id
:
RequestId
,
request_id
:
RequestId
,
token
:
u32
,
token
s
:
Vec
<
u32
>
,
// Changed from token:
u32
},
},
NewBlocks
{
NewBlocks
{
token_sequence
:
Arc
<
TokenBlockSequence
>
,
token_sequence
:
Arc
<
TokenBlockSequence
>
,
...
@@ -290,8 +308,8 @@ impl ActiveSequencesMultiWorker {
...
@@ -290,8 +308,8 @@ impl ActiveSequencesMultiWorker {
UpdateSequences
::
Free
{
request_id
}
=>
{
UpdateSequences
::
Free
{
request_id
}
=>
{
active_sequences
.free
(
&
request_id
);
active_sequences
.free
(
&
request_id
);
}
}
UpdateSequences
::
Push
{
request_id
,
token
}
=>
{
UpdateSequences
::
Push
{
request_id
,
token
s
}
=>
{
active_sequences
.push
(
&
request_id
,
token
);
active_sequences
.push
(
&
request_id
,
&
token
s
);
// Changed to pass tokens slice
}
}
UpdateSequences
::
NewBlocks
{
UpdateSequences
::
NewBlocks
{
token_sequence
,
token_sequence
,
...
@@ -393,7 +411,7 @@ impl ActiveSequencesMultiWorker {
...
@@ -393,7 +411,7 @@ impl ActiveSequencesMultiWorker {
self
.request_to_worker
.remove
(
request_id
);
self
.request_to_worker
.remove
(
request_id
);
}
}
pub
fn
push
(
&
mut
self
,
request_id
:
&
RequestId
,
token
:
u32
)
{
pub
fn
push
(
&
mut
self
,
request_id
:
&
RequestId
,
token
s
:
&
[
u32
]
)
{
let
worker_id
=
self
let
worker_id
=
self
.request_to_worker
.request_to_worker
.get
(
request_id
)
.get
(
request_id
)
...
@@ -402,7 +420,7 @@ impl ActiveSequencesMultiWorker {
...
@@ -402,7 +420,7 @@ impl ActiveSequencesMultiWorker {
self
.senders
[
&
worker_id
]
self
.senders
[
&
worker_id
]
.send
(
UpdateSequences
::
Push
{
.send
(
UpdateSequences
::
Push
{
request_id
:
request_id
.clone
(),
request_id
:
request_id
.clone
(),
token
,
token
s
:
tokens
.to_vec
(),
// Convert to Vec
})
})
.expect
(
"Failed to send push command to worker"
);
.expect
(
"Failed to send push command to worker"
);
}
}
...
@@ -498,8 +516,7 @@ mod tests {
...
@@ -498,8 +516,7 @@ mod tests {
// Step 1: Add request 0 with tokens [0, 1, 2], then push 3 and 4
// Step 1: Add request 0 with tokens [0, 1, 2], then push 3 and 4
manager
.add_request
(
"0"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]));
manager
.add_request
(
"0"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]));
manager
.push
(
&
"0"
.to_string
(),
3
);
manager
.push
(
&
"0"
.to_string
(),
&
[
3
,
4
]);
// Push both tokens at once
manager
.push
(
&
"0"
.to_string
(),
4
);
assert_eq!
(
manager
.active_blocks
(),
2
);
assert_eq!
(
manager
.active_blocks
(),
2
);
assert_eq!
(
manager
.partial_blocks
.len
(),
1
);
assert_eq!
(
manager
.partial_blocks
.len
(),
1
);
...
@@ -551,10 +568,9 @@ mod tests {
...
@@ -551,10 +568,9 @@ mod tests {
// Send request [0, 1, 2, 3] to worker 0
// Send request [0, 1, 2, 3] to worker 0
manager
.add_request
(
"req0"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
,
3
]),
0
);
manager
.add_request
(
"req0"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
,
3
]),
0
);
// Send request [0, 1, 2] to worker 1, then push 3 and
push
4
// Send request [0, 1, 2] to worker 1, then push 3 and 4
manager
.add_request
(
"req1"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]),
1
);
manager
.add_request
(
"req1"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]),
1
);
manager
.push
(
&
"req1"
.to_string
(),
3
);
manager
.push
(
&
"req1"
.to_string
(),
&
[
3
,
4
]);
// Push both tokens at once
manager
.push
(
&
"req1"
.to_string
(),
4
);
// Send request [0, 1, 2] to worker 2
// Send request [0, 1, 2] to worker 2
manager
.add_request
(
"req2"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]),
2
);
manager
.add_request
(
"req2"
.to_string
(),
to_sequence
(
vec!
[
0
,
1
,
2
]),
2
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment