Unverified commit deec30f8, authored by Daniël de Kok, committed by GitHub
Browse files

hotfix: avoid non-prefilled block use when using prefix caching (#2489)

The minimum batch size logic could cause prefix blocks to be
deallocated without prefill. The next allocation of the same
prefix would then use garbage blocks.
parent 6cb42f49
...@@ -122,7 +122,7 @@ impl Backend for BackendV3 { ...@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task( pub(crate) async fn batching_task(
mut client: ShardedClient, mut client: ShardedClient,
waiting_served_ratio: f32, _waiting_served_ratio: f32,
max_batch_prefill_tokens: u32, max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32, max_batch_total_tokens: u32,
max_waiting_tokens: usize, max_waiting_tokens: usize,
...@@ -168,7 +168,10 @@ pub(crate) async fn batching_task( ...@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
None None
} else { } else {
// Minimum batch size // Minimum batch size
Some((batch_size as f32 * waiting_served_ratio).floor() as usize) // TODO: temporarily disable to avoid incorrect deallocation +
// reallocation when using prefix caching.
// Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
None
}; };
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment