Unverified Commit deec30f8 authored by Daniël de Kok's avatar Daniël de Kok Committed by GitHub
Browse files

hotfix: avoid non-prefilled block use when using prefix caching (#2489)

The minimum batch size logic could cause prefix blocks to be
deallocated without prefill. The next allocation of the same
prefix would then use garbage blocks.
parent 6cb42f49
......@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
#[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task(
mut client: ShardedClient,
waiting_served_ratio: f32,
_waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32,
max_waiting_tokens: usize,
......@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
None
} else {
// Minimum batch size
Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
// TODO: temporarily disable to avoid incorrect deallocation +
// reallocation when using prefix caching.
// Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
None
};
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment