hotfix: avoid non-prefilled block use when using prefix caching (#2489)

The minimum batch size logic could cause prefix blocks to be deallocated before they had been prefilled. The next allocation of the same prefix would then reuse those blocks, which contain garbage.

hotfix: avoid non-prefilled block use when using prefix caching (#2489)
The minimum batch size logic could cause prefix blocks to be deallocated before they had been prefilled. The next allocation of the same prefix would then reuse those blocks, which contain garbage.
deec30f8 · Daniël de Kok · GitHub · 6cb42f49 · deec30f8
Unverified Commit deec30f8 authored Sep 05, 2024 by Daniël de Kok Committed by GitHub Sep 05, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

backends/v3/src/backend.rs backends/v3/src/backend.rs +5 -2

No files found.
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
    mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: u32,
    max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                    None
                } else {
                    // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    // TODO: temporarily disable to avoid incorrect deallocation +
+                    //       reallocation when using prefix caching.
+                    // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    None
                };
                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);