Fix a regression introduced by overlapping KV cache writing (#4375)

4fea040c · Lianmin Zheng · GitHub · 6aaeb848 · 4fea040c
Commit 4fea040c (unverified) — authored by Lianmin Zheng on Mar 13, 2025; committed by GitHub on Mar 13, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 1 deletion

python/sglang/srt/mem_cache/memory_pool.py (+1 -1)

No files found.
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -326,7 +326,7 @@ class MHATokenToKVPool(KVCache):
            cache_k = cache_k.view(self.store_dtype)
            cache_v = cache_v.view(self.store_dtype)

-        if self.capture_mode:
+        if self.capture_mode and cache_k.shape[0] < 4:
            self.alt_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(self.alt_stream):
                self.k_buffer[layer_id][loc] = cache_k