Use the optimized block sizes after tuning the kernel. (#14329)

1e3598ed · iefgnoix · GitHub · f7a6bd0f · 1e3598ed
Unverified commit 1e3598ed, authored Mar 07, 2025 by iefgnoix; committed by GitHub on Mar 07, 2025
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/v1/attention/backends/pallas.py +2 -2

No files found.
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState

 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256


 class PallasAttentionBackend(AttentionBackend):