Unverified commit 1e3598ed, authored by iefgnoix and committed by GitHub
Browse files

Use the optimized block sizes after tuning the kernel. (#14329)

parent f7a6bd0f
@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 class PallasAttentionBackend(AttentionBackend):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment