Prefix Caching - fix T4 Triton error (#2517)

64da65b3 · shiyi.c_98 · GitHub · 5255d99d · 64da65b3
Unverified commit 64da65b3, authored Feb 16, 2024 by shiyi.c_98; committed by GitHub on Feb 16, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 1 deletion

vllm/model_executor/layers/triton_kernel/prefix_prefill.py (+3 -1)

No files found.
--- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
+++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -618,7 +618,9 @@ if triton.__version__ >= "2.1.0":
                              b_ctx_len,
                              max_input_len,
                              alibi_slopes=None):
-        BLOCK = 128
+        cap = torch.cuda.get_device_capability()
+        BLOCK = 128 if cap[0] >= 8 else 64
        # shape constraints
        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
        assert Lq == Lk and Lk == Lv