[Model][AMD] ROCm support for 256 head dims for Gemma (#3972)

8b317c6d · James Whedbee · GitHub · bd3c144e · 8b317c6d
Unverified commit 8b317c6d, authored Apr 10, 2024 by James Whedbee; committed by GitHub on Apr 10, 2024
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 3 deletions

vllm/attention/ops/triton_flash_attention.py +2 -3

No files found.
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -677,8 +677,7 @@ def check_args(
    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
    # TODO: Change assert if we support qkl f8 and v f16
    assert q.dtype == k.dtype and q.dtype == v.dtype
-    # TODO: Fix assert to check head size <=256 once supported
-    assert head_size <= 128
+    assert head_size <= 256
    assert o.shape == q.shape
    assert (nheads_q % nheads_k) == 0

@@ -729,7 +728,7 @@ class _attention(torch.autograd.Function):
            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))

        # Get closest power of 2 over or equal to 32.
-        unpadded_head_dims = {32, 64, 128}
+        unpadded_head_dims = {32, 64, 128, 256}
        if head_size not in unpadded_head_dims:
            padded_d_model = None
            for i in unpadded_head_dims: