[Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version (#11028)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>

[Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version (#11028)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
cbcbdb1c · Konrad Zawora · GitHub · a811dd66 · cbcbdb1c · cbcbdb1c
Unverified commit cbcbdb1c, authored Dec 09, 2024 by Konrad Zawora; committed by GitHub on Dec 09, 2024
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 12 additions and 1 deletion

requirements-hpu.txt +1 -1

vllm/attention/backends/hpu_attn.py +11 -0

No files found.
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -111,8 +111,16 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
        self.matmul_qk = Matmul()
        self.softmax = Softmax()
        self.matmul_av = Matmul()
+        self.batch2block_matmul = Matmul()
+        self.block2batch_matmul = Matmul()
+        # NOTE(kzawora): Contiguous PA is off until model runner supports it
        self.k_cache = VLLMKVCache()
+        self.k_cache.use_contiguous_pa = False
        self.v_cache = VLLMKVCache()
+        self.v_cache.use_contiguous_pa = False
+        # NOTE(kzawora): Pipelined PA is off until model runner supports it
+        ops.pa_impl = ops.pa
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.sliding_window = sliding_window
        self.alibi_slopes = alibi_slopes
@@ -228,9 +236,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
                block_mapping=attn_metadata.block_mapping,
                block_bias=attn_metadata.attn_bias,
                block_scales=attn_metadata.block_scales,
+                block_groups=None,
                scale=self.scale,
                matmul_qk_op=self.matmul_qk,
                matmul_av_op=self.matmul_av,
+                batch2block_matmul_op=self.batch2block_matmul,
+                block2batch_matmul_op=self.block2batch_matmul,
                keys_fetch_func=self.k_cache.fetch_from_cache,
                values_fetch_func=self.v_cache.fetch_from_cache)
        # Reshape the output tensor.