Unverified commit cbcbdb1c, authored by Konrad Zawora, committed by GitHub
Browse files

[Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version (#11028)


Signed-off-by: Konrad Zawora <kzawora@habana.ai>
parent a811dd66
...@@ -8,4 +8,4 @@ pandas ...@@ -8,4 +8,4 @@ pandas
tabulate tabulate
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f
...@@ -111,8 +111,16 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): ...@@ -111,8 +111,16 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
self.matmul_qk = Matmul() self.matmul_qk = Matmul()
self.softmax = Softmax() self.softmax = Softmax()
self.matmul_av = Matmul() self.matmul_av = Matmul()
self.batch2block_matmul = Matmul()
self.block2batch_matmul = Matmul()
# NOTE(kzawora): Contiguous PA is off until model runner supports it
self.k_cache = VLLMKVCache() self.k_cache = VLLMKVCache()
self.k_cache.use_contiguous_pa = False
self.v_cache = VLLMKVCache() self.v_cache = VLLMKVCache()
self.v_cache.use_contiguous_pa = False
# NOTE(kzawora): Pipelined PA is off until model runner supports it
ops.pa_impl = ops.pa
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.sliding_window = sliding_window self.sliding_window = sliding_window
self.alibi_slopes = alibi_slopes self.alibi_slopes = alibi_slopes
...@@ -228,9 +236,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): ...@@ -228,9 +236,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
block_mapping=attn_metadata.block_mapping, block_mapping=attn_metadata.block_mapping,
block_bias=attn_metadata.attn_bias, block_bias=attn_metadata.attn_bias,
block_scales=attn_metadata.block_scales, block_scales=attn_metadata.block_scales,
block_groups=None,
scale=self.scale, scale=self.scale,
matmul_qk_op=self.matmul_qk, matmul_qk_op=self.matmul_qk,
matmul_av_op=self.matmul_av, matmul_av_op=self.matmul_av,
batch2block_matmul_op=self.batch2block_matmul,
block2batch_matmul_op=self.block2batch_matmul,
keys_fetch_func=self.k_cache.fetch_from_cache, keys_fetch_func=self.k_cache.fetch_from_cache,
values_fetch_func=self.v_cache.fetch_from_cache) values_fetch_func=self.v_cache.fetch_from_cache)
# Reshape the output tensor. # Reshape the output tensor.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment