Unverified commit f1531d9f, authored by Stan Wozniak and committed by GitHub
Browse files

[Hybrid] Mamba2 prefix cache blocks freeing for running requests (#28047)


Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
parent 2d6001f4
...@@ -737,6 +737,14 @@ class MambaManager(SingleTypeKVCacheManager): ...@@ -737,6 +737,14 @@ class MambaManager(SingleTypeKVCacheManager):
) )
return super().allocate_new_blocks(request_id, num_tokens) return super().allocate_new_blocks(request_id, num_tokens)
def get_num_skipped_tokens(self, num_computed_tokens: int) -> int:
    """Return the number of tokens whose Mamba state is no longer needed.

    Mamba only needs to keep the state of the last computed token, so every
    computed token before it can be skipped.

    Args:
        num_computed_tokens: Number of tokens computed so far.

    Returns:
        ``num_computed_tokens - 1``. The result is not clamped, so an input
        of 0 yields -1.
        NOTE(review): callers presumably treat a non-positive result as
        "nothing to skip" -- confirm at the call site.
    """
    # All computed tokens except the most recent one are skippable.
    return num_computed_tokens - 1
class CrossAttentionManager(SingleTypeKVCacheManager): class CrossAttentionManager(SingleTypeKVCacheManager):
"""Manager for cross-attention KV cache in encoder-decoder models.""" """Manager for cross-attention KV cache in encoder-decoder models."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment