Unverified commit 0ecdd980, authored by Harry Mellor and committed by GitHub
Browse files

Add comments on accessing `kv_cache` and `attn_metadata` (#13887)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 7b700ec8
...@@ -47,6 +47,10 @@ class Attention(nn.Module): ...@@ -47,6 +47,10 @@ class Attention(nn.Module):
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
**extra_impl_args, **extra_impl_args,
) -> None: ) -> None:
"""
The KV cache is stored inside this class and is accessed via
`self.kv_cache`.
"""
super().__init__() super().__init__()
if per_layer_sliding_window is not None: if per_layer_sliding_window is not None:
# per-layer sliding window # per-layer sliding window
...@@ -155,6 +159,15 @@ class Attention(nn.Module): ...@@ -155,6 +159,15 @@ class Attention(nn.Module):
key: torch.Tensor, key: torch.Tensor,
value: torch.Tensor, value: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
"""
The KV cache is stored inside this class and is accessed via
`self.kv_cache`.
Attention metadata (`attn_metadata`) is set using a context manager in
the model runner's `execute_model` method. It is accessed via forward
context using
`vllm.forward_context.get_forward_context().attn_metadata`.
"""
if self.calculate_kv_scales: if self.calculate_kv_scales:
attn_metadata = get_forward_context().attn_metadata attn_metadata = get_forward_context().attn_metadata
if attn_metadata.enable_kv_scales_calculation: if attn_metadata.enable_kv_scales_calculation:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment