Add comments on accessing `kv_cache` and `attn_metadata` (#13887)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Add comments on accessing `kv_cache` and `attn_metadata` (#13887)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
0ecdd980 · Harry Mellor · GitHub · 7b700ec8 · 0ecdd980
Unverified commit 0ecdd980, authored Feb 26, 2025 by Harry Mellor; committed by GitHub on Feb 26, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 13 additions and 0 deletions

vllm/attention/layer.py vllm/attention/layer.py +13 -0

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -47,6 +47,10 @@ class Attention(nn.Module):
        attn_type: str = AttentionType.DECODER,
        **extra_impl_args,
    ) -> None:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+        """
        super().__init__()
        if per_layer_sliding_window is not None:
            # per-layer sliding window
@@ -155,6 +159,15 @@ class Attention(nn.Module):
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> torch.Tensor:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+        Attention metadata (`attn_metadata`) is set using a context manager in
+        the model runner's `execute_model` method. It is accessed via forward
+        context using
+        `vllm.forward_context.get_forward_context().attn_metadata`.
+        """
        if self.calculate_kv_scales:
            attn_metadata = get_forward_context().attn_metadata
            if attn_metadata.enable_kv_scales_calculation: