Unverified Commit e678cc71 authored by Mahmoud Ashraf, committed by GitHub

[bugfix]: use correct cache location for cross attention in torch native backend (#8622)

parent 4efe844a
@@ -193,10 +193,13 @@ class TorchNativeAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)

+        if layer.is_cross_attention:
+            cache_loc = forward_batch.encoder_out_cache_loc
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
         if save_kv_cache:
-            forward_batch.token_to_kv_pool.set_kv_buffer(
-                layer, forward_batch.out_cache_loc, k, v
-            )
+            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)

         use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
...@@ -241,10 +244,13 @@ class TorchNativeAttnBackend(AttentionBackend): ...@@ -241,10 +244,13 @@ class TorchNativeAttnBackend(AttentionBackend):
else: else:
o = torch.empty_like(q) o = torch.empty_like(q)
if layer.is_cross_attention:
cache_loc = forward_batch.encoder_out_cache_loc
else:
cache_loc = forward_batch.out_cache_loc
if save_kv_cache: if save_kv_cache:
forward_batch.token_to_kv_pool.set_kv_buffer( forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
layer, forward_batch.out_cache_loc, k, v
)
use_gqa = layer.tp_q_head_num != layer.tp_k_head_num use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
......
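The fix hinges on one distinction: before this change the backend always wrote K/V to `forward_batch.out_cache_loc`, so cross-attention K/V computed from encoder tokens landed in the decoder's cache slots; now cross-attention layers write to `forward_batch.encoder_out_cache_loc` instead. The sketch below illustrates only that selection logic in isolation; it is not the actual SGLang backend code, and `ForwardBatchStub` and `select_cache_loc` are hypothetical names used for illustration.

```python
# Minimal sketch of the cache-location selection introduced by this commit.
# ForwardBatchStub is a hypothetical stand-in for the real ForwardBatch.
from dataclasses import dataclass

import torch


@dataclass
class ForwardBatchStub:
    # KV-pool slots for the decoder tokens in this batch (self-attention K/V).
    out_cache_loc: torch.Tensor
    # KV-pool slots for the encoder tokens (cross-attention K/V).
    encoder_out_cache_loc: torch.Tensor


def select_cache_loc(is_cross_attention: bool, fb: ForwardBatchStub) -> torch.Tensor:
    # Cross-attention caches encoder-side K/V; self-attention caches decoder-side K/V.
    return fb.encoder_out_cache_loc if is_cross_attention else fb.out_cache_loc


if __name__ == "__main__":
    fb = ForwardBatchStub(
        out_cache_loc=torch.tensor([0, 1, 2]),
        encoder_out_cache_loc=torch.tensor([100, 101, 102]),
    )
    print(select_cache_loc(True, fb))   # tensor([100, 101, 102])
    print(select_cache_loc(False, fb))  # tensor([0, 1, 2])
```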