Unverified Commit a12061df authored by Ben Barsdell, committed by GitHub

Fix cuda graph mode in flashinfer attn backend (#10056)

parent 85ed8e0a
@@ -501,8 +501,9 @@ class FlashInferAttnBackend(AttentionBackend):
                 sm_scale=layer.scaling,
                 window_left=layer.sliding_window_size,
                 logits_soft_cap=logits_soft_cap,
-                k_scale=layer.k_scale,
-                v_scale=layer.v_scale,
+                # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+                k_scale=layer.k_scale_float,
+                v_scale=layer.v_scale_float,
             )
         else:
             causal = True
@@ -580,8 +581,9 @@ class FlashInferAttnBackend(AttentionBackend):
                 forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
                 sm_scale=layer.scaling,
                 logits_soft_cap=layer.logit_cap,
-                k_scale=layer.k_scale,
-                v_scale=layer.v_scale,
+                # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+                k_scale=layer.k_scale_float,
+                v_scale=layer.v_scale_float,
             )
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
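A minimal sketch of the pattern this patch relies on, using a hypothetical AttentionLayer class (not the actual sglang layer code): reading a 0-dim GPU tensor with .item(), or passing it where the kernel wrapper converts it to a Python number, forces a device-to-host copy and a stream synchronization, which is not allowed while a CUDA graph is being captured. Caching the value as a Python float once, before capture, avoids that sync on every forward call.

```python
import torch


class AttentionLayer(torch.nn.Module):
    """Hypothetical sketch of caching scale tensors as Python floats."""

    def __init__(self):
        super().__init__()
        # 0-dim scale tensors living on the GPU (e.g. loaded from a quantized checkpoint).
        self.k_scale = torch.tensor(1.0, device="cuda")
        self.v_scale = torch.tensor(1.0, device="cuda")
        # Host-side copies computed eagerly, outside any CUDA graph capture.
        # .item() performs the device-to-host copy here, exactly once.
        self.k_scale_float = self.k_scale.item()
        self.v_scale_float = self.v_scale.item()
```

With this layout, the attention call made during graph capture can receive plain Python floats (layer.k_scale_float / layer.v_scale_float) instead of GPU tensors that would trigger an implicit .item() synchronization inside the capture.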