Unverified Commit 1cfbbc42 authored by Johnsonms, committed by GitHub
Browse files

[Bug] Fix NSA Backend KV-Buffer Shape Mismatch in DeepSeek-V3.2 (#12645)

parent 55dfb539
......@@ -1568,6 +1568,7 @@ class MLATokenToKVPool(KVCache):
class NSATokenToKVPool(MLATokenToKVPool):
quant_block_size = 128
index_k_with_scale_buffer_dtype = torch.uint8
rope_storage_dtype = torch.bfloat16 # rope is always stored in bf16
def __init__(
self,
......@@ -1589,10 +1590,11 @@ class NSATokenToKVPool(MLATokenToKVPool):
# Calculate override_kv_cache_dim for FP8 storage:
# kv_lora_rank + scale storage (kv_lora_rank // quant_block_size * 4 bytes) + rope dimension storage
# Note: rope dimension is stored in original dtype (bf16), not quantized to fp8
override_dim = (
kv_lora_rank
+ kv_lora_rank // self.quant_block_size * 4
+ qk_rope_head_dim * dtype.itemsize
+ qk_rope_head_dim * self.rope_storage_dtype.itemsize
)
super().__init__(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment