[Fix] Fix accuracy bug in CSGMV kernel caching key. (#11579)

780fbf2f · Lifu Huang · GitHub · 825432fc · 780fbf2f · 780fbf2f
Unverified commit 780fbf2f, authored Oct 14, 2025 by Lifu Huang; committed by GitHub on Oct 14, 2025.
2 changed files
--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
@@ -9,7 +9,7 @@ from sglang.srt.utils import cached_triton_kernel
 @cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
-@triton.jit
+@triton.jit(do_not_specialize=["num_segs"])
 def _chunked_lora_expand_kernel(
    # Pointers to matrices
    x,

--- a/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
+++ b/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
@@ -6,8 +6,10 @@ from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.utils import cached_triton_kernel
-@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
+@cached_triton_kernel(
-@triton.jit
+    lambda _, kwargs: (kwargs["K"], kwargs["NUM_SLICES"], kwargs["BLOCK_M"])
+)
+@triton.jit(do_not_specialize=["num_segs"])
 def _chunked_lora_shrink_kernel(
    # Pointers to matrices
    x,