Use deepgemm instead of triton for fused_qkv_a_proj_with_mqa (#6890)

35b65cf0 · fzyzcjy · GitHub · dd1012fc · 35b65cf0
Unverified commit 35b65cf0, authored Jun 06, 2025 by fzyzcjy; committed by GitHub on Jun 05, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

python/sglang/srt/layers/quantization/fp8_utils.py +2 -2

No files found.
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -227,8 +227,8 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
    output_dtype = input.dtype
    dtype_supported = output_dtype == torch.bfloat16

-    # TODO: add more robust shape check here
-    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0

    if not (shape_supported and dtype_supported):
        # fall back to triton