remove _grouped_size_compiled_for_decode_kernels (#3453)

27c4c9cf · Yineng Zhang · GitHub · 52a492a1 · 27c4c9cf · 27c4c9cf
Unverified commit 27c4c9cf, authored Feb 10, 2025 by Yineng Zhang; committed by GitHub on Feb 10, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 1 addition and 16 deletions

docs/start/install.md +1 -1

python/sglang/srt/layers/attention/flashinfer_backend.py +0 -15

No files found.
--- a/docs/start/install.md
+++ b/docs/start/install.md
@@ -22,7 +22,7 @@ pip install sgl-kernel --force-reinstall --no-deps
 pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/
 ```

-Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. If you meet with issue like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing FlashInfer with some older version like 0.1.6 instead of the latest version could solve it.
+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.

 Note: To AMD ROCm system with Instinct/MI GPUs, do following instead:


--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -1077,21 +1077,6 @@ def should_use_tensor_core(
    if env_override is not None:
        return env_override.lower() == "true"

-    # Try to use _grouped_size_compiled_for_decode_kernels if available
-    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
-    try:
-        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-
-        if not _grouped_size_compiled_for_decode_kernels(
-            num_attention_heads,
-            num_kv_heads,
-        ):
-            return True
-        else:
-            return False
-    except (ImportError, AttributeError):
-        pass
-
    # Calculate GQA group size
    gqa_group_size = num_attention_heads // num_kv_heads