Unverified commit f39a0197, authored by Ying Sheng, committed by GitHub

Revert "kernel: use tensor cores for flashinfer gqa kernels" (#1511)

parent 3c93187c
@@ -86,17 +86,9 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
         self.model_runner = model_runner
 
-        local_num_qo_heads = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
-        local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-            model_runner.tp_size
-        )
-        if (
-            not _grouped_size_compiled_for_decode_kernels(
-                local_num_qo_heads, local_num_kv_heads
-            )
-            or local_num_qo_heads // local_num_kv_heads > 4
-        ):
+        if not _grouped_size_compiled_for_decode_kernels(
+            model_runner.model_config.num_attention_heads // model_runner.tp_size,
+            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
+        ):
             self.decode_use_tensor_cores = True
         else:
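For context, here is a minimal standalone sketch of the two heuristics this revert switches between. The stub for _grouped_size_compiled_for_decode_kernels is hypothetical (the real check lives in FlashInfer and takes the per-rank query-head and KV-head counts, as the diff shows), and the set of compiled group sizes in the stub is an assumption for illustration only.

def _grouped_size_compiled_for_decode_kernels(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Stub, NOT FlashInfer's implementation: assume grouped decode kernels
    # were pre-compiled for these query/KV head ratios (group sizes).
    return num_qo_heads // num_kv_heads in (1, 2, 4, 8)

def use_tensor_cores_after_revert(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Behavior restored by this commit: fall back to the tensor-core
    # (prefill-style) decode path only when no grouped decode kernel
    # was compiled for this head ratio.
    return not _grouped_size_compiled_for_decode_kernels(num_qo_heads, num_kv_heads)

def use_tensor_cores_before_revert(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Behavior removed by this commit: additionally prefer tensor cores
    # whenever the GQA group size exceeds 4, even if a grouped decode
    # kernel exists.
    return (
        not _grouped_size_compiled_for_decode_kernels(num_qo_heads, num_kv_heads)
        or num_qo_heads // num_kv_heads > 4
    )

if __name__ == "__main__":
    # Example: a GQA config with 64 query heads and 8 KV heads on one rank.
    print(use_tensor_cores_after_revert(64, 8))   # False: group size 8 is compiled (per the stub)
    print(use_tensor_cores_before_revert(64, 8))  # True: group size 8 > 4

The difference only shows up for models whose GQA group size exceeds 4: the reverted code routed those to the tensor-core path unconditionally, while the restored code keeps them on the compiled grouped decode kernels.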