Unverified commit f39a0197, authored by Ying Sheng, committed by GitHub

Revert "kernel: use tensor cores for flashinfer gqa kernels" (#1511)

parent 3c93187c
@@ -86,17 +86,9 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
         self.model_runner = model_runner
 
-        local_num_qo_heads = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
-        local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-            model_runner.tp_size
-        )
-        if (
-            not _grouped_size_compiled_for_decode_kernels(
-                local_num_qo_heads, local_num_kv_heads
-            )
-            or local_num_qo_heads // local_num_kv_heads > 4
-        ):
+        if not _grouped_size_compiled_for_decode_kernels(
+            model_runner.model_config.num_attention_heads // model_runner.tp_size,
+            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
+        ):
             self.decode_use_tensor_cores = True
         else:
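For context, here is a minimal standalone sketch of the two heuristics this revert switches between. The stub for _grouped_size_compiled_for_decode_kernels is hypothetical (the real check lives in FlashInfer and takes the per-rank query-head and KV-head counts, as the diff shows), and the set of compiled group sizes in the stub is an assumption for illustration only.

def _grouped_size_compiled_for_decode_kernels(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Stub, NOT FlashInfer's implementation: assume grouped decode kernels
    # were pre-compiled for these query/KV head ratios (group sizes).
    return num_qo_heads // num_kv_heads in (1, 2, 4, 8)

def use_tensor_cores_after_revert(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Behavior restored by this commit: fall back to the tensor-core
    # (prefill-style) decode path only when no grouped decode kernel
    # was compiled for this head ratio.
    return not _grouped_size_compiled_for_decode_kernels(num_qo_heads, num_kv_heads)

def use_tensor_cores_before_revert(num_qo_heads: int, num_kv_heads: int) -> bool:
    # Behavior removed by this commit: additionally prefer tensor cores
    # whenever the GQA group size exceeds 4, even if a grouped decode
    # kernel exists.
    return (
        not _grouped_size_compiled_for_decode_kernels(num_qo_heads, num_kv_heads)
        or num_qo_heads // num_kv_heads > 4
    )

if __name__ == "__main__":
    # Example: a GQA config with 64 query heads and 8 KV heads on one rank.
    print(use_tensor_cores_after_revert(64, 8))   # False: group size 8 is compiled (per the stub)
    print(use_tensor_cores_before_revert(64, 8))  # True: group size 8 > 4

The difference only shows up for models whose GQA group size exceeds 4: the reverted code routed those to the tensor-core path unconditionally, while the restored code keeps them on the compiled grouped decode kernels.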