Unverified Commit 7f8fcd39 authored by Jerry Zhang, committed by GitHub

Turn off autotune for scaled mm for fp8 dynamic quant in torchao (#2116)

parent 5c6a41fa
@@ -401,6 +401,10 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+
+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
 
     @torch.no_grad()
     def forward(
         self,
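
Below is a minimal standalone sketch (not part of the commit) of what the flag above does: restricting inductor's max-autotune GEMM candidates to the ATen backend so `torch.compile` does not benchmark Triton GEMM templates, which is where the extra compile time came from for the fp8 dynamic-quant path. The `TinyMLP` model, tensor shapes, and compile mode are illustrative only; in the actual PR the setting is applied inside `TorchNativeLlamaForCausalLM.__init__` before the model is quantized with torchao and compiled.

```python
import torch
import torch.nn as nn

# Restrict max-autotune GEMM candidates to ATen kernels only, skipping
# Triton template autotuning (the source of the long compile times).
torch._inductor.config.max_autotune_gemm_backends = "ATEN"


class TinyMLP(nn.Module):
    """Illustrative stand-in for a GEMM-heavy model."""

    def __init__(self, dim: int = 4096):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim, bias=False)
        self.fc2 = nn.Linear(dim, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.relu(self.fc1(x)))


if torch.cuda.is_available():
    model = TinyMLP().half().cuda()
    x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")

    # With the flag set, "max-autotune" still runs but only considers
    # ATen GEMMs, keeping compile time down at the cost of skipping
    # potentially faster generated kernels.
    compiled = torch.compile(model, mode="max-autotune")
    with torch.no_grad():
        out = compiled(x)
```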