Adjust flashinfer workspace size for Qwen2 models (#2879)

c19d8482 · Ke Bao · GitHub · 80002562 · c19d8482
Unverified commit c19d8482, authored Jan 14, 2025 by Ke Bao; committed by GitHub on Jan 14, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 0 deletions

python/sglang/srt/layers/attention/flashinfer_backend.py +4 -0

No files found.
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -84,6 +84,10 @@ class FlashInferAttnBackend(AttentionBackend):
            self.num_wrappers = 1
            self.dispatch_reason = None
+        # Qwen2 models require higher flashinfer workspace size
+        if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
+            global_config.flashinfer_workspace_size = 512 * 1024 * 1024
        # Allocate buffers
        self.workspace_buffer = torch.empty(
            global_config.flashinfer_workspace_size,