[GDN] add a config for gdn kernel selection (#36647)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>

[GDN] add a config for gdn kernel selection (#36647)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
697e4ff3 · Jiangyun Zhu · GitHub · a3e2e250 · 697e4ff3 · 697e4ff3
Unverified Commit 697e4ff3 authored Mar 16, 2026 by Jiangyun Zhu Committed by GitHub Mar 16, 2026
Show whitespace changes
Inline Side-by-side

Showing with 47 additions and 4 deletions

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +11 -0

vllm/model_executor/models/qwen3_next.py vllm/model_executor/models/qwen3_next.py +36 -4

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -614,6 +614,7 @@ class EngineArgs:
    )
    fail_on_environ_validation: bool = False
+    gdn_prefill_backend: Literal["flashinfer", "triton"] | None = None
    def __post_init__(self):
        # support `EngineArgs(compilation_config={...})`
@@ -1318,6 +1319,13 @@ class EngineArgs:
            help="Shutdown timeout in seconds. 0 = abort, >0 = wait.",
        )
+        parser.add_argument(
+            "--gdn-prefill-backend",
+            dest="gdn_prefill_backend",
+            choices=["flashinfer", "triton"],
+            default=None,
+            help="Select GDN prefill backend.",
+        )
        return parser
    @classmethod
@@ -1903,6 +1911,9 @@ class EngineArgs:
            ),
        )
+        if self.gdn_prefill_backend is not None:
+            self.additional_config["gdn_prefill_backend"] = self.gdn_prefill_backend
        config = VllmConfig(
            model_config=model_config,
            cache_config=cache_config,

--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -161,13 +161,45 @@ def fi_chunk_gated_delta_rule(
 class ChunkGatedDeltaRule(CustomOp):
    def __init__(self) -> None:
        super().__init__()
-        if current_platform.is_cuda() and current_platform.is_device_capability(90):
+        backend = (
+            str(
+                get_current_vllm_config().additional_config.get(
+                    "gdn_prefill_backend", "auto"
+                )
+            )
+            .strip()
+            .lower()
+        )
+        supports_flashinfer = (
+            current_platform.is_cuda() and current_platform.is_device_capability(90)
+        )
+        if backend == "flashinfer":
+            use_flashinfer = supports_flashinfer
+            if not use_flashinfer:
+                logger.warning_once(
+                    "GDN prefill backend 'flashinfer' is selected but "
+                    "cannot use this kernel on the current platform. "
+                    "Falling back to Triton/FLA."
+                )
+        elif backend == "triton":
+            use_flashinfer = False
+        else:
+            use_flashinfer = supports_flashinfer
+        if use_flashinfer:
+            logger.info_once("Using FlashInfer GDN prefill kernel")
            logger.info_once(
-                "Using FlashInfer GDN prefill kernel on CUDA compute capability 90"
+                "FlashInfer GDN prefill kernel is JIT-compiled; first run may "
+                "take a while to compile. Set `--gdn-prefill-backend triton` to "
+                "avoid JIT compile time."
            )
-            self._forward_method = self.forward_cuda
        else:
-            self._forward_method = self.forward_native
+            logger.info_once("Using Triton/FLA GDN prefill kernel")
+        self._forward_method = (
+            self.forward_cuda if use_flashinfer else self.forward_native
+        )
    def forward_cuda(
        self,