[BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>

[BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
e5e076ca · Varun Sundar Rabindranath · GitHub · eebf00cb · e5e076ca
Unverified Commit e5e076ca authored Oct 30, 2025 by Varun Sundar Rabindranath Committed by GitHub Oct 30, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 7 deletions

vllm/model_executor/warmup/kernel_warmup.py vllm/model_executor/warmup/kernel_warmup.py +13 -7

No files found.
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING
 import torch
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
    Record known issues with vllm + flashinfer autotune here. Return True if
    and only if flashinfer autotune will run through without issues.
    """
-    return not (
+    is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
-        vllm_config.parallel_config.data_parallel_size > 1
+        vllm_config.parallel_config.tensor_parallel_size > 1
-        and (
-            envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-            or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        )
    )
+    is_fi_mxfp4_backend = (
+        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
+        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
+    ) or (
+        current_platform.is_cuda() and current_platform.is_device_capability(100)
+    )  # on >=sm100, default mxfp4 backend is flashinfer
+    is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)
 def kernel_warmup(worker: "Worker"):