Unverified commit e5e076ca, authored by Varun Sundar Rabindranath, committed by GitHub
Browse files

[BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762)


Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
parent eebf00cb
...@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING ...@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING
import torch import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import CUDAGraphMode, VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: ...@@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
Record known issues with vllm + flashinfer autotune here. Return True if Record known issues with vllm + flashinfer autotune here. Return True if
and only if flashinfer autotune will run through without issues. and only if flashinfer autotune will run through without issues.
""" """
return not ( is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
vllm_config.parallel_config.data_parallel_size > 1 vllm_config.parallel_config.tensor_parallel_size > 1
and (
envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
)
) )
is_fi_mxfp4_backend = (
envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
) or (
current_platform.is_cuda() and current_platform.is_device_capability(100)
) # on >=sm100, default mxfp4 backend is flashinfer
is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)
def kernel_warmup(worker: "Worker"): def kernel_warmup(worker: "Worker"):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment