[Bugfix] Only use triton_kernels for MXFP4 on SM90 and SM100 (#29339)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Bugfix] Only use triton_kernels for MXFP4 on SM90 and SM100 (#29339)
Signed-off-by: mgoin <mgoin64@gmail.com>
c17610e2 · Michael Goin · GitHub · 71df2a57 · c17610e2
Unverified commit c17610e2, authored Nov 24, 2025 by Michael Goin; committed by GitHub on Nov 24, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 9 additions and 6 deletions

vllm/model_executor/layers/quantization/mxfp4.py vllm/model_executor/layers/quantization/mxfp4.py +9 -6

No files found.
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -132,12 +132,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
            )

        # If FlashInfer is not available, try either Marlin or Triton
-        if (
-            envs.VLLM_MXFP4_USE_MARLIN
-            or current_platform.get_device_capability()[0] < 9
-            or not has_triton_kernels()
-            or not is_torch_equal_or_newer("2.8.0")
-        ):
+        triton_kernels_supported = (
+            has_triton_kernels()
+            and is_torch_equal_or_newer("2.8.0")
+            # NOTE: triton_kernels are only confirmed to work on SM90 and SM100
+            # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
+            # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
+            and (9, 0) <= current_platform.get_device_capability() < (11, 0)
+        )
+        if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
            logger.info_once("Using Marlin backend")
            return Mxfp4Backend.MARLIN
        else: