[Bugfix] Fix incorrect dispatch for CutlassBlockScaledGroupedGemm and DeepGEMM (#20933)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Bugfix] Fix incorrect dispatch for CutlassBlockScaledGroupedGemm and DeepGEMM (#20933)
Signed-off-by: mgoin <mgoin64@gmail.com>
bcdfb2a3 · Michael Goin · GitHub · ba8c3000 · bcdfb2a3
Unverified commit bcdfb2a3, authored Jul 15, 2025 by Michael Goin and committed by GitHub on Jul 15, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 10 additions and 5 deletions

vllm/model_executor/layers/quantization/fp8.py vllm/model_executor/layers/quantization/fp8.py +10 -5

No files found.
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -488,11 +488,16 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                logger.warning_once("Failed to import DeepGemm kernels.")
            elif not self.block_quant:
                logger.warning_once("Model is not block quantized. Not using "
-                                    " DeepGemm kernels")
+                                    "DeepGemm kernels")
            elif (current_platform.is_cuda()
-                  and current_platform.has_device_capability(90)):
+                  and current_platform.is_device_capability(90)):
                logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.")
                self.allow_deep_gemm = True
+            elif (current_platform.is_cuda()
+                  and is_blackwell_deep_gemm_used()):
+                logger.info_once("Using DeepGemm SM100 kernels for "
+                                 "Fp8MoEMethod.")
+                self.allow_deep_gemm = True
            else:
                logger.warning_once(
                    "DeepGemm not supported on the current platform.")
@@ -500,10 +505,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        # Check for CutlassBlockScaledGroupedGemm support.
        self.allow_cutlass_block_scaled_grouped_gemm = False
        if not self.block_quant:
-            logger.warning_once("Model is not block quantized. Not using "
-                                "CutlassBlockScaledGroupedGemm kernels")
+            logger.debug_once("Model is not block quantized. Not using "
+                              "CutlassBlockScaledGroupedGemm kernels")
        elif (current_platform.is_cuda()
-              and current_platform.has_device_capability(100)):
+              and current_platform.is_device_capability(100)):
            logger.info_once(
                "Using CutlassBlockScaledGroupedGemm kernels for Fp8MoEMethod."
            )