SM120 / NVFP4: add device guard and runtime SM dispatch to cutlass_scaled_fp4_mm (#29711)

Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>

SM120 / NVFP4: add device guard and runtime SM dispatch to cutlass_scaled_fp4_mm (#29711)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
c0dfc894 · Hendrik Holtmann · GitHub · 44822d7f · c0dfc894
Unverified Commit c0dfc894 authored Dec 02, 2025 by Hendrik Holtmann Committed by GitHub Dec 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 13 deletions

csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu +26 -13

No files found.
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
@@ -15,6 +15,8 @@
 */
 #include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "cutlass_extensions/common.hpp"
 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
 void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
@@ -32,23 +34,34 @@ void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
                                  torch::Tensor const& alpha);
 #endif
-void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
+void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
-                           torch::Tensor const& B, torch::Tensor const& A_sf,
+                           const torch::Tensor& B, const torch::Tensor& A_sf,
-                           torch::Tensor const& B_sf,
+                           const torch::Tensor& B_sf,
-                           torch::Tensor const& alpha) {
+                           const torch::Tensor& alpha) {
-#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+  // Make sure we’re on A’s device.
-  return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+  const c10::cuda::OptionalCUDAGuard device_guard(device_of(A));
-#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+  const int32_t sm = get_sm_version_num();
-  return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (sm >= 100 && sm < 120) {
+    cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
+#endif
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (sm >= 120 && sm < 130) {
+    cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
+    return;
+  }
 #endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false,
-                              "No compiled nvfp4 mm kernel, vLLM should "
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel for SM ", sm,
-                              "be compiled using CUDA 12.8 and target "
+                              ". Recompile with CUDA >= 12.8 and CC >= 100.");
-                              "compute capability 100 or above.");
 }
 // Reports whether the FP4 scaled-mm path is usable: the supplied compute
 // capability must be at least 100 and the CUDA runtime version, as reported
 // by cudaRuntimeGetVersion, must be at least 12080 (presumably CUDA 12.8,
 // matching the "CUDA >= 12.8" hint in the dispatch error message).
 //
 // cuda_device_capability: compute capability number of the target device,
 //   compared directly against 100.
 // Returns true only when both conditions hold.
 bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
  // Start from zero so the comparison below reads a defined value even if
  // the runtime query does not populate the output; in that case the
  // function reports "unsupported". The call's status is not inspected.
  int runtimeVersion = 0;
  cudaRuntimeGetVersion(&runtimeVersion);
  return cuda_device_capability >= 100 && runtimeVersion >= 12080;
 }
\ No newline at end of file