[NVIDIA] Bugfix NVFP4 DGX Spark and RTX50 (#38423)

Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnynuca14@gmail.com>

[NVIDIA] Bugfix NVFP4 DGX Spark and RTX50 (#38423)
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnynuca14@gmail.com>
b4a2f3ac · Johnny · GitHub · 8e6293e8 · b4a2f3ac · b4a2f3ac
Unverified commit b4a2f3ac · authored Mar 30, 2026 by Johnny · committed by GitHub Mar 30, 2026
15 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1")
+  set(CUTLASS_REVISION "v4.4.2")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -16,6 +16,7 @@
 #include <torch/all.h>
+#include "cutlass_extensions/common.hpp"
 #include "nvfp4_utils.cuh"
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -53,12 +54,27 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
    torch::Tensor const& output_scale_offset_by_experts);
 #endif
+static bool nvfp4_quant_sm_supported() {
+  const int32_t sm = get_sm_version_num();
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (sm >= 100 && sm < 120) return true;
+#endif
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (sm >= 120 && sm < 130) return true;
+#endif
+  return false;
+}
 void scaled_fp4_quant_out(torch::Tensor const& input,
                          torch::Tensor const& input_sf,
                          bool is_sf_swizzled_layout, torch::Tensor& output,
                          torch::Tensor& output_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled nvfp4 quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
                                 is_sf_swizzled_layout);
 #endif
@@ -100,6 +116,10 @@ void scaled_fp4_experts_quant(
    torch::Tensor const& output_scale_offset_by_experts) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled nvfp4 experts quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
  return scaled_fp4_experts_quant_sm1xxa(
      output, output_scale, input, input_global_scale, input_offset_by_experts,
      output_scale_offset_by_experts);
@@ -112,6 +132,10 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
                              torch::Tensor& input, torch::Tensor& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled silu_and_mul nvfp4 quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
  return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(
@@ -125,6 +149,11 @@ void silu_and_mul_scaled_fp4_experts_quant(
    torch::Tensor const& output_scale_offset_by_experts) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled silu_and_mul nvfp4 experts quantization kernel "
+              "for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
  return silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
      output, output_scale, input, input_global_scale, input_offset_by_experts,
      output_scale_offset_by_experts);

--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
@@ -63,5 +63,17 @@ void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
 bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
  int runtimeVersion;
  cudaRuntimeGetVersion(&runtimeVersion);
-  return cuda_device_capability >= 100 && runtimeVersion >= 12080;
+  if (runtimeVersion < 12080) return false;
+  // Only report support when the SM-specific kernel was actually compiled in,
+  // so the Python-side backend selector does not choose CUTLASS and then hit
+  // TORCH_CHECK_NOT_IMPLEMENTED (or worse, fall through to Marlin).
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (cuda_device_capability >= 100 && cuda_device_capability < 120)
+    return true;
+#endif
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (cuda_device_capability >= 120 && cuda_device_capability < 130)
+    return true;
+#endif
+  return false;
 }
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -154,6 +154,7 @@ struct MacheteCollectiveMma {
  struct DispatchPolicy {
    constexpr static int Stages = PipelineStages;
    using ClusterShape = ClusterShape_MNK;
+    using ArchTag = arch::Sm90;
    using Schedule = KernelScheduleType;
  };

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -590,7 +590,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install FlashInfer JIT cache (requires CUDA-version-specific index URL)
 # https://docs.flashinfer.ai/installation.html
 # From versions.json: .flashinfer.version
-ARG FLASHINFER_VERSION=0.6.6
+# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798)
+# TODO: bump to 0.6.8 when released for NVFP4/MXFP4 group GEMMs on
+#   SM120/SM121 (RTX 50 / DGX Spark) via flashinfer#2738
+ARG FLASHINFER_VERSION=0.6.7
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \

--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -217,13 +217,16 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.6.6
+# release version: v0.6.7
+# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798)
+# TODO: bump to 0.6.8 when released for NVFP4/MXFP4 group GEMMs on
+#   SM120/SM121 (RTX 50 / DGX Spark) via flashinfer#2738
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo "git clone flashinfer..." \
-    && git clone --depth 1 --branch v0.6.6 --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.7 --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \

--- a/docker/versions.json
+++ b/docker/versions.json
@@ -68,7 +68,7 @@
      "default": "true"
    },
    "FLASHINFER_VERSION": {
-      "default": "0.6.6"
+      "default": "0.6.7"
    },
    "GDRCOPY_CUDA_VERSION": {
      "default": "12.8"

--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,8 +9,8 @@ torchaudio==2.10.0
 # These must be updated alongside torch
 torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.6.6
+flashinfer-python==0.6.7
-flashinfer-cubin==0.6.6
+flashinfer-cubin==0.6.7
 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
 # breaking changes in 1.19.0
 nvidia-cudnn-frontend>=1.13.0,<1.19.0

--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -57,7 +57,6 @@ def test_select_default_backend_by_platform(
        moe_config = make_dummy_moe_config()
        selected_backend = select_unquantized_moe_backend(
            moe_config=moe_config,
-            use_ep=False,
            use_dp=False,
        )
@@ -90,7 +89,6 @@ def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer):
        moe_config = make_dummy_moe_config()
        selected_backend = select_unquantized_moe_backend(
            moe_config=moe_config,
-            use_ep=False,
            use_dp=False,
        )
@@ -129,7 +127,6 @@ def test_select_cuda_flashinfer_trtllm_backend(
        selected_backend = select_unquantized_moe_backend(
            moe_config=moe_config,
-            use_ep=True,
            use_dp=False,
        )
@@ -171,7 +168,6 @@ def test_select_cuda_flashinfer_cutlass_backend(
        selected_backend = select_unquantized_moe_backend(
            moe_config=moe_config,
-            use_ep=True,  # CUTLASS requires EP
            use_dp=False,  # CUTLASS doesn't support DP
        )

--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -406,6 +406,11 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
        if self.routing_method_type == RoutingMethodType.DeepSeekV3:
            router_logits = router_logits.to(torch.float32)
+        # Currently FI requires bfloat16 routing bias.
+        # https://github.com/flashinfer-ai/flashinfer/issues/2909
+        if e_score_correction_bias is not None:
+            e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16)
        out = flashinfer.fused_moe.trtllm_fp8_per_tensor_scale_moe(
            routing_logits=router_logits,
            routing_bias=e_score_correction_bias,

--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -5,6 +5,7 @@ import flashinfer
 import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
@@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
+logger = init_logger(__name__)
 class TrtLlmNvFp4ExpertsBase:
    """
@@ -315,6 +318,11 @@ class TrtLlmNvFp4ExpertsMonolithic(
            else router_logits
        )
+        # Currently FI requires bfloat16 routing bias.
+        # https://github.com/flashinfer-ai/flashinfer/issues/2909
+        if e_score_correction_bias is not None:
+            e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16)
        # Invoke kernel.
        return flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
            routing_logits=router_logits,

--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -361,7 +361,7 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
            fc1_expert_weights = w1
            fc2_expert_weights = w2
        else:
-            quant_scales = None
+            quant_scales = []
            a1q_scale = None
            fc1_expert_weights = w1
            fc2_expert_weights = w2

--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -70,7 +70,6 @@ def map_unquantized_backend(runner_backend: MoEBackend) -> UnquantizedMoeBackend
 def select_unquantized_moe_backend(
    moe_config: FusedMoEConfig,
-    use_ep: bool,
    use_dp: bool,
 ) -> UnquantizedMoeBackend:
    """
@@ -96,7 +95,6 @@ def select_unquantized_moe_backend(
    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
    flashinfer_cutlass_available = (
        has_flashinfer_cutlass_fused_moe()
-        and use_ep
        and (not use_dp)
        and current_platform.has_device_capability(90)
    )
@@ -161,9 +159,9 @@ def select_unquantized_moe_backend(
                    "to enable it for better performance.",
                    scope="local",
                )
-            elif use_ep and (not use_dp):
+            elif not use_dp and flashinfer_cutlass_available:
                logger.info_once(
-                    "FlashInfer MoE is available for EP"
+                    "FlashInfer CUTLASS MoE is available"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",

--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -61,7 +61,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        super().__init__(moe)
        self.unquantized_backend = select_unquantized_moe_backend(
            moe_config=self.moe,
-            use_ep=self.moe.moe_parallel_config.use_ep,
            use_dp=self.moe.moe_parallel_config.dp_size > 1,
        )

--- a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
@@ -55,8 +55,16 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend:
    elif envs.VLLM_USE_NVFP4_CT_EMULATIONS:
        backend = NvFp4LinearBackend.EMULATION
    elif envs.VLLM_NVFP4_GEMM_BACKEND is None:
-        # Auto-select best available backend
+        # Auto-select best available backend.
-        if current_platform.has_device_capability(100) and has_flashinfer():
+        # cutlass_fp4_supported() checks that the vLLM NVFP4 kernels (both
+        # quantization and GEMM) were compiled for the current SM version.
+        # FlashInfer backends still rely on the vLLM quantization kernels,
+        # so we gate them on the same check.
+        if (
+            cutlass_fp4_supported()
+            and current_platform.has_device_capability(100)
+            and has_flashinfer()
+        ):
            backend = NvFp4LinearBackend.FLASHINFER_CUTLASS
        elif cutlass_fp4_supported():
            backend = NvFp4LinearBackend.VLLM_CUTLASS
@@ -72,6 +80,10 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend:
        NvFp4LinearBackend.FLASHINFER_CUDNN,
    ):
        assert has_flashinfer(), f"FlashInfer is required for {backend}"
+        assert cutlass_fp4_supported(), (
+            f"{backend} requires vLLM NVFP4 quantization kernels compiled "
+            f"for the current GPU (SM {current_platform.get_device_capability()})"
+        )
    elif backend == NvFp4LinearBackend.VLLM_CUTLASS:
        assert cutlass_fp4_supported(), f"Cutlass is required for {backend}"
    elif backend == NvFp4LinearBackend.MARLIN: