Support Tensorrt-LLM MoE fp4 for low-latency (#21331)

Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Po-Han Huang <pohanh@nvidia.com> Signed-off-by: Shu Wang. <shuw@nvidia.com> Signed-off-by: XIn Li <xinli@nvidia.com> Co-authored-by: XIn Li <xinli@nvidia.com>

Support Tensorrt-LLM MoE fp4 for low-latency (#21331)
Signed-off-by: Shu Wang <shuw@nvidia.com> Signed-off-by: Po-Han Huang <pohanh@nvidia.com> Signed-off-by: Shu Wang. <shuw@nvidia.com> Signed-off-by: XIn Li <xinli@nvidia.com> Co-authored-by: XIn Li <xinli@nvidia.com>
a3b9c17b · Shu Wang · GitHub · d57dc236 · a3b9c17b · a3b9c17b
Unverified Commit a3b9c17b authored Aug 07, 2025 by Shu Wang Committed by GitHub Aug 07, 2025
7 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -129,6 +129,7 @@ if TYPE_CHECKING:
    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
+    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
    VLLM_XGRAMMAR_CACHE_MB: int = 0
    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
    VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -982,6 +983,20 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ALL2ALL_BACKEND":
    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
+    # FlashInfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
+    # options below require compute capability 10.0 or above.
+    # Available options:
+    # - "throughput":  [default]
+    #     Uses CUTLASS kernels optimized for high-throughput batch inference.
+    # - "latency":
+    #     Uses TensorRT-LLM kernels optimized for low-latency inference.
+    # To select a backend, set the environment variable, e.g.:
+    #     export VLLM_FLASHINFER_MOE_BACKEND=latency
+    # If not set, defaults to "throughput".
+    "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
+    "VLLM_FLASHINFER_MOE_BACKEND", "throughput"
+    ),
    # Control the maximum number of tokens per expert supported by the
    # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
    # the blockscale tensor of activations NVFP4 Quantization.

--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -192,7 +192,8 @@ class FusedMoEParallelConfig:
    @property
    def use_flashinfer_cutlass_kernels(self):
        return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-                and has_flashinfer_cutlass_fused_moe())
+                and has_flashinfer_cutlass_fused_moe()
+                and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
    @staticmethod
    def make(tp_size_: int, dp_size_: int,

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -105,7 +105,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
            detect_nvfp4_moe_support)
        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
-        self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
+        self.allow_flashinfer = _nvfp4.allow_flashinfer
        self.use_marlin = _nvfp4.use_marlin
        self.group_size = 16
        self.fused_experts = None  # type: ignore[assignment]
@@ -212,7 +212,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                                             requires_grad=False)
        # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
-        if self.allow_flashinfer_cutlass:
+        if self.allow_flashinfer:
            w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
                                        layer.w13_weight_scale.data,
                                        dim=-2)
@@ -266,7 +266,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
            (layer.w2_input_global_scale), requires_grad=False)
    def maybe_swap_experts_impl(self, moe_parallel_config):
-        if not self.allow_flashinfer_cutlass:
+        if not self.allow_flashinfer:
            return
        self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
            moe_parallel_config)
@@ -277,8 +277,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
        from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (  # noqa: E501
            select_nvfp4_gemm_impl)
-        return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
+        return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger)
-                                      logger)
    def apply(
        self,

--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -126,7 +126,7 @@ def flashinfer_fp4_cutlass_moe_forward(
 def select_nvfp4_gemm_impl(
-        allow_flashinfer_cutlass: bool,
+        allow_flashinfer: bool,
        moe,  # FusedMoEConfig
        logger):
    """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers"""
@@ -137,8 +137,14 @@ def select_nvfp4_gemm_impl(
    all2all_manager = get_ep_group().device_communicator.all2all_manager
    assert all2all_manager is not None
-    if allow_flashinfer_cutlass:
+    if allow_flashinfer:
-        logger.debug_once("Using FlashInferExperts")
+        flashinfer_backend = envs.VLLM_FLASHINFER_MOE_BACKEND
+        if flashinfer_backend != "throughput":
+            raise ValueError(
+                f"Only throughput backend is supported for FlashInferExperts, "
+                f"but got {flashinfer_backend}.")
+        logger.debug_once(
+            "Initializing FlashInferExperts with throughput backend.")
        return FlashInferExperts(
            use_nvfp4_w4a4=True,
            use_dp=moe.moe_parallel_config.dp_size > 1,

--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -21,7 +21,7 @@ class NvFp4Support:
    """Result container for NV-FP4 capability probing."""
    cutlass_supported: bool
-    allow_flashinfer_cutlass: bool
+    allow_flashinfer: bool
    use_marlin: bool
@@ -54,6 +54,6 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
    return NvFp4Support(
        cutlass_supported=cutlass_supported,
-        allow_flashinfer_cutlass=allow_flashinfer,
+        allow_flashinfer=allow_flashinfer,
        use_marlin=use_marlin,
    )
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -86,6 +86,8 @@ flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe",
 fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
    "flashinfer", "nvfp4_block_scale_interleave")
+trtllm_fp4_block_scale_moe = _lazy_import_wrapper(
+    "flashinfer", "trtllm_fp4_block_scale_moe")
 # Special case for autotune since it returns a context manager
 autotune = _lazy_import_wrapper(
@@ -112,6 +114,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
        ("flashinfer.fused_moe", "cutlass_fused_moe"),
        ("flashinfer", "fp4_quantize"),
        ("flashinfer", "nvfp4_block_scale_interleave"),
+        ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"),
    ]
    for module_name, attr_name in required_functions:
@@ -188,6 +191,7 @@ __all__ = [
    "flashinfer_cutlass_fused_moe",
    "fp4_quantize",
    "nvfp4_block_scale_interleave",
+    "trtllm_fp4_block_scale_moe",
    "autotune",
    "has_flashinfer_moe",
    "has_flashinfer_cutlass_fused_moe",