[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)

42135d68 · Robert Shaw · GitHub · e14467be · 42135d68 · 42135d68
Unverified commit 42135d68, authored Jan 21, 2026 by Robert Shaw; committed by GitHub on Jan 21, 2026.
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -634,6 +634,46 @@ steps:
    - pip install helion
    - pytest -v -s kernels/helion/
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # Disabled (failing on main): pytest -v -s kernels/moe/test_block_fp8.py
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # Disabled (failing on main): pytest -v -s kernels/moe/test_pplx_moe.py
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true

--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
 import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
@@ -138,12 +139,13 @@ def bench_run(
    fn = mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        CutlassExpertsFp8(
-            out_dtype=a.dtype,
+            moe_config=make_dummy_moe_config(
-            e=num_experts,
+                num_experts=num_experts,
-            n=n,
+                hidden_dim=k,
-            k=k,
+                intermediate_size_per_partition=n,
+                in_dtype=a.dtype,
+            ),
            quant_config=quant_config,
-            device=w1.device,
        ),
    )

--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -12,6 +12,7 @@ import torch
 import torch.utils.benchmark as benchmark
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
@@ -198,8 +199,7 @@ def bench_run(
        kernel = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
            CutlassExpertsFp4(
-                out_dtype=dtype,
+                make_dummy_moe_config(),
-                max_experts_per_worker=e,
                quant_config=quant_config,
            ),
        )
@@ -244,8 +244,7 @@ def bench_run(
        kernel = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
            CutlassExpertsFp4(
-                out_dtype=dtype,
+                make_dummy_moe_config(),
-                max_experts_per_worker=e,
                quant_config=quant_config,
            ),
        )

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
@@ -134,13 +135,13 @@ def bench_run(
        fn = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=a.dtype,
+                moe_config=make_dummy_moe_config(
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                    num_experts=w2.shape[0],
-                e=w2.shape[0],
+                    hidden_dim=w2.shape[1],
-                n=w2.shape[2],
+                    intermediate_size_per_partition=w2.shape[2],
-                k=w2.shape[1],
+                    in_dtype=a.dtype,
+                ),
                quant_config=quant_config,
-                device=w1.device,
            ),
        )
@@ -166,13 +167,13 @@ def bench_run(
        fn = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=a.dtype,
+                moe_config=make_dummy_moe_config(
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                    num_experts=w2.shape[0],
-                e=w2.shape[0],
+                    hidden_dim=w2.shape[1],
-                n=w2.shape[2],
+                    intermediate_size_per_partition=w2.shape[2],
-                k=w2.shape[1],
+                    in_dtype=a.dtype,
+                ),
                quant_config=quant_config,
-                device=w1.device,
            ),
        )

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -16,10 +16,16 @@ import torch
 from ray.experimental.tqdm_ray import tqdm
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
    FusedMoEQuantConfig,
+    RoutingMethodType,
    _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+    TritonOrDeepGemmExperts,
+)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
@@ -194,10 +200,33 @@ def benchmark_config(
            block_shape=block_quant_shape,
        )
+        deep_gemm_experts = mk.FusedMoEModularKernel(
+            prepare_finalize=MoEPrepareAndFinalizeNoEP(),
+            fused_experts=TritonOrDeepGemmExperts(
+                moe_config=FusedMoEConfig(
+                    num_experts=num_experts,
+                    experts_per_token=topk,
+                    hidden_dim=hidden_size,
+                    intermediate_size_per_partition=shard_intermediate_size,
+                    num_local_experts=num_experts,
+                    activation="silu",
+                    parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+                    in_dtype=init_dtype,
+                    routing_method=RoutingMethodType.TopK,
+                ),
+                quant_config=quant_config,
+            ),
+        )
        with override_config(config):
            topk_weights, topk_ids, token_expert_indices = fused_topk(
                x, input_gating, topk, renormalize=not use_deep_gemm
            )
+            if use_deep_gemm:
+                return deep_gemm_experts(
+                    x, w1, w2, topk_weights, topk_ids, inplace=True
+                )
            return fused_experts(
                x,
                w1,
@@ -206,7 +235,6 @@ def benchmark_config(
                topk_ids,
                inplace=True,
                quant_config=quant_config,
-                allow_deep_gemm=use_deep_gemm,
            )
    # JIT compilation & warmup

--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -85,10 +85,10 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 |--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
 | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
 | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
-| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
+| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
 | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
 | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
-| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
+| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -43,7 +43,7 @@ from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
    kFp8StaticTensorSym,
-    kNvfp4Quant,
+    kNvfp4Dynamic,
 )
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
@@ -215,7 +215,7 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
 class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
    """Test model for AttentionNvfp4QuantPattern fusion."""
-    quant_key = kNvfp4Quant
+    quant_key = kNvfp4Dynamic
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -468,7 +468,7 @@ def test_attention_quant_pattern(
    # Note: for fp8, fully_replaced=False because query quant ops remain in graph.
    # Only output quant ops are fused into attention.
-    test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Quant)
+    test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Dynamic)
    # access the underlying `AttnFusionPass` on the `LazyInitPass`
    assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)

--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
    kFp8StaticTensorSym,
-    kNvfp4Quant,
+    kNvfp4Dynamic,
 )
 from vllm.platforms import current_platform
@@ -134,11 +134,11 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
    def ops_in_model_before(self):
        return [
            SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
-            QUANT_OPS[kNvfp4Quant],
+            QUANT_OPS[kNvfp4Dynamic],
        ]
    def ops_in_model_after(self):
-        return [FUSED_OPS[kNvfp4Quant]]
+        return [FUSED_OPS[kNvfp4Dynamic]]
 class TestSiluMulGroupFp8QuantModel(torch.nn.Module):

--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
@@ -3,3 +3,6 @@ accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "0"
+  VLLM_USE_DEEP_GEMM: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab
 env:
  VLLM_USE_DEEP_GEMM: "1"
  VLLM_USE_DEEP_GEMM_MOE: "1"
-  VLLM_USE_DEEP_GEMM_E8M0: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab
 env:
  VLLM_USE_DEEP_GEMM: "1"
  VLLM_USE_DEEP_GEMM_MOE: "1"
-  VLLM_USE_DEEP_GEMM_E8M0: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
@@ -3,3 +3,5 @@ accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
@@ -4,7 +4,5 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
-  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml
@@ -4,7 +4,5 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
-  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "latency"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
@@ -4,6 +4,4 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
-  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
  VLLM_TEST_FORCE_FP8_MARLIN: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
@@ -4,5 +4,5 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "0"
  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml
@@ -4,7 +4,5 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
-  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
@@ -4,6 +4,4 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
-  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
  VLLM_TEST_FORCE_FP8_MARLIN: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
@@ -4,5 +4,5 @@ num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
 env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "0"
  VLLM_USE_DEEP_GEMM: "0"
-  VLLM_USE_DEEP_GEMM_MOE: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
@@ -3,3 +3,5 @@ accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "0"