[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>

[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)
Signed-off-by: Bill Nell <bnell@redhat.com>
5963b98b · bnellnm · GitHub · e6585ddb · 5963b98b · 5963b98b
Unverified Commit 5963b98b authored Sep 17, 2025 by bnellnm Committed by GitHub Sep 17, 2025
20 changed files
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -95,8 +95,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    def prepare_async(
        self,
        a1: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
@@ -130,8 +128,10 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        repeat_cols = 4
        repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0)
+        # TODO(bnell): always pass quant_config.a1_scale?
        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1, (None if quant_config.per_act_token_quant else a1_scale),
+            a1, (None if quant_config.per_act_token_quant else
+                 quant_config.a1_scale),
            quant_dtype=quant_config.quant_dtype,
            per_act_token_quant=quant_config.per_act_token_quant,
            block_shape=quant_config.block_shape)
@@ -253,8 +253,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    def prepare(
        self,
        a1: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
@@ -264,8 +262,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    ) -> mk.PrepareResultType:
        hook, receiver = self.prepare_async(
            a1,
-            a1_scale,
-            a2_scale,
            topk_weights,
            topk_ids,
            num_experts,

--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -30,8 +30,6 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
    def prepare(
        self,
        a1: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
@@ -48,7 +46,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
            a1.mul_(topk_weights.to(a1.dtype))
        a1q, a1q_scale = moe_kernel_quantize_input(
-            a1, a1_scale, quant_config.quant_dtype,
+            a1, quant_config.a1_scale, quant_config.quant_dtype,
            quant_config.per_act_token_quant, quant_config.block_shape)
        return a1q, a1q_scale, None, None, None

--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -7,6 +7,8 @@ from typing import Optional
 import torch
 from vllm import envs
+from vllm.model_executor.layers.fused_moe.config import (
+    FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig)
 from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
@@ -305,21 +307,18 @@ def rocm_aiter_grouped_topk(
 def rocm_aiter_fused_experts(
-        hidden_states: torch.Tensor,
+    hidden_states: torch.Tensor,
-        w1: torch.Tensor,
+    w1: torch.Tensor,
-        w2: torch.Tensor,
+    w2: torch.Tensor,
-        topk_weights: torch.Tensor,
+    topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
+    topk_ids: torch.Tensor,
-        activation: str = "silu",
+    activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
+    apply_router_weight_on_input: bool = False,
-        use_fp8_w8a8: bool = False,
+    expert_map: Optional[torch.Tensor] = None,
-        per_channel_quant: bool = False,
+    quant_config: Optional[FusedMoEQuantConfig] = None,
-        w1_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
-        w2_scale: Optional[torch.Tensor] = None,
+    if quant_config is None:
-        a1_scale: Optional[torch.Tensor] = None,
+        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
-        a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[list[int]] = None,
-        expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
    activation_method = (ActivationMethod.SILU
                         if activation == "silu" else ActivationMethod.GELU)
@@ -333,7 +332,8 @@ def rocm_aiter_fused_experts(
        expert_mask = None
    # w8a8 per-channel quantization
-    if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+    if (quant_config.per_act_token_quant and apply_router_weight_on_input
+            and quant_config.use_fp8_w8a8):
        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
        # This applies topk_weights on the GEMM output of the first FC layer
        #  rather than the second FC.
@@ -349,8 +349,8 @@ def rocm_aiter_fused_experts(
            w2,
            topk_weights,
            topk_ids,
-            fc1_scale=w1_scale,
+            fc1_scale=quant_config.w1_scale,
-            fc2_scale=w2_scale,
+            fc2_scale=quant_config.w2_scale,
            fc1_smooth_scale=None,
            fc2_smooth_scale=None,
            a16=False,
@@ -362,14 +362,14 @@ def rocm_aiter_fused_experts(
        quant_method = QuantMethod.NO.value
        # w8a8 block-scaled
-        if block_shape is not None and use_fp8_w8a8:
+        if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
            assert not apply_router_weight_on_input, (
                "apply_router_weight_on_input is\
                not supported for block scaled moe")
-            assert w1_scale is not None
+            assert quant_config.w1_scale is not None
-            assert w2_scale is not None
+            assert quant_config.w2_scale is not None
            quant_method = QuantMethod.BLOCK_128x128.value
-        elif use_fp8_w8a8:
+        elif quant_config.use_fp8_w8a8:
            # Currently only per tensor quantization method is enabled.
            quant_method = QuantMethod.PER_TENSOR.value
@@ -390,10 +390,10 @@ def rocm_aiter_fused_experts(
            expert_mask=expert_mask,
            quant_method=quant_method,
            activation_method=activation_method,
-            w1_scale=w1_scale,
+            w1_scale=quant_config.w1_scale,
-            w2_scale=w2_scale,
+            w2_scale=quant_config.w2_scale,
-            a1_scale=a1_scale,
+            a1_scale=quant_config.a1_scale,
-            a2_scale=a2_scale,
+            a2_scale=quant_config.a2_scale,
            doweight_stage1=apply_router_weight_on_input)

--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py