[Model] Add MoE support for NemotronH (#25863)

Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>

[Model] Add MoE support for NemotronH (#25863)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Commit 61089465 (unverified signature) · authored by tomeras91 on Oct 23, 2025 · committed by GitHub on Oct 23, 2025 · related hash: 88afa110
7 changed files
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -823,6 +823,8 @@ class FusedMoEConfig:

    has_bias: bool = False

+    is_act_and_mul: bool = True
+
    def __post_init__(self):
        if self.dp_size > 1:
            logger.debug_once(

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1647,6 +1647,7 @@ def fused_experts(

 SILU_NO_MUL: str = activation_without_mul("silu")
 GELU_NO_MUL: str = activation_without_mul("gelu")
+RELU2_NO_MUL: str = activation_without_mul("relu2")


 def _get_config_quant_dtype(
@@ -1914,7 +1915,8 @@ def fused_experts_impl(
            intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
        elif activation == GELU_NO_MUL:
            intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N))
-
+        elif activation == RELU2_NO_MUL:
+            intermediate_cache2 = torch.square(F.relu(intermediate_cache1.view(-1, N)))
        else:
            raise ValueError(f"Unsupported FusedMoe activation: {activation}.")


--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -411,11 +411,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
+        if self.moe.is_act_and_mul:
+            w13_up_dim = 2 * intermediate_size_per_partition
+        else:
+            w13_up_dim = intermediate_size_per_partition
        # Fused gate_up_proj (column parallel)
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
-                2 * intermediate_size_per_partition,
+                w13_up_dim,
                hidden_size,
                dtype=params_dtype,
            ),
@@ -425,9 +429,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        set_weight_attrs(w13_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w13_bias = torch.nn.Parameter(
-                torch.zeros(
-                    num_experts, 2 * intermediate_size_per_partition, dtype=params_dtype
-                ),
+                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w13_bias", w13_bias)
@@ -1073,6 +1075,7 @@ class FusedMoE(CustomOp):
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
+        is_act_and_mul: bool = True,
        enable_eplb: bool = False,
        num_redundant_experts: int = 0,
        has_bias: bool = False,
@@ -1263,6 +1266,7 @@ class FusedMoE(CustomOp):
            in_dtype=moe_in_dtype,
            max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
            has_bias=has_bias,
+            is_act_and_mul=is_act_and_mul,
        )
        self.moe_config = moe
        self.moe_quant_config: FusedMoEQuantConfig | None = None
@@ -1283,6 +1287,24 @@ class FusedMoE(CustomOp):
        assert isinstance(quant_method, FusedMoEMethodBase)
        self.quant_method = quant_method

+        if not self.moe_config.is_act_and_mul:
+            # Avoid circular import
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8MoEMethod,
+            )
+
+            if not isinstance(
+                quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
+            ):
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for unquantized "
+                    "and ModelOpt FP8 moe for now"
+                )
+            if not current_platform.is_cuda():
+                raise NotImplementedError(
+                    "is_act_and_mul=False is supported only for CUDA for now"
+                )
+
        if self.enable_eplb:
            from vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod

@@ -1531,7 +1553,10 @@ class FusedMoE(CustomOp):
    ):
        # Index the loaded weight for tp sharding.
        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+        if self.moe_config.is_act_and_mul:
            shard_size = expert_data.shape[shard_dim] // 2
+        else:
+            shard_size = expert_data.shape[shard_dim]
        if not load_full:
            loaded_weight = loaded_weight.narrow(
                shard_dim, shard_size * tp_rank, shard_size

--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -354,7 +354,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):

        self.cutlass_fp8_supported = cutlass_fp8_supported()
        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
-        if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
+        if (
+            envs.VLLM_USE_FLASHINFER_MOE_FP8
+            and has_flashinfer_moe()
+            and self.moe.is_act_and_mul
+        ):
            self.flashinfer_moe_backend = get_flashinfer_moe_backend()
            logger.info_once(
                f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
@@ -405,10 +409,15 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
        )
        weight_loader = extra_weight_attrs.get("weight_loader")

+        if self.moe.is_act_and_mul:
+            w13_up_dim = 2 * intermediate_size_per_partition
+        else:
+            w13_up_dim = intermediate_size_per_partition
+
        w13_weight = ModelWeightParameter(
            data=torch.empty(
                num_experts,
-                2 * intermediate_size_per_partition,
+                w13_up_dim,
                hidden_size,
                dtype=weight_dtype,
            ),
@@ -433,11 +442,16 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):

        if self.quant_config.is_checkpoint_fp8_serialized:
            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
-            # Allocate 2 scales for w1 and w3 respectively.
+            # For gated MoE, allocate 2 scales for w1 and w3 respectively.
            # They will be combined to a single scale after weight loading.
+            # For non-gated MoE, allocate 1 scale for w13.
+            if self.moe.is_act_and_mul:
+                w13_weight_scale_shape = (num_experts, 2)
+            else:
+                w13_weight_scale_shape = (num_experts, 1)
            w13_weight_scale = PerTensorScaleParameter(
                data=torch.full(
-                    (num_experts, 2),
+                    w13_weight_scale_shape,
                    1.0,
                    dtype=torch.float32,
                ),
@@ -485,7 +499,14 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
            # Fp8 moe kernel needs single weight scale for w13 per expert.
            # We take the max of the w1 and w3 scales
            # then dequant and requant each expert.
-            if layer.w13_weight_scale.dim() == 2:
+            if (
+                layer.w13_weight_scale.dim() == 2
+                and layer.w13_weight_scale.shape[1] == 2
+            ):
+                assert self.moe.is_act_and_mul, (
+                    "w13_weight_scale should have 2 elements per expert "
+                    "only for gated MoE"
+                )
                # Get the maximum scale across w1 and w3 for each expert
                max_w13_scales = layer.w13_weight_scale.max(dim=1).values


--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -673,7 +673,9 @@ class MixtureOfExperts(Protocol):


 def is_mixture_of_experts(model: object) -> TypeIs[MixtureOfExperts]:
-    return isinstance(model, MixtureOfExperts)
+    return (
+        isinstance(model, MixtureOfExperts) and getattr(model, "num_moe_layers", 0) > 0
+    )


 @runtime_checkable

--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -185,6 +185,15 @@ class NemotronHConfig(PretrainedConfig):
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
+        n_routed_experts=8,
+        n_shared_experts=1,
+        moe_intermediate_size=7688,
+        moe_shared_expert_intermediate_size=7688,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        norm_topk_prob=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
@@ -241,6 +250,15 @@ class NemotronHConfig(PretrainedConfig):
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
+        self.num_experts_per_tok = num_experts_per_tok
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.norm_topk_prob = norm_topk_prob

        super().__init__(
            pad_token_id=pad_token_id,
@@ -258,5 +276,7 @@ class NemotronHConfig(PretrainedConfig):
            else "attention"
            if self.hybrid_override_pattern[i] == "*"
            else "mlp"
+            if self.hybrid_override_pattern[i] == "-"
+            else "moe"
            for i in range(self.num_hidden_layers)
        ]