[Feature]: Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)

Signed-off-by: Shawn Tan <shawntan@ibm.com>

[Feature]: Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)
Signed-off-by: Shawn Tan <shawntan@ibm.com>
4d542402 · Shawn Tan · GitHub · 3e750697 · 4d542402
Unverified Commit 4d542402 authored Jun 16, 2025 by Shawn Tan Committed by GitHub Jun 16, 2025
Show whitespace changes
Inline Side-by-side

Showing with 40 additions and 24 deletions

vllm/model_executor/models/granitemoehybrid.py vllm/model_executor/models/granitemoehybrid.py +40 -24

No files found.
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -67,6 +67,8 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
                                activation=config.hidden_act,
                                quant_config=quant_config)

+        self.block_sparse_moe = None
+        if getattr(config, "num_local_experts", 0) > 0:
            self.block_sparse_moe = GraniteMoeMoE(
                num_experts=config.num_local_experts,
                top_k=config.num_experts_per_tok,
@@ -105,13 +107,19 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        if self.shared_mlp is None:
+            if self.block_sparse_moe is not None:
                hidden_states = self.block_sparse_moe(hidden_states)
+            # else: skip
        else:
            # create a copy since block_sparse_moe modifies in-place
+            if self.block_sparse_moe is not None:
                moe_hidden_states = hidden_states.clone()
                moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
-            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(
+                    hidden_states)
                del moe_hidden_states
+            else:
+                hidden_states = self.shared_mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier

        return hidden_states, residual
@@ -137,6 +145,8 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn")

+        self.block_sparse_moe = None
+        if getattr(config, "num_local_experts", 0) > 0:
            self.block_sparse_moe = GraniteMoeMoE(
                num_experts=config.num_local_experts,
                top_k=config.num_experts_per_tok,
@@ -178,13 +188,19 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        if self.shared_mlp is None:
+            if self.block_sparse_moe is not None:
                hidden_states = self.block_sparse_moe(hidden_states)
+            # else: skip
        else:
            # create a copy since block_sparse_moe modifies in-place
+            if self.block_sparse_moe is not None:
                moe_hidden_states = hidden_states.clone()
                moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
-            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(
+                    hidden_states)
                del moe_hidden_states
+            else:
+                hidden_states = self.shared_mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier

        return hidden_states, residual