Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4d542402
Unverified
Commit
4d542402
authored
Jun 16, 2025
by
Shawn Tan
Committed by
GitHub
Jun 16, 2025
Browse files
[Feature]:Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)
Signed-off-by:
Shawn Tan
<
shawntan@ibm.com
>
parent
3e750697
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
24 deletions
+40
-24
vllm/model_executor/models/granitemoehybrid.py
vllm/model_executor/models/granitemoehybrid.py
+40
-24
No files found.
vllm/model_executor/models/granitemoehybrid.py
View file @
4d542402
...
...
@@ -67,6 +67,8 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
activation
=
config
.
hidden_act
,
quant_config
=
quant_config
)
self
.
block_sparse_moe
=
None
if
getattr
(
config
,
"num_local_experts"
,
0
)
>
0
:
self
.
block_sparse_moe
=
GraniteMoeMoE
(
num_experts
=
config
.
num_local_experts
,
top_k
=
config
.
num_experts_per_tok
,
...
...
@@ -105,13 +107,19 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
post_attention_layernorm
(
hidden_states
)
if
self
.
shared_mlp
is
None
:
if
self
.
block_sparse_moe
is
not
None
:
hidden_states
=
self
.
block_sparse_moe
(
hidden_states
)
# else: skip
else
:
# create a copy since block_sparse_moe modifies in-place
if
self
.
block_sparse_moe
is
not
None
:
moe_hidden_states
=
hidden_states
.
clone
()
moe_hidden_states
=
self
.
block_sparse_moe
(
moe_hidden_states
)
hidden_states
=
moe_hidden_states
+
self
.
shared_mlp
(
hidden_states
)
hidden_states
=
moe_hidden_states
+
self
.
shared_mlp
(
hidden_states
)
del
moe_hidden_states
else
:
hidden_states
=
self
.
shared_mlp
(
hidden_states
)
hidden_states
=
residual
+
hidden_states
*
self
.
residual_multiplier
return
hidden_states
,
residual
...
...
@@ -137,6 +145,8 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.self_attn"
)
self
.
block_sparse_moe
=
None
if
getattr
(
config
,
"num_local_experts"
,
0
)
>
0
:
self
.
block_sparse_moe
=
GraniteMoeMoE
(
num_experts
=
config
.
num_local_experts
,
top_k
=
config
.
num_experts_per_tok
,
...
...
@@ -178,13 +188,19 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
post_attention_layernorm
(
hidden_states
)
if
self
.
shared_mlp
is
None
:
if
self
.
block_sparse_moe
is
not
None
:
hidden_states
=
self
.
block_sparse_moe
(
hidden_states
)
# else: skip
else
:
# create a copy since block_sparse_moe modifies in-place
if
self
.
block_sparse_moe
is
not
None
:
moe_hidden_states
=
hidden_states
.
clone
()
moe_hidden_states
=
self
.
block_sparse_moe
(
moe_hidden_states
)
hidden_states
=
moe_hidden_states
+
self
.
shared_mlp
(
hidden_states
)
hidden_states
=
moe_hidden_states
+
self
.
shared_mlp
(
hidden_states
)
del
moe_hidden_states
else
:
hidden_states
=
self
.
shared_mlp
(
hidden_states
)
hidden_states
=
residual
+
hidden_states
*
self
.
residual_multiplier
return
hidden_states
,
residual
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment