Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
evt_fugx1
dcu_megatron
Commits
2ceeaafd
Commit
2ceeaafd
authored
Jun 18, 2025
by
dongcl
Browse files
fix bug caused by mtp > 1 when using moe a2a overlap
parent
040838a0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
2 deletions
+12
-2
dcu_megatron/core/transformer/transformer_layer.py
dcu_megatron/core/transformer/transformer_layer.py
+12
-2
No files found.
dcu_megatron/core/transformer/transformer_layer.py
View file @
2ceeaafd
...
@@ -205,6 +205,15 @@ class TransformerLayer(MegatronCoreTransformerLayer):
...
@@ -205,6 +205,15 @@ class TransformerLayer(MegatronCoreTransformerLayer):
]
]
return
tuple
(
outputs
)
return
tuple
(
outputs
)
def
_submodule_shared_expert_forward
(
self
,
pre_mlp_layernorm_output
):
"""
Performs a forward pass for shared experts.
"""
shared_expert_output
=
None
if
self
.
mlp
.
use_shared_expert
and
not
self
.
mlp
.
shared_expert_overlap
:
shared_expert_output
=
self
.
mlp
.
shared_experts
(
pre_mlp_layernorm_output
)
return
shared_expert_output
def
_submodule_dispatch_forward
(
self
,
tokens_per_expert
,
permutated_local_input_tokens
):
def
_submodule_dispatch_forward
(
self
,
tokens_per_expert
,
permutated_local_input_tokens
):
"""
"""
Dispatches tokens to the appropriate experts based on the router output.
Dispatches tokens to the appropriate experts based on the router output.
...
@@ -234,13 +243,14 @@ class TransformerLayer(MegatronCoreTransformerLayer):
...
@@ -234,13 +243,14 @@ class TransformerLayer(MegatronCoreTransformerLayer):
and optional shared-expert computations.
and optional shared-expert computations.
"""
"""
shared_expert_output
=
None
shared_expert_output
=
None
if
self
.
mlp
.
use_shared_expert
and
not
self
.
mlp
.
shared_expert_overlap
:
shared_expert_output
=
self
.
mlp
.
shared_experts
(
pre_mlp_layernorm_output
)
(
dispatched_input
,
tokens_per_expert
)
=
(
(
dispatched_input
,
tokens_per_expert
)
=
(
self
.
mlp
.
token_dispatcher
.
dispatch_postprocess
(
tokens_per_expert
,
global_input_tokens
)
self
.
mlp
.
token_dispatcher
.
dispatch_postprocess
(
tokens_per_expert
,
global_input_tokens
)
)
)
expert_output
,
mlp_bias
=
self
.
mlp
.
experts
(
dispatched_input
,
tokens_per_expert
)
expert_output
,
mlp_bias
=
self
.
mlp
.
experts
(
dispatched_input
,
tokens_per_expert
)
expert_output
=
self
.
mlp
.
token_dispatcher
.
combine_preprocess
(
expert_output
)
expert_output
=
self
.
mlp
.
token_dispatcher
.
combine_preprocess
(
expert_output
)
if
self
.
mlp
.
use_shared_expert
and
not
self
.
mlp
.
shared_expert_overlap
:
shared_expert_output
=
self
.
mlp
.
shared_experts
(
pre_mlp_layernorm_output
)
return
expert_output
,
shared_expert_output
,
mlp_bias
return
expert_output
,
shared_expert_output
,
mlp_bias
def
_submodule_combine_forward
(
self
,
hidden_states
):
def
_submodule_combine_forward
(
self
,
hidden_states
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment