evt_fugx1 / dcu_megatron

Commit b9fdbcfa authored Mar 25, 2025 by dongcl
Parent: d05234e0

Adapt to megatron v0.11.0
Showing 3 changed files with 10 additions and 18 deletions:

- dcu_megatron/core/models/gpt/gpt_model.py (+4, -0)
- dcu_megatron/core/transformer/transformer_block.py (+1, -1)
- dcu_megatron/core/transformer/transformer_config.py (+5, -17)
dcu_megatron/core/models/gpt/gpt_model.py

```diff
@@ -37,6 +37,7 @@ def gpt_model_init(
     rotary_percent: float = 1.0,
     rotary_base: int = 10000,
     rope_scaling: bool = False,
+    rope_scaling_factor: float = 8.0,
     scatter_embedding_sequence_parallel: bool = True,
     seq_len_interpolation_factor: Optional[float] = None,
     mtp_spec: ModuleSpec = None
@@ -83,9 +84,12 @@ def gpt_model_init(
         seq_len_interpolation_factor=seq_len_interpolation_factor,
         rotary_base=rotary_base,
         rope_scaling=rope_scaling,
+        rope_scaling_factor=rope_scaling_factor,
         use_cpu_initialization=self.config.use_cpu_initialization,
     )
+    # Cache for RoPE tensors which do not change between iterations.
+    self.rotary_pos_emb_cache = {}
 
     # Transformer.
     self.decoder = TransformerBlock(
         config=self.config,
```
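The second hunk introduces self.rotary_pos_emb_cache, described in the diff as a cache for RoPE tensors that do not change between iterations. Below is a minimal, self-contained sketch of that caching pattern, not code from this commit: the class name, method names, and the toy inverse-frequency computation are illustrative assumptions; only the dictionary keyed by sequence length mirrors the added attribute.

```python
import torch


class RotaryCacheSketch:
    """Illustrative stand-in for the rotary_pos_emb_cache added in gpt_model_init.

    Assumption: the cache is keyed by sequence length, since the RoPE table
    depends only on seq_len and the (fixed) rotary dimensions.
    """

    def __init__(self, dim: int = 8, base: float = 10000.0):
        self.dim = dim
        self.base = base
        self.rotary_pos_emb_cache = {}  # seq_len -> precomputed RoPE tensor

    def _compute_rope(self, seq_len: int) -> torch.Tensor:
        # Standard inverse-frequency RoPE table of shape [seq_len, dim // 2].
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.float32) / self.dim))
        positions = torch.arange(seq_len, dtype=torch.float32)
        return torch.outer(positions, inv_freq)

    def get(self, seq_len: int) -> torch.Tensor:
        # Reuse the tensor when the same sequence length recurs between iterations.
        if seq_len not in self.rotary_pos_emb_cache:
            self.rotary_pos_emb_cache[seq_len] = self._compute_rope(seq_len)
        return self.rotary_pos_emb_cache[seq_len]


cache = RotaryCacheSketch()
assert cache.get(16) is cache.get(16)  # the second call is a cache hit
```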
dcu_megatron/core/transformer/transformer_block.py

```diff
@@ -189,7 +189,7 @@ def transformer_block_forward(
             hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
 
     # Final layer norm.
-    if self.final_layernorm is not None:
+    if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
        hidden_states = self.final_layernorm(hidden_states)
        # TENorm produces a "viewed" tensor. This will result in schedule.py's
        # deallocate_output_tensor() throwing an error, so a viewless tensor is
```
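The changed condition gates the block's final layer norm behind a move_final_norm_out_of_block flag, so a caller can apply the norm outside the block when the flag is set (for example by a multi-token-prediction head; that motivation is an assumption, the commit itself only changes the condition). A hedged, self-contained sketch of the gating, with made-up module and dimension names:

```python
import torch
from torch import nn


class BlockWithGatedFinalNorm(nn.Module):
    """Toy module showing the gating pattern from the diff above; not the repo's TransformerBlock."""

    def __init__(self, hidden_size: int, move_final_norm_out_of_block: bool):
        super().__init__()
        self.final_layernorm = nn.LayerNorm(hidden_size)
        self.move_final_norm_out_of_block = move_final_norm_out_of_block

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Same shape of condition as the new line in transformer_block_forward:
        # apply the final norm here only when it has not been moved out of the block.
        if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
            hidden_states = self.final_layernorm(hidden_states)
        return hidden_states


x = torch.randn(2, 4, 16)
normed_inside = BlockWithGatedFinalNorm(16, move_final_norm_out_of_block=False)(x)
left_to_caller = BlockWithGatedFinalNorm(16, move_final_norm_out_of_block=True)(x)  # caller applies the norm later
```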
dcu_megatron/core/transformer/transformer_config.py

```diff
@@ -149,12 +149,6 @@ class TransformerConfig(ModelParallelConfig):
     """Standard deviation of the zero mean normal for the default initialization method, not used if
     init_method and output_layer_init_method are provided."""
 
-    init_model_with_meta_device: bool = False
-    """
-    If True, initializes the model with the meta device. This is helpful for
-    training of very large models. This feature is only works when custom fsdp is turned on.
-    """
-
     ####################
     # mixed-precision
     ####################
@@ -360,10 +354,7 @@
     moe_token_dispatcher_type: str = "allgather"
     """The type of token dispatcher to use. The default is 'allgather'.
-    Options are 'allgather','alltoall' and 'flex'."""
-
-    moe_enable_deepep: bool = False
-    """[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models."""
+    Options are 'allgather' and 'alltoall'."""
 
     moe_per_layer_logging: bool = False
     """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss."""
 
@@ -472,9 +463,6 @@
     inference_rng_tracker: bool = False
     """ Whether we should instantiate a separate RNG tracker for inference. """
 
-    use_custom_fsdp: bool = False
-    """ Whether to use custom fsdp for training. """
-
     def __post_init__(self):
         """Python dataclass method that is used to modify attributes after initialization.
         See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
@@ -519,10 +507,6 @@
         if self.moe_ffn_hidden_size is None:
             self.moe_ffn_hidden_size = self.ffn_hidden_size
 
-        if self.moe_enable_deepep:
-            if self.moe_token_dispatcher_type != "flex":
-                raise ValueError("DeepEP backend is only supported with flex token dispatcher.")
-
         if self.moe_shared_expert_intermediate_size is not None:
             if self.moe_shared_expert_intermediate_size <= 0:
                 raise ValueError(
@@ -538,6 +522,10 @@
                 )
 
         if self.moe_expert_capacity_factor is not None:
+            if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
+                raise ValueError(
+                    'moe_expert_capacity_factor only works with alltoall token dispatcher'
+                )
             if self.moe_expert_capacity_factor < 0:
                 self.moe_expert_capacity_factor = None
             if self.moe_router_load_balancing_type not in ["aux_loss", "seq_aux_loss", "none"]:
```
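The last hunk's __post_init__ logic ties moe_expert_capacity_factor to the 'alltoall' and 'alltoall_seq' dispatchers and treats negative values as unset. A standalone sketch of those checks (the function name and example calls are illustrative; the validation mirrors the lines shown above):

```python
from typing import Optional


def check_moe_expert_capacity_factor(
    moe_expert_capacity_factor: Optional[float],
    moe_token_dispatcher_type: str,
) -> Optional[float]:
    """Mirrors the capacity-factor checks in TransformerConfig.__post_init__ (sketch only)."""
    if moe_expert_capacity_factor is not None:
        if moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
            raise ValueError(
                'moe_expert_capacity_factor only works with alltoall token dispatcher'
            )
        if moe_expert_capacity_factor < 0:
            # Negative values are treated as "unset".
            moe_expert_capacity_factor = None
    return moe_expert_capacity_factor


print(check_moe_expert_capacity_factor(-1.0, "alltoall"))  # prints None: negative means unset
check_moe_expert_capacity_factor(1.25, "allgather")        # raises ValueError
```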