Commit b9fdbcfa authored by dongcl

Adapt to Megatron v0.11.0

parent d05234e0
@@ -37,6 +37,7 @@ def gpt_model_init(
rotary_percent: float = 1.0,
rotary_base: int = 10000,
rope_scaling: bool = False,
+rope_scaling_factor: float = 8.0,
scatter_embedding_sequence_parallel: bool = True,
seq_len_interpolation_factor: Optional[float] = None,
mtp_spec: ModuleSpec = None
@@ -83,9 +84,12 @@ def gpt_model_init(
seq_len_interpolation_factor=seq_len_interpolation_factor,
rotary_base=rotary_base,
rope_scaling=rope_scaling,
+rope_scaling_factor=rope_scaling_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
+# Cache for RoPE tensors which do not change between iterations.
+self.rotary_pos_emb_cache = {}
# Transformer.
self.decoder = TransformerBlock(
config=self.config,
......
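For reference, a minimal standalone sketch (not Megatron's or this repository's implementation) of the two ideas added above: dividing the RoPE inverse frequencies by `rope_scaling_factor`, and caching the resulting embeddings, which do not change between iterations. `SimpleRotaryEmbedding` and the plain linear-scaling rule are illustrative assumptions; only `rope_scaling_factor`, `rotary_base`, and the cache idea come from the diff.

```python
import torch


class SimpleRotaryEmbedding:
    """Hypothetical stand-in for a RoPE module; not Megatron's RotaryEmbedding."""

    def __init__(self, dim: int, base: float = 10000.0,
                 rope_scaling: bool = False, rope_scaling_factor: float = 8.0):
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        if rope_scaling:
            # Simplified linear scaling; real implementations (e.g. Llama-3 style RoPE
            # scaling) rescale only the low-frequency components.
            inv_freq = inv_freq / rope_scaling_factor
        self.inv_freq = inv_freq
        # Cache for RoPE tensors which do not change between iterations.
        self.cache = {}

    def __call__(self, seq_len: int) -> torch.Tensor:
        if seq_len not in self.cache:
            positions = torch.arange(seq_len, dtype=torch.float32)
            freqs = torch.outer(positions, self.inv_freq)  # [seq_len, dim // 2]
            self.cache[seq_len] = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]
        return self.cache[seq_len]


rope = SimpleRotaryEmbedding(dim=64, rope_scaling=True, rope_scaling_factor=8.0)
emb = rope(seq_len=128)  # a second call with the same seq_len returns the cached tensor
```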
@@ -189,7 +189,7 @@ def transformer_block_forward(
hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
# Final layer norm.
-if self.final_layernorm is not None:
+if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
hidden_states = self.final_layernorm(hidden_states)
# TENorm produces a "viewed" tensor. This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
......
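A hypothetical sketch of what the new guard permits: when `move_final_norm_out_of_block` is set, the block returns un-normalized hidden states and the caller applies the final layer norm itself, for example after an extra consumer of the raw activations (such as an MTP head configured via `mtp_spec` above) has run. `run_decoder` and its signature are illustrative assumptions, not this repository's API.

```python
import torch
from torch import nn


def run_decoder(decoder: nn.Module,
                final_layernorm: nn.Module,
                hidden_states: torch.Tensor,
                move_final_norm_out_of_block: bool) -> torch.Tensor:
    # The block skips its final norm when the flag is set (see the hunk above),
    # so the caller normalizes here, after any extra users of the raw hidden states.
    hidden_states = decoder(hidden_states)
    if move_final_norm_out_of_block:
        hidden_states = final_layernorm(hidden_states)
    return hidden_states


x = torch.randn(2, 4, 8)
out = run_decoder(nn.Identity(), nn.LayerNorm(8), x, move_final_norm_out_of_block=True)
```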
@@ -149,12 +149,6 @@ class TransformerConfig(ModelParallelConfig):
"""Standard deviation of the zero mean normal for the default initialization method, not used if
init_method and output_layer_init_method are provided."""
-init_model_with_meta_device: bool = False
-"""
-If True, initializes the model with the meta device. This is helpful for
-training of very large models. This feature only works when custom fsdp is turned on.
-"""
####################
# mixed-precision
####################
@@ -360,10 +354,7 @@ class TransformerConfig(ModelParallelConfig):
moe_token_dispatcher_type: str = "allgather"
"""The type of token dispatcher to use. The default is 'allgather'.
-Options are 'allgather','alltoall' and 'flex'."""
-moe_enable_deepep: bool = False
-"""[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models."""
+Options are 'allgather' and 'alltoall'."""
moe_per_layer_logging: bool = False
"""Enable per-layer logging for MoE, currently supports auxiliary loss and z loss."""
@@ -472,9 +463,6 @@ class TransformerConfig(ModelParallelConfig):
inference_rng_tracker: bool = False
""" Whether we should instantiate a separate RNG tracker for inference. """
-use_custom_fsdp: bool = False
-""" Whether to use custom fsdp for training. """
def __post_init__(self):
"""Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
@@ -519,10 +507,6 @@ class TransformerConfig(ModelParallelConfig):
if self.moe_ffn_hidden_size is None:
self.moe_ffn_hidden_size = self.ffn_hidden_size
-if self.moe_enable_deepep:
-if self.moe_token_dispatcher_type != "flex":
-raise ValueError("DeepEP backend is only supported with flex token dispatcher.")
if self.moe_shared_expert_intermediate_size is not None:
if self.moe_shared_expert_intermediate_size <= 0:
raise ValueError(
@@ -538,6 +522,10 @@ class TransformerConfig(ModelParallelConfig):
)
if self.moe_expert_capacity_factor is not None:
+if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
+raise ValueError(
+'moe_expert_capacity_factor only works with alltoall token dispatcher'
+)
if self.moe_expert_capacity_factor < 0:
self.moe_expert_capacity_factor = None
if self.moe_router_load_balancing_type not in ["aux_loss", "seq_aux_loss", "none"]:
......
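A standalone sketch that mirrors the capacity-factor validation shown above, so the behaviour can be exercised without a Megatron checkout. `MoEConfigSketch` is a hypothetical stand-in for `TransformerConfig`; the checks themselves are copied from the hunk.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MoEConfigSketch:
    """Hypothetical minimal config; only the fields needed for this check."""

    moe_token_dispatcher_type: str = "allgather"
    moe_expert_capacity_factor: Optional[float] = None

    def __post_init__(self):
        if self.moe_expert_capacity_factor is not None:
            if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
                raise ValueError(
                    'moe_expert_capacity_factor only works with alltoall token dispatcher'
                )
            # A negative factor is treated as "unset" rather than as an error.
            if self.moe_expert_capacity_factor < 0:
                self.moe_expert_capacity_factor = None


MoEConfigSketch(moe_token_dispatcher_type="alltoall", moe_expert_capacity_factor=1.0)  # ok
# MoEConfigSketch(moe_expert_capacity_factor=1.0)  # ValueError: wrong dispatcher type
```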