Commit b9fdbcfa authored by dongcl

Adapt to Megatron v0.11.0

parent d05234e0
@@ -37,6 +37,7 @@ def gpt_model_init(
rotary_percent: float = 1.0,
rotary_base: int = 10000,
rope_scaling: bool = False,
+rope_scaling_factor: float = 8.0,
scatter_embedding_sequence_parallel: bool = True,
seq_len_interpolation_factor: Optional[float] = None,
mtp_spec: ModuleSpec = None
@@ -83,9 +84,12 @@ def gpt_model_init(
seq_len_interpolation_factor=seq_len_interpolation_factor,
rotary_base=rotary_base,
rope_scaling=rope_scaling,
+rope_scaling_factor=rope_scaling_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
+# Cache for RoPE tensors which do not change between iterations.
+self.rotary_pos_emb_cache = {}
# Transformer.
self.decoder = TransformerBlock(
config=self.config,
......
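For reference, a minimal standalone sketch (not Megatron's or this repository's implementation) of the two ideas added above: dividing the RoPE inverse frequencies by `rope_scaling_factor`, and caching the resulting embeddings, which do not change between iterations. `SimpleRotaryEmbedding` and the plain linear-scaling rule are illustrative assumptions; only `rope_scaling_factor`, `rotary_base`, and the cache idea come from the diff.

```python
import torch


class SimpleRotaryEmbedding:
    """Hypothetical stand-in for a RoPE module; not Megatron's RotaryEmbedding."""

    def __init__(self, dim: int, base: float = 10000.0,
                 rope_scaling: bool = False, rope_scaling_factor: float = 8.0):
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        if rope_scaling:
            # Simplified linear scaling; real implementations (e.g. Llama-3 style RoPE
            # scaling) rescale only the low-frequency components.
            inv_freq = inv_freq / rope_scaling_factor
        self.inv_freq = inv_freq
        # Cache for RoPE tensors which do not change between iterations.
        self.cache = {}

    def __call__(self, seq_len: int) -> torch.Tensor:
        if seq_len not in self.cache:
            positions = torch.arange(seq_len, dtype=torch.float32)
            freqs = torch.outer(positions, self.inv_freq)  # [seq_len, dim // 2]
            self.cache[seq_len] = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]
        return self.cache[seq_len]


rope = SimpleRotaryEmbedding(dim=64, rope_scaling=True, rope_scaling_factor=8.0)
emb = rope(seq_len=128)  # a second call with the same seq_len returns the cached tensor
```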
@@ -189,7 +189,7 @@ def transformer_block_forward(
hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
# Final layer norm.
-if self.final_layernorm is not None:
+if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
hidden_states = self.final_layernorm(hidden_states)
# TENorm produces a "viewed" tensor. This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
......
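A hypothetical sketch of what the new guard permits: when `move_final_norm_out_of_block` is set, the block returns un-normalized hidden states and the caller applies the final layer norm itself, for example after an extra consumer of the raw activations (such as an MTP head configured via `mtp_spec` above) has run. `run_decoder` and its signature are illustrative assumptions, not this repository's API.

```python
import torch
from torch import nn


def run_decoder(decoder: nn.Module,
                final_layernorm: nn.Module,
                hidden_states: torch.Tensor,
                move_final_norm_out_of_block: bool) -> torch.Tensor:
    # The block skips its final norm when the flag is set (see the hunk above),
    # so the caller normalizes here, after any extra users of the raw hidden states.
    hidden_states = decoder(hidden_states)
    if move_final_norm_out_of_block:
        hidden_states = final_layernorm(hidden_states)
    return hidden_states


x = torch.randn(2, 4, 8)
out = run_decoder(nn.Identity(), nn.LayerNorm(8), x, move_final_norm_out_of_block=True)
```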
@@ -149,12 +149,6 @@ class TransformerConfig(ModelParallelConfig):
"""Standard deviation of the zero mean normal for the default initialization method, not used if
init_method and output_layer_init_method are provided."""
-init_model_with_meta_device: bool = False
-"""
-If True, initializes the model with the meta device. This is helpful for
-training of very large models. This feature only works when custom fsdp is turned on.
-"""
####################
# mixed-precision
####################
@@ -360,10 +354,7 @@ class TransformerConfig(ModelParallelConfig):
moe_token_dispatcher_type: str = "allgather"
"""The type of token dispatcher to use. The default is 'allgather'.
-Options are 'allgather','alltoall' and 'flex'."""
-moe_enable_deepep: bool = False
-"""[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models."""
+Options are 'allgather' and 'alltoall'."""
moe_per_layer_logging: bool = False
"""Enable per-layer logging for MoE, currently supports auxiliary loss and z loss."""
@@ -472,9 +463,6 @@ class TransformerConfig(ModelParallelConfig):
inference_rng_tracker: bool = False
""" Whether we should instantiate a separate RNG tracker for inference. """
-use_custom_fsdp: bool = False
-""" Whether to use custom fsdp for training. """
def __post_init__(self):
"""Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
@@ -519,10 +507,6 @@ class TransformerConfig(ModelParallelConfig):
if self.moe_ffn_hidden_size is None:
self.moe_ffn_hidden_size = self.ffn_hidden_size
-if self.moe_enable_deepep:
-if self.moe_token_dispatcher_type != "flex":
-raise ValueError("DeepEP backend is only supported with flex token dispatcher.")
if self.moe_shared_expert_intermediate_size is not None:
if self.moe_shared_expert_intermediate_size <= 0:
raise ValueError(
@@ -538,6 +522,10 @@ class TransformerConfig(ModelParallelConfig):
)
if self.moe_expert_capacity_factor is not None:
+if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
+raise ValueError(
+'moe_expert_capacity_factor only works with alltoall token dispatcher'
+)
if self.moe_expert_capacity_factor < 0:
self.moe_expert_capacity_factor = None
if self.moe_router_load_balancing_type not in ["aux_loss", "seq_aux_loss", "none"]:
......
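A standalone sketch that mirrors the capacity-factor validation shown above, so the behaviour can be exercised without a Megatron checkout. `MoEConfigSketch` is a hypothetical stand-in for `TransformerConfig`; the checks themselves are copied from the hunk.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MoEConfigSketch:
    """Hypothetical minimal config; only the fields needed for this check."""

    moe_token_dispatcher_type: str = "allgather"
    moe_expert_capacity_factor: Optional[float] = None

    def __post_init__(self):
        if self.moe_expert_capacity_factor is not None:
            if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]:
                raise ValueError(
                    'moe_expert_capacity_factor only works with alltoall token dispatcher'
                )
            # A negative factor is treated as "unset" rather than as an error.
            if self.moe_expert_capacity_factor < 0:
                self.moe_expert_capacity_factor = None


MoEConfigSketch(moe_token_dispatcher_type="alltoall", moe_expert_capacity_factor=1.0)  # ok
# MoEConfigSketch(moe_expert_capacity_factor=1.0)  # ValueError: wrong dispatcher type
```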