Commit ce02cd51 authored by dongcl

Megatron v0.11.0

parent aeed6d97
......@@ -66,6 +66,10 @@ def unpermute(
):
```
### The project supports the [flux kernel](http://10.6.10.68/dcutoolkit/deeplearing/flux)
In tensor-parallel (TP) scenarios, users can enable the flux communication/computation fusion kernels for better training and inference performance. The project integrates flux by replacing transformer engine methods; to use it, set the environment variable USE_FLUX_OVERLAP=1 and set transformer-impl to transformer_engine.
### Usage
To run, go to the examples directory, which contains launch scripts for the relevant models; please download the required datasets yourself: https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g
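Before launching, it can help to confirm the flux prerequisites are in place. A minimal pre-flight sketch (assuming the flux kernels are installed as the Python package `flux`, the name used by this repository's version helpers):

```python
import importlib.util
import os

# Pre-flight check: the flux path is only taken when USE_FLUX_OVERLAP=1
# and the flux package is importable (mirrors the gating in the adaptation code).
def flux_overlap_ready() -> bool:
    if not int(os.getenv("USE_FLUX_OVERLAP", "0")):
        return False
    return importlib.util.find_spec("flux") is not None

if __name__ == "__main__":
    print("flux overlap active:", flux_overlap_ready())
```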
```
......
......@@ -99,7 +99,7 @@ class CoreAdaptation(MegatronAdaptationABC):
)
from ..core.models.gpt.gpt_model import (
gpt_model_forward,
gpt_model_init,
gpt_model_init_wrapper,
shared_embedding_or_mtp_embedding_weight
)
from ..training.utils import get_batch_on_this_tp_rank
......@@ -116,20 +116,20 @@ class CoreAdaptation(MegatronAdaptationABC):
# GPT Model
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward', gpt_model_forward)
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__', gpt_model_init)
MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
gpt_model_init_wrapper,
apply_wrapper=True)
from megatron.core.models.gpt.gpt_model import GPTModel
setattr(GPTModel, 'shared_embedding_or_mtp_embedding_weight', shared_embedding_or_mtp_embedding_weight)
def patch_core_transformers(self):
from ..core import transformer_block_init_wrapper, transformer_block_forward
from ..core import transformer_block_init_wrapper
from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch
# Transformer block
MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
transformer_block_init_wrapper)
MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.forward',
transformer_block_forward)
# Transformer config
MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
......@@ -141,9 +141,9 @@ class CoreAdaptation(MegatronAdaptationABC):
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
apply_wrapper=True)
# MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
# torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
# apply_wrapper=True)
MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
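Note that these registrations pass `torch.compile(...)` without a target; called this way, `torch.compile` acts as a decorator factory, and `apply_wrapper=True` presumably tells the adaptation layer to apply the returned wrapper to the registered Megatron function. A standalone sketch of the same pattern (the function below is illustrative, not Megatron code):

```python
import torch

# torch.compile without a function returns a wrapper that can be applied later.
compile_wrapper = torch.compile(mode="max-autotune-no-cudagraphs")

@compile_wrapper
def scale_rows(x: torch.Tensor, s: torch.Tensor) -> torch.Tensor:
    # Illustrative stand-in for a patched Megatron helper such as permute.
    return x * s.unsqueeze(-1)

out = scale_rows(torch.randn(4, 8), torch.arange(4, dtype=torch.float32))
```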
......@@ -166,7 +166,6 @@ class CoreAdaptation(MegatronAdaptationABC):
def patch_tensor_parallel(self):
from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
from ..core.tensor_parallel import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
from ..core.tensor_parallel import ColumnParallelLinearPatch, RowParallelLinearPatch, parallel_linear_init_wrapper
# VocabParallelEmbedding
MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
......@@ -188,17 +187,19 @@ class CoreAdaptation(MegatronAdaptationABC):
apply_wrapper=True)
# flux
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__",
parallel_linear_init_wrapper,
apply_wrapper=True)
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward",
ColumnParallelLinearPatch.forward)
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
parallel_linear_init_wrapper,
apply_wrapper=True)
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
RowParallelLinearPatch.forward)
if int(os.getenv("USE_FLUX_OVERLAP", "0")):
from ..core.tensor_parallel import (
FluxColumnParallelLinear,
FluxRowParallelLinear
)
from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
FluxColumnParallelLinear)
MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
FluxRowParallelLinear)
MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
get_gpt_layer_with_flux_spec)
def patch_training(self):
from ..training.tokenizer import build_tokenizer
......@@ -232,19 +233,22 @@ class LegacyAdaptation(MegatronAdaptationABC):
self.patch_legacy_models()
def patch_legacy_models(self):
from ..legacy.model.transformer import ParallelMLP, ParallelAttention
from ..legacy.model.transformer import ParallelMLPPatch, ParallelAttentionPatch
from ..legacy.model.utils import get_norm
# ParallelMLP
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
ParallelMLP.__init__)
ParallelMLPPatch.__init__)
MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
ParallelAttention.forward)
ParallelAttentionPatch.forward)
# rms_norm.RMSNorm
MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
torch.compile(mode="max-autotune-no-cudagraphs"),
apply_wrapper=True)
MegatronAdaptation.register('megatron.legacy.model.utils.get_norm',
get_norm)
MegatronAdaptation.execute()
from .transformer.transformer_block import transformer_block_init_wrapper, transformer_block_forward
from .transformer.transformer_block import transformer_block_init_wrapper
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
from typing import List
import torch
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
from typing import Literal
import torch
......
import warnings
from typing import Optional
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.multi_latent_attention import (
MLASelfAttention,
MLASelfAttentionSubmodules,
)
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import (
TransformerLayer,
TransformerLayerSubmodules,
)
from dcu_megatron.core.tensor_parallel.layers import FluxColumnParallelLinear, FluxRowParallelLinear
from megatron.core.utils import is_te_min_version
try:
from megatron.core.extensions.transformer_engine import (
TEDotProductAttention,
TENorm,
)
except ImportError:
warnings.warn('transformer_engine is not installed.')
try:
import apex # pylint: disable=unused-import
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
except ImportError:
warnings.warn('Apex is not installed.')
def get_gpt_layer_with_flux_spec(
num_experts: Optional[int] = None,
moe_grouped_gemm: Optional[bool] = False,
qk_layernorm: Optional[bool] = False,
multi_latent_attention: Optional[bool] = False,
fp8: Optional[str] = None, # pylint: disable=unused-arguments
moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
"""Use this spec to use flux modules (required for fp8 training).
Args:
num_experts (int, optional): Number of experts. Defaults to None.
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
Defaults to False.
Returns:
ModuleSpec: Module specification with flux modules
"""
if fp8 is not None:
warnings.warn(
'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated'
' and will be removed soon. Please update your code accordingly.'
)
mlp = get_mlp_module_flux_spec(
use_te=False,
num_experts=num_experts,
moe_grouped_gemm=moe_grouped_gemm,
moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
)
if multi_latent_attention:
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=MLASelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=MLASelfAttentionSubmodules(
linear_q_proj=FluxColumnParallelLinear,
linear_q_down_proj=FluxColumnParallelLinear,
linear_q_up_proj=FluxColumnParallelLinear,
linear_kv_down_proj=FluxColumnParallelLinear,
linear_kv_up_proj=FluxColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=FluxRowParallelLinear,
q_layernorm=TENorm if qk_layernorm else IdentityOp,
kv_layernorm=TENorm if qk_layernorm else IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
else:
# TENorm significantly harms convergence when used
# for QKLayerNorm if TE Version < 1.9;
# we instead use the Apex implementation.
qk_norm = TENorm if is_te_min_version("1.9.0") else FusedLayerNorm
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=FluxColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=FluxRowParallelLinear,
q_layernorm=qk_norm if qk_layernorm else IdentityOp,
k_layernorm=qk_norm if qk_layernorm else IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_flux_spec(
use_te: Optional[bool] = True,
num_experts: Optional[int] = None,
moe_grouped_gemm: Optional[bool] = False,
fp8: Optional[str] = None, # pylint: disable=unused-arguments
moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
"""Helper function to get module spec for MLP/MoE"""
if fp8 is not None:
warnings.warn(
'The fp8 argument in "_get_mlp_module_spec" has been deprecated'
' and will be removed soon. Please update your code accordingly.'
)
if num_experts is None:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=FluxColumnParallelLinear,
linear_fc2=FluxRowParallelLinear,
),
)
else:
# Mixture of experts with modules in megatron core.
return get_moe_module_spec(
use_te=True,
num_experts=num_experts,
moe_grouped_gemm=moe_grouped_gemm,
moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
)
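For orientation, a hedged sketch of how the flux layer spec above might be inspected; it assumes transformer_engine is installed (TENorm/TEDotProductAttention are imported above), and the argument values are illustrative only:

```python
# Build the dense (non-MoE) flux spec and check which linear impls it selected.
spec = get_gpt_layer_with_flux_spec(
    num_experts=None,            # dense MLP path
    moe_grouped_gemm=False,
    qk_layernorm=True,
    multi_latent_attention=False,
)
print(spec.submodules.self_attention.submodules.linear_qkv)  # FluxColumnParallelLinear
print(spec.submodules.mlp.submodules.linear_fc2)             # FluxRowParallelLinear
```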
import os
import logging
from typing import Literal, Optional
from functools import wraps
......@@ -16,160 +17,76 @@ from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear
from dcu_megatron.core.utils import tensor_slide
from dcu_megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor
from dcu_megatron.core.transformer.transformer_config import TransformerConfig
from dcu_megatron.core.tensor_parallel import FluxColumnParallelLinear
def gpt_model_init_wrapper(fn):
@wraps(fn)
def wrapper(self, *args, **kwargs):
fn(self, *args, **kwargs)
if (
self.post_process
and int(os.getenv("USE_FLUX_OVERLAP", "0"))
):
self.output_layer = FluxColumnParallelLinear(
self.config.hidden_size,
self.vocab_size,
config=self.config,
init_method=self.config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.pre_process
and self.share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
def gpt_model_init(
self,
config: TransformerConfig,
transformer_layer_spec: ModuleSpec,
vocab_size: int,
max_sequence_length: int,
pre_process: bool = True,
post_process: bool = True,
fp16_lm_cross_entropy: bool = False,
parallel_output: bool = True,
share_embeddings_and_output_weights: bool = False,
position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
rotary_percent: float = 1.0,
rotary_base: int = 10000,
rope_scaling: bool = False,
rope_scaling_factor: float = 8.0,
scatter_embedding_sequence_parallel: bool = True,
seq_len_interpolation_factor: Optional[float] = None,
mtp_spec: ModuleSpec = None
) -> None:
super(GPTModel, self).__init__(config=config)
if has_config_logger_enabled(config):
log_config_to_disk(config, locals(), prefix=type(self).__name__)
self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
self.vocab_size = vocab_size
self.max_sequence_length = max_sequence_length
self.pre_process = pre_process
self.post_process = post_process
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
self.parallel_output = parallel_output
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.position_embedding_type = position_embedding_type
# megatron core pipelining currently depends on model type
# TODO: remove this dependency ?
self.model_type = ModelType.encoder_or_decoder
# These 4 attributes are needed for TensorRT-LLM export.
self.max_position_embeddings = max_sequence_length
self.rotary_percent = rotary_percent
self.rotary_base = rotary_base
self.rotary_scaling = rope_scaling
if self.pre_process:
self.embedding = LanguageModelEmbedding(
config=self.config,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
position_embedding_type=position_embedding_type,
scatter_to_sequence_parallel=scatter_embedding_sequence_parallel,
)
if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
self.rotary_pos_emb = RotaryEmbedding(
kv_channels=self.config.kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=self.config.rotary_interleaved,
seq_len_interpolation_factor=seq_len_interpolation_factor,
rotary_base=rotary_base,
rope_scaling=rope_scaling,
rope_scaling_factor=rope_scaling_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
# Cache for RoPE tensors which do not change between iterations.
self.rotary_pos_emb_cache = {}
# Transformer.
self.decoder = TransformerBlock(
config=self.config,
spec=transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process
)
# Output
if post_process:
if self.config.defer_embedding_wgrad_compute:
# The embedding activation buffer preserves a reference to the input activations
# of the final embedding projection layer GEMM. It will hold the activations for
# all the micro-batches of a global batch for the last pipeline stage. Once we are
# done with all the back props for all the microbatches for the last pipeline stage,
# it will be in the pipeline flush stage. During this pipeline flush we use the
# input activations stored in embedding activation buffer and gradient outputs
# stored in gradient buffer to calculate the weight gradients for the embedding
# final linear layer.
self.embedding_activation_buffer = []
self.grad_output_buffer = []
else:
self.embedding_activation_buffer = None
self.grad_output_buffer = None
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
config=config,
init_method=config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.pre_process
and self.share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
# add mtp
self.mtp_spec: ModuleSpec = mtp_spec
self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
self.recompute_mtp_norm = self.config.recompute_mtp_norm
self.recompute_mtp_layer = self.config.recompute_mtp_layer
self.mtp_loss_scale = self.config.mtp_loss_scale
if self.post_process and self.training and self.num_nextn_predict_layers:
self.mtp_layers = torch.nn.ModuleList(
[
MultiTokenPredictor(
config,
self.mtp_spec.submodules,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
layer_number=i,
pre_process=self.pre_process,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
position_embedding_type=self.position_embedding_type,
rotary_percent=self.rotary_percent,
seq_len_interpolation_factor=seq_len_interpolation_factor,
share_mtp_embedding_and_output_weight=self.share_mtp_embedding_and_output_weight,
recompute_mtp_norm=self.recompute_mtp_norm,
recompute_mtp_layer=self.recompute_mtp_layer,
add_output_layer_bias=False
self.setup_embeddings_and_output_layer()
# add mtp
self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
if self.num_nextn_predict_layers:
assert hasattr(self.config, "mtp_spec")
self.mtp_spec: ModuleSpec = self.config.mtp_spec
self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
self.recompute_mtp_norm = self.config.recompute_mtp_norm
self.recompute_mtp_layer = self.config.recompute_mtp_layer
self.mtp_loss_scale = self.config.mtp_loss_scale
if self.post_process and self.training:
self.mtp_layers = torch.nn.ModuleList(
[
MultiTokenPredictor(
self.config,
self.mtp_spec.submodules,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
layer_number=i,
pre_process=self.pre_process,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
position_embedding_type=self.position_embedding_type,
rotary_percent=self.rotary_percent,
seq_len_interpolation_factor=seq_len_interpolation_factor,
share_mtp_embedding_and_output_weight=self.share_mtp_embedding_and_output_weight,
recompute_mtp_norm=self.recompute_mtp_norm,
recompute_mtp_layer=self.recompute_mtp_layer,
add_output_layer_bias=False
)
for i in range(self.num_nextn_predict_layers)
]
)
for i in range(self.num_nextn_predict_layers)
]
)
if self.pre_process or self.post_process:
self.setup_embeddings_and_output_layer()
if has_config_logger_enabled(self.config):
log_config_to_disk(
self.config, self.state_dict(), prefix=f'{type(self).__name__}_init_ckpt'
)
if self.pre_process or self.post_process:
setup_mtp_embeddings(self)
if self.num_nextn_predict_layers and (self.pre_process or self.post_process):
setup_mtp_embeddings(self)
return wrapper
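The wrapper above follows a common post-init augmentation pattern: run the stock `__init__`, then replace or add attributes. A generic, self-contained sketch of that pattern (names here are illustrative):

```python
from functools import wraps

def init_wrapper(fn):
    @wraps(fn)
    def wrapper(self, *args, **kwargs):
        fn(self, *args, **kwargs)       # run the original __init__
        self.extra_initialized = True   # then bolt on extra state
    return wrapper

class Model:
    def __init__(self, hidden: int):
        self.hidden = hidden

Model.__init__ = init_wrapper(Model.__init__)
assert Model(8).extra_initialized
```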
def shared_embedding_or_mtp_embedding_weight(self) -> Tensor:
......@@ -424,10 +341,10 @@ def gpt_model_forward(
if (
self.num_nextn_predict_layers
and getattr(self.decoder, "final_layernorm", None) is not None
and getattr(self.decoder, "main_final_layernorm", None) is not None
):
# move block main model final norms here
hidden_states = self.decoder.final_layernorm(hidden_states)
hidden_states = self.decoder.main_final_layernorm(hidden_states)
logits, _ = self.output_layer(
hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
......
from .layers import (
parallel_linear_init_wrapper,
ColumnParallelLinearPatch,
RowParallelLinearPatch,
FluxColumnParallelLinear,
FluxRowParallelLinear,
vocab_parallel_embedding_forward,
vocab_parallel_embedding_init,
)
\ No newline at end of file
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
import os
import logging
from dataclasses import dataclass
from typing import Union, Optional, Literal
......@@ -11,6 +11,7 @@ from megatron.core.models.common.embeddings.language_model_embedding import Lang
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.module import MegatronModule
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear
from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
from megatron.core.transformer import ModuleSpec, TransformerConfig, build_module
......@@ -136,18 +137,22 @@ class MultiTokenPredictor(MegatronModule):
self.embedding_activation_buffer = None
self.grad_output_buffer = None
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
config=config,
init_method=config.init_method,
bias=self.add_output_layer_bias,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
if int(os.getenv("USE_FLUX_OVERLAP", "0")):
column_parallel_linear_impl = FluxColumnParallelLinear
else:
column_parallel_linear_impl = tensor_parallel.ColumnParallelLinear
self.output_layer = column_parallel_linear_impl(
self.config.hidden_size,
self.vocab_size,
config=self.config,
init_method=self.config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
)
def forward(
self,
......
from contextlib import nullcontext
from typing import Optional
from functools import wraps
import torch
from torch import Tensor
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.utils import make_viewless_tensor
try:
from megatron.core.extensions.transformer_engine import TEDelayedScaling
HAVE_TE = True
except ImportError:
HAVE_TE = False
def transformer_block_init_wrapper(fn):
@wraps(fn)
......@@ -25,178 +8,8 @@ def transformer_block_init_wrapper(fn):
# MTP requires separate layernorms for the main model and the MTP modules, so the final norm is moved out of the block
config = args[0] if args else kwargs['config']
self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0
if getattr(config, "num_nextn_predict_layers", 0) > 0:
self.main_final_layernorm = self.final_layernorm
self.final_layernorm = None
return wrapper
def transformer_block_forward(
self,
hidden_states: Tensor,
attention_mask: Tensor,
context: Tensor = None,
context_mask: Tensor = None,
rotary_pos_emb: Tensor = None,
rotary_pos_cos: Tensor = None,
rotary_pos_sin: Tensor = None,
attention_bias: Tensor = None,
inference_params: InferenceParams = None,
packed_seq_params: PackedSeqParams = None,
sequence_len_offset: Tensor = None,
):
"""
Perform the forward pass through the transformer block.
This method handles the core computation of the transformer, including
self-attention, optional cross-attention, and feed-forward operations.
Args:
hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the
sequence length, b is the batch size, and h is the hidden size.
attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
self-attention.
context (Tensor, optional): Context tensor for cross-attention.
context_mask (Tensor, optional): Mask for cross-attention context
rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
attention_bias (Tensor): Bias tensor for Q * K.T, in a shape broadcastable
to [b, num_head, sq, skv], e.g. [1, 1, sq, skv].
Used as an alternative to apply attention mask for TE cuDNN attention.
inference_params (InferenceParams, optional): Parameters for inference-time
optimizations.
packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
processing.
Returns:
Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
[s, b, h], and optionally the updated context tensor if cross-attention is used.
"""
if not self.pre_process:
# See set_input_tensor()
hidden_states = self.input_tensor
# Update the inference parameters with the current batch size in case it is variable
if inference_params and not self.training:
inference_params.current_batch_size = hidden_states.size(1)
# Viewless tensor.
# - We only need to create a viewless tensor in the case of micro batch
# size (mbs) == 1, since in this case, 'hidden_states.transpose()'
# above creates a view tensor, and '.contiguous()' is a pass-through.
# For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
# the need to make it viewless.
#
# However, we don't explicitly check mbs == 1 here because
# make_viewless_tensor() has negligible overhead when its input
# is already viewless.
#
# - For the 'else' case above, calling make_viewless_tensor() here is
# likely redundant, since p2p_communication.py (likely originator)
# already creates viewless tensors. That said, make_viewless_tensor()
# is called here to be future-proof and corner-case-proof.
hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
if self.config.sequence_parallel:
rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
else:
rng_context = nullcontext()
if self.config.fp8:
import transformer_engine # To keep out TE dependency when not training in fp8
if self.config.fp8 == "e4m3":
fp8_format = transformer_engine.common.recipe.Format.E4M3
elif self.config.fp8 == "hybrid":
fp8_format = transformer_engine.common.recipe.Format.HYBRID
else:
raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
fp8_recipe = TEDelayedScaling(
config=self.config,
fp8_format=fp8_format,
override_linear_precision=(False, False, not self.config.fp8_wgrad),
)
fp8_group = None
if parallel_state.model_parallel_is_initialized():
fp8_group = parallel_state.get_amax_reduction_group(
with_context_parallel=True, tp_only_amax_red=self.tp_only_amax_red
)
fp8_context = transformer_engine.pytorch.fp8_autocast(
enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
)
else:
fp8_context = nullcontext()
with rng_context, fp8_context:
# Forward pass.
if self.config.recompute_granularity == 'full' and self.training:
hidden_states = self._checkpointed_forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
context=context,
context_mask=context_mask,
rotary_pos_emb=rotary_pos_emb,
attention_bias=attention_bias,
packed_seq_params=packed_seq_params,
)
else:
for l_no, layer in enumerate(self.layers):
with self.offload_context:
layer.use_cudagraph = True
if (len(self.cuda_graphs) == 0) or (not self.training):
hidden_states, context = layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
context=context,
context_mask=context_mask,
rotary_pos_emb=rotary_pos_emb,
rotary_pos_cos=rotary_pos_cos,
rotary_pos_sin=rotary_pos_sin,
attention_bias=attention_bias,
inference_params=inference_params,
packed_seq_params=packed_seq_params,
sequence_len_offset=sequence_len_offset,
)
else:
# CUDA graph replay for layer `l_no` and microbatch
# `self.current_microbatch`. TransformerEngine versions>=1.10
# allow keyword arguments with CUDA graph. However, CUDA graph
# accepts only Tensor inputs and Tensor outputs. Hence,
# `inference_params` and `packed_seq_params` are excluded from
# input list while output is limited to `hidden_states`.
cg_index = self.current_microbatch % len(self.cuda_graphs[l_no])
assert not any(
[inference_params, packed_seq_params]
), "CUDA graph accepts only Tensor inputs."
optional_inputs = self.get_cuda_graph_optional_args(
attention_mask,
context,
context_mask,
rotary_pos_emb,
attention_bias,
inference_params,
packed_seq_params,
)
hidden_states = self.cuda_graphs[l_no][cg_index](
hidden_states, **optional_inputs
)
if (
torch.is_grad_enabled()
and self.config.cpu_offloading
and self.group_prefetch_offload_commit_async is not None
):
hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
# Final layer norm.
if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
hidden_states = self.final_layernorm(hidden_states)
# TENorm produces a "viewed" tensor. This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
# created to prevent this.
hidden_states = make_viewless_tensor(
inp=hidden_states, requires_grad=True, keep_graph=True
)
return hidden_states
......@@ -26,9 +26,6 @@ class ExtraTransformerConfig:
##################
# flux
##################
use_flux: bool = False
"""If set, flux will be used in ColumnParallelLinear and RowParallelLinear"""
flux_transpose_weight: bool = False
......
import torch
from typing import List, Optional, Union
from importlib.metadata import version
from packaging.version import Version as PkgVersion
_flux_version = None
def get_flux_version():
"""Get flux version from __version__; if not available use pip's. Use caching."""
def get_flux_version_str():
import flux
if hasattr(flux, '__version__'):
return str(flux.__version__)
else:
return version("flux")
global _flux_version
if _flux_version is None:
_flux_version = PkgVersion(get_flux_version_str())
return _flux_version
def is_flux_min_version(version, check_equality=True):
"""Check if minimum version of `flux` is installed."""
if check_equality:
return get_flux_version() >= PkgVersion(version)
return get_flux_version() > PkgVersion(version)
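A short usage sketch of the version helpers above; the version string is illustrative:

```python
# Guard an optional code path on the installed flux version.
if is_flux_min_version("1.0.0"):
    use_new_kernel_entry_points = True
else:
    use_new_kernel_entry_points = False
```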
def tensor_slide(
......
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
from torch import nn
class RMSNorm(torch.nn.Module):
def __init__(self,
dim: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
config: dict = None):
"""RMS Normaliation module
Args:
dim (int): The width of input, i.e. hidden size
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
"""
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.ones(dim))
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
@torch.compile(mode="max-autotune-no-cudagraphs")
def forward(self, x):
output = self._norm(x.float()).type_as(x)
return output * self.weight
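A small self-contained check of the norm above against a manual computation; it assumes the RMSNorm class defined here is importable and that a torch.compile backend is available (the forward is compiled):

```python
import torch

x = torch.randn(2, 5, 16)
norm = RMSNorm(dim=16, eps=1e-6)

# Manual RMSNorm: x * rsqrt(mean(x^2) + eps), then scale by the learned weight.
ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6) * norm.weight
assert torch.allclose(norm(x), ref, atol=1e-5)
```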
import torch
from typing import Any, Callable, Dict, Optional, Tuple, Union
import lightop # rmsnorm_forward,rmsnorm_backward
from functools import partial
from megatron.core.utils import is_torch_min_version
if is_torch_min_version("2.4.0a0"):
custom_fwd = partial(torch.amp.custom_fwd, device_type="cuda")
custom_bwd = partial(torch.amp.custom_bwd, device_type="cuda")
else:
custom_fwd = torch.cuda.amp.custom_fwd
custom_bwd = torch.cuda.amp.custom_bwd
class _LightopRMSNorm(torch.autograd.Function):
""" 使用lightop实现rmsnorm"""
@staticmethod
# @custom_fwd
def forward(ctx,
inp: torch.Tensor,
weight: torch.Tensor,
ln_out: torch.Tensor,
eps: float,
is_grad_enabled):
output = lightop.rmsnorm_forward(inp, weight, ln_out, eps, training=True)  # output = (output, rsigma)
rsigma = output[1]
if is_grad_enabled:
ctx.save_for_backward(inp, weight, rsigma)
return output[0]
@staticmethod
# @custom_bwd
def backward(ctx, grad_output):
inp, weight, rsigma = ctx.saved_tensors
dgrad, dgamma = lightop.rmsnorm_backward(grad_output, inp, rsigma, weight)
return dgrad, dgamma, None, None, None
class LightopRMSNorm(torch.nn.Module):
def __init__(self,
dim: int,
eps: float = 1e-6,):
"""RMS Normaliation module
Args:
dim (int): The width of input, i.e. hidden size
eps (float): epsilon to use for the norm, default to 1e-6
"""
super().__init__()
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(dim))
# @no_torch_dynamo()  # dynamically disables torch._dynamo for this forward
def forward(self, inp: torch.Tensor, is_first_microbatch: Optional[bool] = None):
if torch.is_grad_enabled():
fwd_fn = _LightopRMSNorm.apply
args = []
else:
fwd_fn = _LightopRMSNorm.forward
args = [None]
ln_out = torch.empty_like(inp, dtype=inp.dtype, memory_format=torch.contiguous_format)
args += (inp, self.weight, ln_out, self.eps, torch.is_grad_enabled())
out = fwd_fn(*args)
return out
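A hedged, forward-only comparison between the lightop-backed norm and the reference RMSNorm above; it assumes the lightop extension is installed, a ROCm/CUDA device is available, and that both norms keep their default all-ones weights:

```python
import torch

hidden = 4096
x = torch.randn(8, hidden, device="cuda", dtype=torch.float32)

ref = RMSNorm(dim=hidden).cuda()
fast = LightopRMSNorm(dim=hidden).cuda()

with torch.no_grad():
    # The two implementations should agree up to rounding error.
    print(torch.max((ref(x) - fast(x)).abs()))
```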
......@@ -3,14 +3,21 @@ import torch.nn.functional as F
from megatron.training import get_args
from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType
from megatron.core.models.common.embeddings import apply_rotary_pos_emb
from megatron.legacy.model.module import MegatronModule
from megatron.legacy.model.transformer import ParallelMLP
from megatron.legacy.model.utils import (
erf_gelu,
openai_gelu,
)
try:
from einops import rearrange
except ImportError:
rearrange = None
class ParallelMLP(MegatronModule):
class ParallelMLPPatch(MegatronModule):
"""MLP.
MLP will take the input with h hidden state, project it to 4*h
......@@ -74,7 +81,7 @@ class ParallelMLP(MegatronModule):
)
class ParallelAttention(MegatronModule):
class ParallelAttentionPatch(MegatronModule):
"""Parallel self-attention layer abstract class.
Self-attention layer takes input with size [s, b, h]
......
from megatron.training import get_args
from megatron.legacy.model import LayerNorm
from .rms_norm import RMSNorm, LightopRMSNorm
def get_norm(config):
args = get_args()
if args.normalization == "LayerNorm":
return LayerNorm(
config.hidden_size,
eps=config.layernorm_epsilon,
no_persist_layer_norm=not config.persist_layer_norm,
sequence_parallel=config.sequence_parallel,
apply_layernorm_1p=args.apply_layernorm_1p)
elif args.normalization == "RMSNorm":
if args.apply_layernorm_1p:
raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.')
return RMSNorm(dim=config.hidden_size,
eps=config.layernorm_epsilon,
sequence_parallel=config.sequence_parallel)
elif args.normalization == "LightopRMSNorm":
return LightopRMSNorm(dim=config.hidden_size,
eps=config.layernorm_epsilon)
else:
raise Exception(f"unsupported norm type '{args.normalization}'.")
......@@ -51,6 +51,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
# Standard arguments.
parser = _add_network_size_args(parser)
parser = _add_extra_network_size_args(parser)
parser = _add_regularization_args(parser)
parser = _add_training_args(parser)
parser = _add_extra_training_args(parser)
......@@ -106,6 +107,18 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
return args
def _add_extra_network_size_args(parser):
# Remove the original argument
remove_original_params(parser, ["normalization"])
# Redefine the argument
group = parser.add_argument_group(title='extra network size args')
group.add_argument('--normalization', default='LayerNorm',
choices=['LayerNorm', 'RMSNorm', 'LightopRMSNorm'],
help='Which normalization technique to use.')
return parser
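`remove_original_params` is not shown in this diff; a hedged sketch of what such a helper could look like, using argparse internals since argparse has no public API for removing an argument (the real implementation in this repository may differ):

```python
def remove_original_params(parser, dests):
    # Hypothetical helper: drop existing arguments by dest so they can be
    # re-declared with new choices. Relies on argparse internals.
    for action in list(parser._actions):
        if action.dest in dests:
            parser._remove_action(action)
            for opt in action.option_strings:
                parser._option_string_actions.pop(opt, None)
```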
def _add_extra_distributed_args(parser):
group = parser.add_argument_group(title='extra distributed args')
group.add_argument('--rank', default=-1, type=int,
......@@ -169,9 +182,7 @@ def _add_mtp_args(parser):
def _add_flux_args(parser):
group = parser.add_argument_group(title='multi token prediction')
group.add_argument('--use-flux', action='store_true', default=False,
help='If set, flux will be used in ColumnParallelLinear and RowParallelLinear')
group = parser.add_argument_group(title='flux args')
group.add_argument('--flux-transpose-weight', action='store_true', default=False,
help='Whether to transpose weight when using flux kernel')
return parser
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level nccl logs
source /opt/dtk/env.sh
# hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: route multi-stream collectives onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization for profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# capture rocblas sizes
# export ROCBLAS_LAYER=3
# export HIPBLASLT_LOG_LEVEL=3
# capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache
export cache_size_limit=64
# lightop kernel library
export PYTORCH_ROCM_ARCH='gfx906;gfx926;gfx936'
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/redpajama_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 80 #80 #80 #40 # 20 #
--hidden-size 8192
--ffn-hidden-size 22016 # 28672
--num-attention-heads 64
--max-position-embeddings 8192
--group-query-attention
--num-query-groups 8
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights # handle embedding and output weights separately for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # these two options select the core (mcore) path
# --use-mcore-models
# --transformer-impl local # these two options select the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # these two options select the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 512 #32 #512 #256 # 64 #240 #60 #512 #64
--train-iters 300
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
# --no-check-for-nan-in-loss-and-grad
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly in the collective instead of being summed (onto one device) and then averaged
# --recompute-activations
# --recompute-granularity full # enable recomputation: lowers memory use at the cost of extra time
# --recompute-num-layers 1 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM (mcore path)
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with dgrad GEMM (mcore path)
--use-flash-attn
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--context-parallel-size 1
# --num-layers-per-virtual-pipeline-stage 1
# --microbatch-group-size-per-virtual-pipeline-stage 5
# --no-overlap-p2p-communication # when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #8192 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/tokenizer.model
# --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 500
--eval-interval 50
--eval-iters 3
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u ../../pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level nccl logs
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: route multi-stream collectives onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle embedding and output weights separately for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # these two options select the core (mcore) path
# --use-mcore-models
# --transformer-impl local # these two options select the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # these two options select the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly in the collective instead of being summed (onto one device) and then averaged
# --recompute-granularity full # enable recomputation: lowers memory use at the cost of extra time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM, optimization not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with dgrad GEMM
--use-flash-attn
)
# environment variables for torch flash-attention
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level nccl logs
source /opt/dtk/env.sh
# hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
# torch: route multi-stream collectives onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# add synchronization for profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# capture rocblas sizes
# export ROCBLAS_LAYER=3
# capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# increase the compile cache
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 126 #96 #8 # 126
--hidden-size 16384
--ffn-hidden-size 53248
--num-attention-heads 128
--max-position-embeddings 16384
--group-query-attention
--num-query-groups 16
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights # handle embedding and output weights separately for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # these two options select the core (mcore) path
# --use-mcore-models
# --transformer-impl local # these two options select the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl transformer_engine # these two options select the core (mcore) path
--use-mcore-models
--micro-batch-size 1
--global-batch-size 6912 # 252 #32 # 64 #240 #60 #512 #64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly in the collective instead of being summed (onto one device) and then averaged
# --recompute-granularity full # enable recomputation: lowers memory use at the cost of extra time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM, optimization not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with dgrad GEMM, optimization not yet adapted
--use-flash-attn-cutlass
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 18 # 7 layer/gpu
--context-parallel-size 2
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama3Tokenizer
--tokenizer-model /public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model
# --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
${APP}
# case ${LOCAL_RANK} in
# [0])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [1])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=1 --membind=1 ${APP}
# ;;
# [2])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=2 --membind=2 ${APP}
# ;;
# [3])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=3 --membind=3 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [4])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=4 --membind=4 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=5 --membind=5 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=6 --membind=6 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=7 --membind=7 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# esac
\ No newline at end of file