evt_fugx1 / dcu_megatron / Commits / d05234e0

Commit d05234e0, authored Mar 25, 2025 by dongcl

init dcu_megatron

Parent: f8eedf6b
Showing 19 changed files with 3431 additions and 0 deletions (+3431, -0)
dcu_megatron/__init__.py  +1  -0
dcu_megatron/adaptor/megatron_adaptor.py  +169  -0
dcu_megatron/adaptor/patch_utils.py  +120  -0
dcu_megatron/core/__init__.py  +2  -0
dcu_megatron/core/distributed/finalize_model_grads.py  +49  -0
dcu_megatron/core/models/common/embeddings/language_model_embedding.py  +135  -0
dcu_megatron/core/models/gpt/gpt_model.py  +450  -0
dcu_megatron/core/tensor_parallel/layers.py  +126  -0
dcu_megatron/core/tensor_parallel/random.py  +104  -0
dcu_megatron/core/transformer/mtp/mtp_spec.py  +51  -0
dcu_megatron/core/transformer/mtp/multi_token_predictor.py  +280  -0
dcu_megatron/core/transformer/transformer_block.py  +202  -0
dcu_megatron/core/transformer/transformer_config.py  +948  -0
dcu_megatron/core/utils.py  +33  -0
dcu_megatron/training/arguments.py  +139  -0
dcu_megatron/training/tokenizer/__init__.py  +1  -0
dcu_megatron/training/tokenizer/tokenizer.py  +166  -0
dcu_megatron/training/utils.py  +111  -0
pretrain_gpt.py  +344  -0
dcu_megatron/__init__.py (new file, mode 100644)

from .adaptor import megatron_adaptor
dcu_megatron/adaptor/megatron_adaptor.py (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import sys
import types
import argparse

import torch


class MegatronAdaptation:
    """
    A module manager that supports adaptation registration, application and execution.
    """
    _patch_info_collection = {}
    _args = None

    @classmethod
    def execute(cls):
        """Execute adaptations."""
        for adaptation in [CoreAdaptation(), LegacyAdaptation()]:
            adaptation.execute()
        MegatronAdaptation.apply()
        # MegatronAdaptation.post_execute()

    @classmethod
    def register(cls, orig_func_name, new_func=None, force_patch=False, create_dummy=False):
        """Register adaptations into the collection."""
        if orig_func_name not in cls._patch_info_collection:
            from .patch_utils import Patch
            cls._patch_info_collection[orig_func_name] = Patch(orig_func_name, new_func, create_dummy)
        else:
            cls._patch_info_collection.get(orig_func_name).set_patch_func(new_func, force_patch)

    @classmethod
    def apply(cls):
        """Apply adaptations."""
        for patch in cls._patch_info_collection.values():
            patch.apply_patch()

    @classmethod
    def post_execute(cls):
        """Execute after other adaptations."""
        from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear
        from megatron.core.transformer.transformer_block import TransformerBlock


class MegatronAdaptationABC:
    """Abstract class for adaptation."""

    @abc.abstractmethod
    def execute(self):
        """Do adaptation."""


class CoreAdaptation(MegatronAdaptationABC):
    """Adaptations for models in Megatron-LM Core structure."""

    def execute(self):
        self.patch_core_distributed()
        self.patch_core_models()
        self.patch_core_transformers()
        self.patch_tensor_parallel()
        self.patch_training()
        self.patch_miscellaneous()

    def patch_core_distributed(self):
        # Mtp share embedding
        from ..core.distributed.finalize_model_grads import _allreduce_word_embedding_grads
        MegatronAdaptation.register(
            'megatron.core.distributed.finalize_model_grads._allreduce_word_embedding_grads',
            _allreduce_word_embedding_grads)

    def patch_core_models(self):
        from ..core.models.common.embeddings.language_model_embedding import (
            language_model_embedding_forward,
            language_model_embedding_init_func,
        )
        from ..core.models.gpt.gpt_model import (
            gpt_model_forward,
            gpt_model_init,
            shared_embedding_or_mtp_embedding_weight,
        )

        # Embedding
        MegatronAdaptation.register(
            'megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.__init__',
            language_model_embedding_init_func)
        MegatronAdaptation.register(
            'megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.forward',
            language_model_embedding_forward)

        # GPT Model
        MegatronAdaptation.register(
            'megatron.core.models.gpt.gpt_model.GPTModel.forward',
            gpt_model_forward)
        MegatronAdaptation.register(
            'megatron.core.models.gpt.gpt_model.GPTModel.__init__',
            gpt_model_init)

        from megatron.core.models.gpt.gpt_model import GPTModel
        setattr(GPTModel, 'shared_embedding_or_mtp_embedding_weight', shared_embedding_or_mtp_embedding_weight)

    def patch_core_transformers(self):
        from ..core import transformer_block_init_wrapper, transformer_block_forward
        from ..core.transformer.transformer_config import TransformerConfig, MLATransformerConfig

        # Transformer block
        MegatronAdaptation.register(
            'megatron.core.transformer.transformer_block.TransformerBlock.__init__',
            transformer_block_init_wrapper)
        MegatronAdaptation.register(
            'megatron.core.transformer.transformer_block.TransformerBlock.forward',
            transformer_block_forward)

        # Transformer config
        MegatronAdaptation.register(
            'megatron.core.transformer.transformer_config.TransformerConfig',
            TransformerConfig)
        MegatronAdaptation.register(
            'megatron.core.transformer.transformer_config.MLATransformerConfig',
            MLATransformerConfig)

    def patch_tensor_parallel(self):
        from ..core import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
        MegatronAdaptation.register(
            'megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
            vocab_parallel_embedding_forward)
        MegatronAdaptation.register(
            'megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__',
            vocab_parallel_embedding_init)

    def patch_training(self):
        from ..training.tokenizer import build_tokenizer
        MegatronAdaptation.register(
            'megatron.training.tokenizer.tokenizer.build_tokenizer',
            build_tokenizer)

    def patch_miscellaneous(self):
        from ..training.arguments import parse_args
        MegatronAdaptation.register(
            'megatron.training.arguments.parse_args',
            parse_args)


class LegacyAdaptation(MegatronAdaptationABC):
    """Adaptations for models in legacy structure."""

    def execute(self):
        pass


MegatronAdaptation.execute()
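The registry above is driven entirely by dotted path strings: register() records a replacement, apply() rewrites the attribute and any modules that re-export it. A minimal, illustrative sketch of that pattern (not part of this commit; `toy_pkg` and `fast_gelu` are hypothetical names, and it assumes Megatron-LM and this package are importable so the module-level MegatronAdaptation.execute() call succeeds):

import math
import sys
import types

from dcu_megatron.adaptor.megatron_adaptor import MegatronAdaptation

# Stand-in for a third-party module whose function we want to replace.
toy_pkg = types.ModuleType('toy_pkg')
toy_pkg.gelu = lambda x: 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))
sys.modules['toy_pkg'] = toy_pkg


def fast_gelu(x):
    # tanh approximation; only here so the patch is observable
    return 0.5 * x * (1.0 + math.tanh(0.7978845608 * (x + 0.044715 * x ** 3)))


MegatronAdaptation.register('toy_pkg.gelu', fast_gelu)
MegatronAdaptation.apply()

import toy_pkg  # noqa: E402
assert toy_pkg.gelu is fast_gelu  # the dotted path now resolves to the replacement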
dcu_megatron/adaptor/patch_utils.py (new file, mode 100644)

import importlib
import sys
import types


def get_func_name(func):
    if isinstance(func, str):
        return func
    return '.'.join((func.__module__, func.__qualname__))


def dummy_function_wrapper(func_name):
    def dummy_function(*args, **kwargs):
        raise RuntimeError('function {} does not exist'.format(func_name))

    return dummy_function


class Patch:
    def __init__(self, orig_func_name, new_func, create_dummy):
        split_name = orig_func_name.rsplit('.', 1)
        if len(split_name) == 1:
            self.orig_module_name, self.orig_func_name = orig_func_name, None
        else:
            self.orig_module_name, self.orig_func_name = split_name
        self.orig_module = None
        self.orig_func = None

        self.patch_func = None
        self.wrappers = []
        if new_func is None:
            new_func = dummy_function_wrapper(orig_func_name)
        self.set_patch_func(new_func)
        self.is_applied = False
        self.create_dummy = create_dummy

    @property
    def orig_func_id(self):
        return id(self.orig_func)

    @property
    def patch_func_id(self):
        return id(self.patch_func)

    def set_patch_func(self, new_func, force_patch=False):
        if hasattr(new_func, '__name__') and new_func.__name__.endswith(('wrapper', 'decorator')):
            self.wrappers.append(new_func)
        else:
            if self.patch_func and not force_patch:
                raise RuntimeError('the patch of {} already exists!'.format(self.orig_func_name))
            self.patch_func = new_func
        self.is_applied = False

    def apply_patch(self):
        if self.is_applied:
            return

        self.orig_module, self.orig_func = Patch.parse_path(
            self.orig_module_name, self.orig_func_name, self.create_dummy)

        final_patch_func = self.orig_func
        if self.patch_func is not None:
            final_patch_func = self.patch_func
        for wrapper in self.wrappers:
            final_patch_func = wrapper(final_patch_func)

        if self.orig_func_name is not None:
            setattr(self.orig_module, self.orig_func_name, final_patch_func)
        for key, value in sys.modules.copy().items():
            if self.orig_func_name is not None and hasattr(value, self.orig_func_name) \
                    and id(getattr(value, self.orig_func_name)) == self.orig_func_id:
                setattr(value, self.orig_func_name, final_patch_func)

        self.is_applied = True

    @staticmethod
    def parse_path(module_path, function_name, create_dummy):
        from importlib.machinery import ModuleSpec
        modules = module_path.split('.')
        for i in range(1, len(modules) + 1):
            parent = '.'.join(modules[:i - 1])
            path = '.'.join(modules[:i])

            try:
                importlib.import_module(path)
            except ModuleNotFoundError as e:
                if not parent or not hasattr(importlib.import_module(parent), modules[i - 1]):
                    if not create_dummy:
                        raise ModuleNotFoundError(e) from e
                    sys.modules[path] = types.ModuleType(path)
                    sys.modules[path].__file__ = 'dcu_megatron.dummy_module.py'
                    sys.modules[path].__spec__ = ModuleSpec(path, None)
                    if parent:
                        setattr(importlib.import_module(parent), modules[i - 1], sys.modules[path])
                else:
                    module = getattr(importlib.import_module(parent), modules[i - 1])
                    if hasattr(module, function_name):
                        return module, getattr(module, function_name)
                    elif create_dummy:
                        return module, dummy_function_wrapper(function_name)
                    else:
                        raise RuntimeError('{} does not exist in {}'.format(function_name, module))

        if function_name is not None and not hasattr(sys.modules[module_path], function_name):
            setattr(sys.modules[module_path], function_name, None)
        return sys.modules[module_path], getattr(
            sys.modules[module_path], function_name) if function_name is not None else None


class MegatronPatchesManager:
    patches_info = {}

    @staticmethod
    def register_patch(orig_func_name, new_func=None, force_patch=False, create_dummy=False):
        if orig_func_name not in MegatronPatchesManager.patches_info:
            MegatronPatchesManager.patches_info[orig_func_name] = Patch(orig_func_name, new_func, create_dummy)
        else:
            MegatronPatchesManager.patches_info.get(orig_func_name).set_patch_func(new_func, force_patch)

    @staticmethod
    def apply_patches():
        for patch in MegatronPatchesManager.patches_info.values():
            patch.apply_patch()
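One detail worth noting in Patch.set_patch_func: a callable whose __name__ ends in 'wrapper' or 'decorator' is not a replacement but a wrapper applied around the original (or around the registered replacement). A small illustrative sketch of that path, using the standard library only (it patches json.dumps in the running interpreter, so it is a demonstration, not production code):

import json

from dcu_megatron.adaptor.patch_utils import Patch


def compact_dumps_wrapper(orig_fn):
    # Name ends with 'wrapper', so Patch keeps json.dumps and wraps it.
    def inner(obj, **kwargs):
        kwargs.setdefault('separators', (',', ':'))
        return orig_fn(obj, **kwargs)
    return inner


patch = Patch('json.dumps', compact_dumps_wrapper, create_dummy=False)
patch.apply_patch()

assert json.dumps({'a': 1}) == '{"a":1}'  # the wrapped original is now in place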
dcu_megatron/core/__init__.py (new file, mode 100644)

from .tensor_parallel.layers import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
from .transformer.transformer_block import transformer_block_init_wrapper, transformer_block_forward
dcu_megatron/core/distributed/finalize_model_grads.py (new file, mode 100644)

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.

from typing import List

import torch

from megatron.core import parallel_state
from megatron.core.distributed.finalize_model_grads import _unshard_if_dtensor, _reshard_if_dtensor
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import get_attr_wrapped_model


def _allreduce_word_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig):
    """
    All-reduce word embedding grads.

    Reduce grads across first and last stages to ensure that word_embeddings parameters stay in sync.
    """
    if (
        parallel_state.is_rank_in_embedding_group(ignore_virtual=True)
        and torch.distributed.get_world_size(parallel_state.get_embedding_group()) > 1
    ):
        if parallel_state.is_pipeline_first_stage(ignore_virtual=True):
            model_module = model[0]
        elif parallel_state.is_pipeline_last_stage(ignore_virtual=True):
            model_module = model[-1]
        else:
            # We do not support an interleaved schedule for models with encoders yet.
            model_module = model[0]

        model_module = get_attr_wrapped_model(model_module, 'pre_process', return_model_obj=True)
        if model_module.share_embeddings_and_output_weights:
            weight = model_module.shared_embedding_or_output_weight()
            grad_attr = "main_grad" if hasattr(weight, "main_grad") else "grad"
            orig_grad = getattr(weight, grad_attr)
            grad = _unshard_if_dtensor(orig_grad)
            torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group())
            setattr(weight, grad_attr, _reshard_if_dtensor(grad, orig_grad))

        if hasattr(model_module, "share_mtp_embedding_and_output_weight") \
                and model_module.share_mtp_embedding_and_output_weight:
            weight = model_module.shared_embedding_or_mtp_embedding_weight()
            grad_attr = "main_grad" if hasattr(weight, "main_grad") else "grad"
            orig_grad = getattr(weight, grad_attr)
            grad = _unshard_if_dtensor(orig_grad)
            torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group())
            setattr(weight, grad_attr, _reshard_if_dtensor(grad, orig_grad))
dcu_megatron/core/models/common/embeddings/language_model_embedding.py (new file, mode 100644)

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.

from typing import Literal

import torch
from torch import Tensor

from megatron.core import tensor_parallel
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding


def language_model_embedding_init_func(
    self,
    config: TransformerConfig,
    vocab_size: int,
    max_sequence_length: int,
    position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
    num_tokentypes: int = 0,
    scatter_to_sequence_parallel: bool = True,
    skip_weight_param_allocation: bool = False,
):
    """Patch language model embedding init."""
    super(LanguageModelEmbedding, self).__init__(config=config)

    self.config: TransformerConfig = config
    self.vocab_size: int = vocab_size
    self.max_sequence_length: int = max_sequence_length
    self.add_position_embedding: bool = position_embedding_type == 'learned_absolute'
    self.num_tokentypes = num_tokentypes
    self.scatter_to_sequence_parallel = scatter_to_sequence_parallel
    self.reduce_scatter_embeddings = (
        (not self.add_position_embedding)
        and self.num_tokentypes <= 0
        and self.config.sequence_parallel
        and self.scatter_to_sequence_parallel
    )

    # Word embeddings (parallel).
    self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
        num_embeddings=self.vocab_size,
        embedding_dim=self.config.hidden_size,
        init_method=self.config.init_method,
        reduce_scatter_embeddings=self.reduce_scatter_embeddings,
        config=self.config,
        skip_weight_param_allocation=skip_weight_param_allocation,
    )

    # Position embedding (serial).
    if self.add_position_embedding:
        self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size)

        # Initialize the position embeddings.
        if self.config.perform_initialization:
            self.config.init_method(self.position_embeddings.weight)

    if self.num_tokentypes > 0:
        self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.config.hidden_size)
        # Initialize the token-type embeddings.
        if self.config.perform_initialization:
            self.config.init_method(self.tokentype_embeddings.weight)
    else:
        self.tokentype_embeddings = None

    # Embeddings dropout
    self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)


def language_model_embedding_forward(
    self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = None, weight: Tensor = None
) -> Tensor:
    """Patch forward pass of the embedding module.

    Args:
        input_ids (Tensor): The input tokens
        position_ids (Tensor): The position ids used to calculate position embeddings
        tokentype_ids (int): The token type ids. Used when args.bert_binary_head is set to True. Defaults to None
        weight (Tensor): embedding weight

    Returns:
        Tensor: The output embeddings
    """
    if weight is None:
        if self.word_embeddings.weight is None:
            raise RuntimeError(
                "weight was not supplied to VocabParallelEmbedding forward pass "
                "and skip_weight_param_allocation is True."
            )
        weight = self.word_embeddings.weight

    word_embeddings = self.word_embeddings(input_ids, weight)
    if self.add_position_embedding:
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = word_embeddings + position_embeddings
    else:
        embeddings = word_embeddings

    if not self.reduce_scatter_embeddings:
        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()

    if tokentype_ids is not None:
        assert self.tokentype_embeddings is not None
        # [b s h] -> [s b h] (so that it can be added with embeddings)
        tokentype_embedding = self.tokentype_embeddings(tokentype_ids).permute(1, 0, 2)
        embeddings = embeddings + tokentype_embedding
    else:
        assert self.tokentype_embeddings is None

    # If the input flag for fp32 residual connection is set, convert for float.
    if self.config.fp32_residual_connection:
        embeddings = embeddings.float()

    # Dropout.
    if self.config.sequence_parallel:
        if not self.reduce_scatter_embeddings and self.scatter_to_sequence_parallel:
            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
        # `scatter_to_sequence_parallel_region` returns a view, which prevents
        # the original tensor from being garbage collected. Clone to facilitate GC.
        # Has a small runtime cost (~0.5%).
        if self.config.clone_scatter_output_in_embedding and self.scatter_to_sequence_parallel:
            embeddings = embeddings.clone()
        with tensor_parallel.get_cuda_rng_tracker().fork():
            embeddings = self.embedding_dropout(embeddings)
    else:
        embeddings = self.embedding_dropout(embeddings)

    return embeddings
dcu_megatron/core/models/gpt/gpt_model.py (new file, mode 100644)

import logging
from typing import Literal, Optional
from functools import wraps
from collections import OrderedDict

import torch
from torch import Tensor

from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.common.language_module.language_module import LanguageModule
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock

from dcu_megatron.core.utils import tensor_slide
from dcu_megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor
from dcu_megatron.core.transformer.transformer_config import TransformerConfig


def gpt_model_init(
    self,
    config: TransformerConfig,
    transformer_layer_spec: ModuleSpec,
    vocab_size: int,
    max_sequence_length: int,
    pre_process: bool = True,
    post_process: bool = True,
    fp16_lm_cross_entropy: bool = False,
    parallel_output: bool = True,
    share_embeddings_and_output_weights: bool = False,
    position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
    rotary_percent: float = 1.0,
    rotary_base: int = 10000,
    rope_scaling: bool = False,
    scatter_embedding_sequence_parallel: bool = True,
    seq_len_interpolation_factor: Optional[float] = None,
    mtp_spec: ModuleSpec = None,
) -> None:
    super(GPTModel, self).__init__(config=config)

    if has_config_logger_enabled(config):
        log_config_to_disk(config, locals(), prefix=type(self).__name__)

    self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
    self.vocab_size = vocab_size
    self.max_sequence_length = max_sequence_length
    self.pre_process = pre_process
    self.post_process = post_process
    self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
    self.parallel_output = parallel_output
    self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
    self.position_embedding_type = position_embedding_type

    # megatron core pipelining currently depends on model type
    # TODO: remove this dependency ?
    self.model_type = ModelType.encoder_or_decoder

    # These 4 attributes are needed for TensorRT-LLM export.
    self.max_position_embeddings = max_sequence_length
    self.rotary_percent = rotary_percent
    self.rotary_base = rotary_base
    self.rotary_scaling = rope_scaling

    if self.pre_process:
        self.embedding = LanguageModelEmbedding(
            config=self.config,
            vocab_size=self.vocab_size,
            max_sequence_length=self.max_sequence_length,
            position_embedding_type=position_embedding_type,
            scatter_to_sequence_parallel=scatter_embedding_sequence_parallel,
        )

    if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
        self.rotary_pos_emb = RotaryEmbedding(
            kv_channels=self.config.kv_channels,
            rotary_percent=rotary_percent,
            rotary_interleaved=self.config.rotary_interleaved,
            seq_len_interpolation_factor=seq_len_interpolation_factor,
            rotary_base=rotary_base,
            rope_scaling=rope_scaling,
            use_cpu_initialization=self.config.use_cpu_initialization,
        )

    # Transformer.
    self.decoder = TransformerBlock(
        config=self.config,
        spec=transformer_layer_spec,
        pre_process=self.pre_process,
        post_process=self.post_process,
    )

    # Output
    if post_process:
        if self.config.defer_embedding_wgrad_compute:
            # The embedding activation buffer preserves a reference to the input activations
            # of the final embedding projection layer GEMM. It will hold the activations for
            # all the micro-batches of a global batch for the last pipeline stage. Once we are
            # done with all the back props for all the microbatches for the last pipeline stage,
            # it will be in the pipeline flush stage. During this pipeline flush we use the
            # input activations stored in embedding activation buffer and gradient outputs
            # stored in gradient buffer to calculate the weight gradients for the embedding
            # final linear layer.
            self.embedding_activation_buffer = []
            self.grad_output_buffer = []
        else:
            self.embedding_activation_buffer = None
            self.grad_output_buffer = None

        self.output_layer = tensor_parallel.ColumnParallelLinear(
            config.hidden_size,
            self.vocab_size,
            config=config,
            init_method=config.init_method,
            bias=False,
            skip_bias_add=False,
            gather_output=not self.parallel_output,
            skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights,
            embedding_activation_buffer=self.embedding_activation_buffer,
            grad_output_buffer=self.grad_output_buffer,
        )

    # add mtp
    self.mtp_spec: ModuleSpec = mtp_spec
    self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
    self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
    self.recompute_mtp_norm = self.config.recompute_mtp_norm
    self.recompute_mtp_layer = self.config.recompute_mtp_layer
    self.mtp_loss_scale = self.config.mtp_loss_scale
    if self.post_process and self.training and self.num_nextn_predict_layers:
        self.mtp_layers = torch.nn.ModuleList(
            [
                MultiTokenPredictor(
                    config,
                    self.mtp_spec.submodules,
                    vocab_size=self.vocab_size,
                    max_sequence_length=self.max_sequence_length,
                    layer_number=i,
                    pre_process=self.pre_process,
                    fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
                    parallel_output=self.parallel_output,
                    position_embedding_type=self.position_embedding_type,
                    rotary_percent=self.rotary_percent,
                    seq_len_interpolation_factor=seq_len_interpolation_factor,
                    share_mtp_embedding_and_output_weight=self.share_mtp_embedding_and_output_weight,
                    recompute_mtp_norm=self.recompute_mtp_norm,
                    recompute_mtp_layer=self.recompute_mtp_layer,
                    add_output_layer_bias=False,
                )
                for i in range(self.num_nextn_predict_layers)
            ]
        )

    if self.pre_process or self.post_process:
        self.setup_embeddings_and_output_layer()

    if has_config_logger_enabled(self.config):
        log_config_to_disk(self.config, self.state_dict(), prefix=f'{type(self).__name__}_init_ckpt')

    if self.num_nextn_predict_layers and (self.pre_process or self.post_process):
        setup_mtp_embeddings(self)

def shared_embedding_or_mtp_embedding_weight(self) -> Tensor:
    """Gets the embedding weight when sharing of embedding and mtp embedding weights is set to True.

    Returns:
        Tensor: During pre processing it returns the input embeddings weight, while during
        post processing it returns the mtp embedding layers weight.
    """
    assert self.num_nextn_predict_layers > 0
    if self.pre_process:
        return self.embedding.word_embeddings.weight
    elif self.post_process:
        return self.mtp_layers[0].embedding.word_embeddings.weight
    return None


def setup_mtp_embeddings(self):
    """
    Share the embedding layer with the mtp layers.
    """
    if self.pre_process:
        self.embedding.word_embeddings.weight.is_embedding_or_output_parameter = True

    # Set `is_embedding_or_output_parameter` attribute.
    for i in range(self.num_nextn_predict_layers):
        if self.post_process and self.mtp_layers[i].embedding.word_embeddings.weight is not None:
            self.mtp_layers[i].embedding.word_embeddings.weight.is_embedding_or_output_parameter = True

    if not self.share_mtp_embedding_and_output_weight:
        return

    if self.pre_process and self.post_process:
        # Zero out wgrad if sharing embeddings between two layers on same
        # pipeline stage to make sure grad accumulation into main_grad is
        # correct and does not include garbage values (e.g., from torch.empty).
        self.shared_embedding_or_mtp_embedding_weight().zero_out_wgrad = True
        return

    if self.pre_process and not self.post_process:
        assert parallel_state.is_pipeline_first_stage()
        self.shared_embedding_or_mtp_embedding_weight().shared_embedding = True

    if self.post_process and not self.pre_process:
        assert not parallel_state.is_pipeline_first_stage()
        for i in range(self.num_nextn_predict_layers):
            # set word_embeddings weights to 0 here, then copy first
            # stage's weights using all_reduce below.
            self.mtp_layers[i].embedding.word_embeddings.weight.data.fill_(0)
            self.mtp_layers[i].embedding.word_embeddings.weight.shared = True
            self.mtp_layers[i].embedding.word_embeddings.weight.shared_embedding = True

    # Parameters are shared between the word embeddings layers, and the
    # heads at the end of the model. In a pipelined setup with more than
    # one stage, the initial embedding layer and the head are on different
    # workers, so we do the following:
    # 1. Create a second copy of word_embeddings on the last stage, with
    #    initial parameters of 0.0.
    # 2. Do an all-reduce between the first and last stage to ensure that
    #    the two copies of word_embeddings start off with the same
    #    parameter values.
    # 3. In the training loop, before an all-reduce between the grads of
    #    the two word_embeddings layers to ensure that every applied weight
    #    update is the same on both stages.

    # Ensure that first and last stages have the same initial parameter values.
    if torch.distributed.is_initialized():
        if parallel_state.is_rank_in_embedding_group():
            weight = self.shared_embedding_or_mtp_embedding_weight()
            weight.data = weight.data.cuda()
            torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group())

    elif not getattr(LanguageModule, "embedding_warning_printed", False):
        logging.getLogger(__name__).warning(
            "Distributed processes aren't initialized, so the output layer "
            "is not initialized with weights from the word embeddings. "
            "If you are just manipulating a model this is fine, but "
            "this needs to be handled manually. If you are training "
            "something is definitely wrong."
        )
        LanguageModule.embedding_warning_printed = True


def slice_inputs(self, input_ids, labels, position_ids, attention_mask):
    if self.num_nextn_predict_layers == 0:
        return (
            [input_ids],
            [labels],
            [position_ids],
            [attention_mask],
        )

    return (
        tensor_slide(input_ids, self.num_nextn_predict_layers),
        tensor_slide(labels, self.num_nextn_predict_layers),
        generate_nextn_position_ids(position_ids, self.num_nextn_predict_layers),
        # not compatible with ppo attn_mask
        tensor_slide(attention_mask, self.num_nextn_predict_layers, dims=[-2, -1]),
    )


def generate_nextn_position_ids(tensor, slice_num):
    slides = tensor_slide(tensor, slice_num)
    if slides[0] is None:
        return slides
    for idx in range(1, len(slides)):
        slides[idx] = regenerate_position_ids(slides[idx], idx)
    return slides


def regenerate_position_ids(tensor, offset):
    if tensor is None:
        return None
    tensor = tensor.clone()
    for i in range(tensor.size(0)):
        row = tensor[i]
        zero_mask = (row == 0)
        # Case where two sequences are concatenated (packed) in one row
        if zero_mask.any():
            first_zero_idx = torch.argmax(zero_mask.int()).item()
            tensor[i, :first_zero_idx] = torch.arange(first_zero_idx)
        else:
            tensor[i] = tensor[i] - offset
    return tensor
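A tiny worked example of the position-id regeneration above (illustrative only; it assumes regenerate_position_ids from this file is importable, and that tensor_slide has already produced the shifted windows it operates on):

import torch

# Two packed sequences, window shifted right by one for the first MTP head.
pos = torch.tensor([[1, 2, 3, 0, 1, 2, 3]])
print(regenerate_position_ids(pos, offset=1))
# tensor([[0, 1, 2, 0, 1, 2, 3]])  -> positions restart from 0 in the shifted window

# Single sequence (no zero in the row): simply subtract the offset.
pos_single = torch.tensor([[1, 2, 3, 4]])
print(regenerate_position_ids(pos_single, offset=1))
# tensor([[0, 1, 2, 3]])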

def gpt_model_forward(
    self,
    input_ids: Tensor,
    position_ids: Tensor,
    attention_mask: Tensor,
    decoder_input: Tensor = None,
    labels: Tensor = None,
    inference_params: InferenceParams = None,
    packed_seq_params: PackedSeqParams = None,
    extra_block_kwargs: dict = None,
    runtime_gather_output: Optional[bool] = None,
) -> Tensor:
    """Forward function of the GPT Model. This function passes the input tensors
    through the embedding layer, then the decoder, and finally into the post
    processing layer (optional).

    It either returns the loss values if labels are given or the final hidden units.

    Args:
        runtime_gather_output (bool): Gather output at runtime. Default None means
            `parallel_output` arg in the constructor will be used.
    """
    # If decoder_input is provided (not None), then input_ids and position_ids are ignored.
    # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input.

    # generate inputs for main and mtps
    input_ids, labels, position_ids, attention_mask = slice_inputs(
        self, input_ids, labels, position_ids, attention_mask)

    # Decoder embedding.
    if decoder_input is not None:
        pass
    elif self.pre_process:
        decoder_input = self.embedding(input_ids=input_ids[0], position_ids=position_ids[0])
    else:
        # intermediate stage of pipeline
        # decoder will get hidden_states from encoder.input_tensor
        decoder_input = None

    # Rotary positional embeddings (embedding is None for PP intermediate devices)
    rotary_pos_emb = None
    rotary_pos_cos = None
    rotary_pos_sin = None
    if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
        if not self.training and self.config.flash_decode and inference_params:
            # Flash decoding uses precomputed cos and sin for RoPE
            rotary_pos_cos, rotary_pos_sin = self.rotary_pos_emb_cache.setdefault(
                inference_params.max_sequence_length,
                self.rotary_pos_emb.get_cos_sin(inference_params.max_sequence_length),
            )
        else:
            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
                inference_params, self.decoder, decoder_input, self.config, packed_seq_params
            )
            rotary_pos_emb = self.rotary_pos_emb(
                rotary_seq_len,
                packed_seq=packed_seq_params is not None and packed_seq_params.qkv_format == 'thd',
            )
    if (
        (self.config.enable_cuda_graph or self.config.flash_decode)
        and rotary_pos_cos is not None
        and inference_params
    ):
        sequence_len_offset = torch.tensor(
            [inference_params.sequence_len_offset] * inference_params.current_batch_size,
            dtype=torch.int32,
            device=rotary_pos_cos.device,  # Co-locate this with the rotary tensors
        )
    else:
        sequence_len_offset = None

    # Run decoder.
    hidden_states = self.decoder(
        hidden_states=decoder_input,
        attention_mask=attention_mask[0],
        inference_params=inference_params,
        rotary_pos_emb=rotary_pos_emb,
        rotary_pos_cos=rotary_pos_cos,
        rotary_pos_sin=rotary_pos_sin,
        packed_seq_params=packed_seq_params,
        sequence_len_offset=sequence_len_offset,
        **(extra_block_kwargs or {}),
    )

    if not self.post_process:
        return hidden_states

    # logits and loss
    output_weight = None
    if self.share_embeddings_and_output_weights:
        output_weight = self.shared_embedding_or_output_weight()

    loss = 0
    # Multi token prediction module
    if self.num_nextn_predict_layers and self.training:
        if not self.share_embeddings_and_output_weights and self.share_mtp_embedding_and_output_weight:
            output_weight = self.output_layer.weight
            output_weight.zero_out_wgrad = True

        embedding_weight = self.shared_embedding_or_mtp_embedding_weight() \
            if self.share_mtp_embedding_and_output_weight else None
        mtp_hidden_states = hidden_states
        for i in range(self.num_nextn_predict_layers):
            mtp_hidden_states, mtp_loss = self.mtp_layers[i](
                mtp_hidden_states,  # [s, b, h]
                input_ids[i + 1],
                position_ids[i + 1] if position_ids[0] is not None else None,
                attention_mask[i + 1] if attention_mask[0] is not None else None,
                labels[i + 1] if labels[0] is not None else None,
                inference_params,
                packed_seq_params,
                extra_block_kwargs,
                embeding_weight=embedding_weight,
                output_weight=output_weight,
            )

            loss += self.mtp_loss_scale / self.num_nextn_predict_layers * mtp_loss

    if (
        self.num_nextn_predict_layers
        and getattr(self.decoder, "final_layernorm", None) is not None
    ):
        # move block main model final norms here
        hidden_states = self.decoder.final_layernorm(hidden_states)

    logits, _ = self.output_layer(
        hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
    )

    if has_config_logger_enabled(self.config):
        payload = OrderedDict(
            {
                'input_ids': input_ids[0],
                'position_ids': position_ids[0],
                'attention_mask': attention_mask[0],
                'decoder_input': decoder_input,
                'logits': logits,
            }
        )
        log_config_to_disk(self.config, payload, prefix='input_and_logits')

    if labels[0] is None:
        # [s b h] => [b s h]
        return logits.transpose(0, 1).contiguous()

    loss += self.compute_language_model_loss(labels[0], logits)

    return loss
dcu_megatron/core/tensor_parallel/layers.py (new file, mode 100644)

from typing import Callable

import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter

from megatron.core.model_parallel_config import ModelParallelConfig
from megatron.core.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from megatron.core.utils import is_torch_min_version
from megatron.core.tensor_parallel.layers import (
    _initialize_affine_weight_cpu,
    _initialize_affine_weight_gpu,
    VocabParallelEmbedding,
)
from megatron.core.tensor_parallel.mappings import (
    reduce_from_tensor_model_parallel_region,
    reduce_scatter_to_sequence_parallel_region,
)
from megatron.core.tensor_parallel.utils import VocabUtility


def vocab_parallel_embedding_init(
    self,
    num_embeddings: int,
    embedding_dim: int,
    *,
    init_method: Callable,
    reduce_scatter_embeddings: bool = False,
    config: ModelParallelConfig,
    skip_weight_param_allocation: bool = False,
):
    super(VocabParallelEmbedding, self).__init__()
    # Keep the input dimensions.
    self.num_embeddings = num_embeddings
    self.embedding_dim = embedding_dim
    self.reduce_scatter_embeddings = reduce_scatter_embeddings
    self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
    # Divide the weight matrix along the vocabulary dimension.
    (self.vocab_start_index, self.vocab_end_index) = (
        VocabUtility.vocab_range_from_global_vocab_size(
            self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size
        )
    )
    self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
    self.deterministic_mode = config.deterministic_mode

    # Allocate weights and initialize.
    if not skip_weight_param_allocation:
        if config.use_cpu_initialization:
            self.weight = Parameter(
                torch.empty(
                    self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype
                )
            )
            if config.perform_initialization:
                _initialize_affine_weight_cpu(
                    self.weight,
                    self.num_embeddings,
                    self.embedding_dim,
                    self.num_embeddings_per_partition,
                    0,
                    init_method,
                    params_dtype=config.params_dtype,
                )
        else:
            self.weight = Parameter(
                torch.empty(
                    self.num_embeddings_per_partition,
                    self.embedding_dim,
                    device=torch.cuda.current_device(),
                    dtype=config.params_dtype,
                )
            )
            if config.perform_initialization:
                _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1)
    else:
        self.weight = None


@torch.compile(mode='max-autotune-no-cudagraphs')
def vocab_parallel_embedding_forward(self, input_, weight=None):
    """Forward.

    Args:
        input_ (torch.Tensor): Input tensor.
    """
    if weight is None:
        if self.weight is None:
            raise RuntimeError(
                "weight was not supplied to VocabParallelEmbedding forward pass "
                "and skip_weight_param_allocation is True."
            )
        weight = self.weight

    if self.tensor_model_parallel_size > 1:
        # Build the mask.
        input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
        # Mask the input.
        masked_input = input_.clone() - self.vocab_start_index
        masked_input[input_mask] = 0
    else:
        masked_input = input_
    # Get the embeddings.
    if self.deterministic_mode:
        output_parallel = weight[masked_input]
    else:
        # F.embedding currently has a non-deterministic backward function
        output_parallel = F.embedding(masked_input, weight)
    # Mask the output embedding.
    if self.tensor_model_parallel_size > 1:
        output_parallel[input_mask, :] = 0.0

    if self.reduce_scatter_embeddings:
        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
        output_parallel = output_parallel.transpose(0, 1).contiguous()
        output = reduce_scatter_to_sequence_parallel_region(output_parallel)
    else:
        # Reduce across all the model parallel GPUs.
        output = reduce_from_tensor_model_parallel_region(output_parallel)
    return output
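For orientation, a single-process sketch of the masking the forward pass above performs on one tensor-parallel rank (illustrative numbers; the real reduce happens across ranks via reduce_from_tensor_model_parallel_region):

import torch

# Assume a global vocab of 8 split across 2 ranks; this rank owns ids [4, 8).
vocab_start_index, vocab_end_index = 4, 8
weight = torch.arange(4 * 3, dtype=torch.float32).view(4, 3)   # this rank's 4x3 shard

input_ = torch.tensor([[1, 5, 7, 2]])
input_mask = (input_ < vocab_start_index) | (input_ >= vocab_end_index)
masked_input = input_.clone() - vocab_start_index
masked_input[input_mask] = 0                       # out-of-range ids hit row 0 harmlessly

output_parallel = weight[masked_input]             # [1, 4, 3] partial embeddings
output_parallel[input_mask, :] = 0.0               # zero them so the cross-rank sum is exact
# reduce_from_tensor_model_parallel_region would now add the other rank's partial result.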
dcu_megatron/core/tensor_parallel/random.py (new file, mode 100644)

import torch
# Note: detach_variable is used below but was not imported in the original hunk;
# torch.utils.checkpoint.detach_variable is the standard helper for this purpose.
from torch.utils.checkpoint import detach_variable

from megatron.core.tensor_parallel.random import (
    get_cuda_rng_tracker,
    _set_cuda_rng_state,
)


class CheckpointFunctionWithoutOutput(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, checkpoint, *args):
        with torch.no_grad():
            outputs = run_function(*args)

        # Store everything
        ctx.save_for_backward(*detach_variable(args))
        checkpoint.ctx = ctx

        return outputs

    @staticmethod
    def backward(ctx, *args):
        inputs = ctx.saved_tensors
        outputs = ctx.outputs
        torch.autograd.backward(outputs, args)
        ctx.outputs = None
        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs)
        return (None, None) + grads


class CheckpointWithoutOutput:
    def __init__(self):
        self.run_function = None
        self.fwd_cpu_rng_state = None
        self.fwd_cuda_rng_state = None
        self.fwd_cuda_rng_state_tracker = None
        self.outputs = None

    def checkpoint(self, run_function, distribute_saved_activations, *args):
        self.run_function = run_function

        if distribute_saved_activations:
            raise RuntimeError(
                "CheckpointFunctionWithoutOutput does not support distribute_saved_activations"
            )

        # Copy the rng states.
        self.fwd_cpu_rng_state = torch.get_rng_state()
        self.fwd_cuda_rng_state = torch.cuda.get_rng_state()
        self.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        outputs = CheckpointFunctionWithoutOutput.apply(run_function, self, *args)
        self.outputs = outputs
        if isinstance(self.outputs, torch.Tensor):
            self.outputs = (self.outputs,)

        return outputs

    def discard_output(self):
        for output in self.outputs:
            output.untyped_storage().resize_(0)

    def recompute(self, _):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError(
                "Checkpointing is not compatible with .grad(), "
                "please use .backward() if possible"
            )

        # Store the current states.
        cur_cpu_rng_state = torch.get_rng_state()
        cur_cuda_rng_state = torch.cuda.get_rng_state()
        cur_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        # Set the states to what it used to be before the forward pass.
        torch.set_rng_state(self.fwd_cpu_rng_state)
        _set_cuda_rng_state(self.fwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(self.fwd_cuda_rng_state_tracker)

        with torch.enable_grad():
            outputs = self.run_function(*self.ctx.saved_tensors)
        self.run_function = None
        self.fwd_cpu_rng_state = None
        self.fwd_cuda_rng_state = None
        self.fwd_cuda_rng_state_tracker = None

        # Set the states back to what they were at the start of this function.
        torch.set_rng_state(cur_cpu_rng_state)
        _set_cuda_rng_state(cur_cuda_rng_state)
        get_cuda_rng_tracker().set_states(cur_cuda_rng_state_tracker)

        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)
        for output, recomputation_output in zip(self.outputs, outputs):
            output_size = recomputation_output.untyped_storage().size()
            output.untyped_storage().resize_(output_size)
            with torch.no_grad():
                output.untyped_storage().copy_(recomputation_output.untyped_storage())

        self.ctx.outputs = outputs
        self.outputs = None
        self.ctx = None
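A minimal usage sketch of this helper, mirroring how multi_token_predictor.py uses it for its norms (illustrative only; it assumes a CUDA device and megatron.core on the path, and a toy LayerNorm stands in for the checkpointed submodule):

import torch

from dcu_megatron.core.tensor_parallel.random import CheckpointWithoutOutput

norm = torch.nn.LayerNorm(8).cuda()
x = torch.randn(4, 8, device='cuda', requires_grad=True)

ckpt = CheckpointWithoutOutput()
y = ckpt.checkpoint(norm, False, x)   # forward without keeping y for backward
z = (y * 2).sum()                     # downstream computation that consumes y
ckpt.discard_output()                 # free y's storage immediately
z.register_hook(ckpt.recompute)       # re-materialize y just before backward needs it
z.backward()
assert x.grad is not None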
dcu_megatron/core/transformer/mtp/mtp_spec.py (new file, mode 100644)

import warnings

from megatron.core.tensor_parallel import ColumnParallelLinear
from megatron.core.transformer import ModuleSpec

from .multi_token_predictor import (
    MultiTokenPredicationSubmodules,
    MultiTokenPredictor,
)

try:
    from megatron.core.extensions.transformer_engine import (
        TEColumnParallelLinear,
        TENorm,
    )
    HAVE_TE = True
except ImportError:
    HAVE_TE = False

try:
    import apex
    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm

    LNImpl = FusedLayerNorm
except ImportError:
    from megatron.core.transformer.torch_norm import WrappedTorchNorm

    warnings.warn('Apex is not installed. Falling back to Torch Norm')
    LNImpl = WrappedTorchNorm


def get_mtp_spec(transformer_layer, use_te=False):
    """
    Multi Token Prediction layer specification.
    """
    use_te = use_te & HAVE_TE
    mtp_spec = ModuleSpec(
        module=MultiTokenPredictor,
        submodules=MultiTokenPredicationSubmodules(
            embedding=None,
            enorm=TENorm if use_te else LNImpl,
            hnorm=TENorm if use_te else LNImpl,
            eh_proj=TEColumnParallelLinear if use_te else ColumnParallelLinear,
            transformer_layer=transformer_layer,
            final_layernorm=TENorm if use_te else LNImpl,
            output_layer=None,
        )
    )
    return mtp_spec
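A hedged usage sketch of wiring this spec into the patched GPTModel constructor. get_gpt_layer_with_transformer_engine_spec is the stock Megatron-LM layer-spec helper; passing that whole layer spec as the MTP transformer layer, and how the training script builds `config`, are assumptions about intended usage rather than something this file states:

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec

layer_spec = get_gpt_layer_with_transformer_engine_spec()
mtp_spec = get_mtp_spec(layer_spec, use_te=True)

# The patched gpt_model_init then accepts this via its `mtp_spec` argument, e.g.:
# model = GPTModel(config, layer_spec, vocab_size, max_sequence_length, mtp_spec=mtp_spec)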
dcu_megatron/core/transformer/mtp/multi_token_predictor.py (new file, mode 100644)

# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
import logging
from dataclasses import dataclass
from typing import Union, Optional, Literal

import torch
from torch import Tensor

from megatron.core import tensor_parallel, InferenceParams
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.module import MegatronModule
from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
from megatron.core.transformer import ModuleSpec, TransformerConfig, build_module

from ...tensor_parallel.random import CheckpointWithoutOutput


@dataclass
class MultiTokenPredicationSubmodules:
    embedding: Union[ModuleSpec, type] = None
    output_layer: Union[ModuleSpec, type] = None
    eh_proj: Union[ModuleSpec, type] = None
    enorm: Union[ModuleSpec, type] = None
    hnorm: Union[ModuleSpec, type] = None
    transformer_layer: Union[ModuleSpec, type] = None
    final_layernorm: Union[ModuleSpec, type] = None


class MultiTokenPredictor(MegatronModule):
    def __init__(
        self,
        config: TransformerConfig,
        submodules: MultiTokenPredicationSubmodules,
        vocab_size: int,
        max_sequence_length: int,
        layer_number: int = 1,
        hidden_dropout: float = None,
        pre_process: bool = True,
        fp16_lm_cross_entropy: bool = False,
        parallel_output: bool = True,
        position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
        rotary_percent: float = 1.0,
        rotary_base: int = 10000,
        seq_len_interpolation_factor: Optional[float] = None,
        share_mtp_embedding_and_output_weight=True,
        recompute_mtp_norm=False,
        recompute_mtp_layer=False,
        add_output_layer_bias=False,
    ):
        super().__init__(config=config)
        self.config = config
        self.submodules = submodules
        self.layer_number = layer_number
        self.hidden_dropout = hidden_dropout
        self.hidden_size = self.config.hidden_size
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.pre_process = pre_process
        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
        self.parallel_output = parallel_output
        self.position_embedding_type = position_embedding_type
        # share with main model
        self.share_mtp_embedding_and_output_weight = share_mtp_embedding_and_output_weight
        self.recompute_layer_norm = recompute_mtp_norm
        self.recompute_mtp_layer = recompute_mtp_layer
        self.add_output_layer_bias = add_output_layer_bias

        self.embedding = LanguageModelEmbedding(
            config=self.config,
            vocab_size=self.vocab_size,
            max_sequence_length=self.max_sequence_length,
            position_embedding_type=self.position_embedding_type,
            skip_weight_param_allocation=self.pre_process and self.share_mtp_embedding_and_output_weight,
        )

        if self.position_embedding_type == 'rope':
            self.rotary_pos_emb = RotaryEmbedding(
                kv_channels=self.config.kv_channels,
                rotary_percent=rotary_percent,
                rotary_interleaved=self.config.rotary_interleaved,
                seq_len_interpolation_factor=seq_len_interpolation_factor,
                rotary_base=rotary_base,
                use_cpu_initialization=self.config.use_cpu_initialization,
            )

        self.enorm = build_module(
            self.submodules.enorm,
            config=self.config,
            hidden_size=self.config.hidden_size,
            eps=self.config.layernorm_epsilon,
        )

        self.hnorm = build_module(
            self.submodules.hnorm,
            config=self.config,
            hidden_size=self.config.hidden_size,
            eps=self.config.layernorm_epsilon,
        )

        self.eh_proj = build_module(
            self.submodules.eh_proj,
            self.hidden_size + self.hidden_size,
            self.hidden_size,
            config=self.config,
            init_method=self.config.init_method,
            gather_output=False,
            bias=self.config.add_bias_linear,
            skip_bias_add=True,
            is_expert=False,
            tp_comm_buffer_name='eh',
        )

        self.transformer_layer = build_module(
            self.submodules.transformer_layer,
            config=self.config,
        )

        if self.submodules.final_layernorm:
            self.final_layernorm = build_module(
                self.submodules.final_layernorm,
                config=self.config,
                hidden_size=self.config.hidden_size,
                eps=self.config.layernorm_epsilon,
            )
        else:
            self.final_layernorm = None

        if self.config.defer_embedding_wgrad_compute:
            self.embedding_activation_buffer = []
            self.grad_output_buffer = []
        else:
            self.embedding_activation_buffer = None
            self.grad_output_buffer = None

        self.output_layer = tensor_parallel.ColumnParallelLinear(
            config.hidden_size,
            self.vocab_size,
            config=config,
            init_method=config.init_method,
            bias=self.add_output_layer_bias,
            skip_bias_add=False,
            gather_output=not self.parallel_output,
            skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight,
            embedding_activation_buffer=self.embedding_activation_buffer,
            grad_output_buffer=self.grad_output_buffer,
        )

    def forward(
        self,
        hidden_input_ids: Tensor,
        embed_input_ids: Tensor,
        position_ids: Tensor,
        attention_mask: Tensor,
        labels: Tensor = None,
        inference_params: InferenceParams = None,
        packed_seq_params: PackedSeqParams = None,
        extra_block_kwargs: dict = None,
        embeding_weight: Optional[torch.Tensor] = None,
        output_weight: Optional[torch.Tensor] = None,
    ):
        """Forward function of the MTP module"""
        # Decoder embedding.
        decoder_input = self.embedding(
            input_ids=embed_input_ids,
            position_ids=position_ids,
            weight=embeding_weight,
        )

        # Rotary positional embeddings (embedding is None for PP intermediate devices)
        rotary_pos_emb = None
        if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
            if inference_params is not None:
                rotary_seq_len = inference_params.max_sequence_length
            else:
                rotary_seq_len = decoder_input.size(0)

                if self.config.sequence_parallel:
                    rotary_seq_len *= self.config.tensor_model_parallel_size

            rotary_seq_len *= self.config.context_parallel_size
            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)

        if self.recompute_layer_norm:
            self.enorm_ckpt = CheckpointWithoutOutput()
            enorm_output = self.enorm_ckpt.checkpoint(self.enorm, False, decoder_input)
            self.hnorm_ckpt = CheckpointWithoutOutput()
            hnorm_output = self.hnorm_ckpt.checkpoint(self.hnorm, False, hidden_input_ids)
        else:
            enorm_output = self.enorm(decoder_input)
            hnorm_output = self.hnorm(hidden_input_ids)

        # [s, b, h] -> [s, b, 2h]
        hidden_states = torch.concat([hnorm_output, enorm_output], dim=-1)

        if self.recompute_layer_norm:
            self.enorm_ckpt.discard_output()
            self.hnorm_ckpt.discard_output()
            hidden_states.register_hook(self.enorm_ckpt.recompute)
            hidden_states.register_hook(self.hnorm_ckpt.recompute)

        # hidden_states -> [s, b, h]
        hidden_states, _ = self.eh_proj(hidden_states)
        if self.config.tensor_model_parallel_size > 1:
            hidden_states = tensor_parallel.gather_from_tensor_model_parallel_region(hidden_states)
        if self.config.sequence_parallel:
            hidden_states = tensor_parallel.scatter_to_sequence_parallel_region(hidden_states)

        if self.recompute_mtp_layer:
            hidden_states, context = tensor_parallel.checkpoint(
                self.transformer_layer,
                self.config.distribute_saved_activations,
                hidden_states,
                attention_mask,
                None,
                None,
                rotary_pos_emb,
                inference_params,
                packed_seq_params,
            )
        else:
            hidden_states, _ = self.transformer_layer(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                rotary_pos_emb=rotary_pos_emb,
                inference_params=inference_params,
                packed_seq_params=packed_seq_params,
                **(extra_block_kwargs or {}),
            )

        # Final layer norm.
        if self.final_layernorm is not None:
            if self.recompute_layer_norm:
                self.finalnorm_ckpt = CheckpointWithoutOutput()
                finalnorm_output = self.finalnorm_ckpt.checkpoint(self.final_layernorm, False, hidden_states)
            else:
                finalnorm_output = self.final_layernorm(hidden_states)
        else:
            finalnorm_output = hidden_states

        logits, _ = self.output_layer(finalnorm_output, weight=output_weight)
        if self.recompute_layer_norm:
            self.finalnorm_ckpt.discard_output()
            logits.register_hook(self.finalnorm_ckpt.recompute)

        if labels is None:
            # [s b h] => [b s h]
            return logits.transpose(0, 1).contiguous()

        loss = self.compute_language_model_loss(labels, logits)

        return hidden_states, loss

    def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor:
        """Computes the language model loss (cross entropy across vocabulary).

        Args:
            labels (Tensor): The labels of dimension [batch size, seq length]
            logits (Tensor): The final logits returned by the output layer of the transformer model

        Returns:
            Tensor: Loss tensor of dimensions [batch size, sequence_length]
        """
        # [b s] => [s b]
        labels = labels.transpose(0, 1).contiguous()
        if self.config.cross_entropy_loss_fusion:
            loss = fused_vocab_parallel_cross_entropy(logits, labels)
        else:
            loss = tensor_parallel.vocab_parallel_cross_entropy(logits, labels)

        # [s b] => [b, s]
        loss = loss.transpose(0, 1).contiguous()
        return loss
dcu_megatron/core/transformer/transformer_block.py
0 → 100644
View file @
d05234e0
from
contextlib
import
nullcontext
from
typing
import
Optional
from
functools
import
wraps
import
torch
from
torch
import
Tensor
from
megatron.core
import
InferenceParams
,
parallel_state
,
tensor_parallel
from
megatron.core.fusions.fused_layer_norm
import
FusedLayerNorm
from
megatron.core.packed_seq_params
import
PackedSeqParams
from
megatron.core.utils
import
make_viewless_tensor
try
:
from
megatron.core.extensions.transformer_engine
import
TEDelayedScaling
HAVE_TE
=
True
except
ImportError
:
HAVE_TE
=
False
def
transformer_block_init_wrapper
(
fn
):
@
wraps
(
fn
)
def
wrapper
(
self
,
*
args
,
**
kwargs
):
fn
(
self
,
*
args
,
**
kwargs
)
# mtp require seperate layernorms for main model and mtp modules, thus move finalnorm out of block
config
=
args
[
0
]
if
len
(
args
)
>
1
else
kwargs
[
'config'
]
self
.
move_final_norm_out_of_block
=
getattr
(
config
,
"num_nextn_predict_layers"
,
0
)
>
0
return
wrapper
def transformer_block_forward(
    self,
    hidden_states: Tensor,
    attention_mask: Tensor,
    context: Tensor = None,
    context_mask: Tensor = None,
    rotary_pos_emb: Tensor = None,
    rotary_pos_cos: Tensor = None,
    rotary_pos_sin: Tensor = None,
    attention_bias: Tensor = None,
    inference_params: InferenceParams = None,
    packed_seq_params: PackedSeqParams = None,
    sequence_len_offset: Tensor = None,
):
    """
    Perform the forward pass through the transformer block.

    This method handles the core computation of the transformer, including
    self-attention, optional cross-attention, and feed-forward operations.

    Args:
        hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the
            sequence length, b is the batch size, and h is the hidden size.
        attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
            self-attention.
        context (Tensor, optional): Context tensor for cross-attention.
        context_mask (Tensor, optional): Mask for cross-attention context.
        rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
        attention_bias (Tensor): Bias tensor for Q * K.T, in a shape broadcastable
            to [b, num_head, sq, skv], e.g. [1, 1, sq, skv].
            Used as an alternative to the attention mask for TE cuDNN attention.
        inference_params (InferenceParams, optional): Parameters for inference-time
            optimizations.
        packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
            processing.

    Returns:
        Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
        [s, b, h], and optionally the updated context tensor if cross-attention is used.
    """
    if not self.pre_process:
        # See set_input_tensor()
        hidden_states = self.input_tensor

    # Update the inference parameters with the current batch size in case it is variable
    if inference_params and not self.training:
        inference_params.current_batch_size = hidden_states.size(1)

    # Viewless tensor.
    # - We only need to create a viewless tensor in the case of micro batch
    #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
    #   above creates a view tensor, and '.contiguous()' is a pass-through.
    #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
    #   the need to make it viewless.
    #
    #   However, we don't explicitly check mbs == 1 here because
    #   make_viewless_tensor() has negligible overhead when its input
    #   is already viewless.
    #
    # - For the 'else' case above, calling make_viewless_tensor() here is
    #   likely redundant, since p2p_communication.py (likely originator)
    #   already creates viewless tensors. That said, make_viewless_tensor()
    #   is called here to be future-proof and corner-case-proof.
    hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)

    if self.config.sequence_parallel:
        rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
    else:
        rng_context = nullcontext()

    if self.config.fp8:
        import transformer_engine  # To keep out TE dependency when not training in fp8

        if self.config.fp8 == "e4m3":
            fp8_format = transformer_engine.common.recipe.Format.E4M3
        elif self.config.fp8 == "hybrid":
            fp8_format = transformer_engine.common.recipe.Format.HYBRID
        else:
            raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")

        fp8_recipe = TEDelayedScaling(
            config=self.config,
            fp8_format=fp8_format,
            override_linear_precision=(False, False, not self.config.fp8_wgrad),
        )
        fp8_group = None
        if parallel_state.model_parallel_is_initialized():
            fp8_group = parallel_state.get_amax_reduction_group(
                with_context_parallel=True, tp_only_amax_red=self.tp_only_amax_red
            )
        fp8_context = transformer_engine.pytorch.fp8_autocast(
            enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
        )
    else:
        fp8_context = nullcontext()

    with rng_context, fp8_context:
        # Forward pass.
        if self.config.recompute_granularity == 'full' and self.training:
            hidden_states = self._checkpointed_forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                context=context,
                context_mask=context_mask,
                rotary_pos_emb=rotary_pos_emb,
                attention_bias=attention_bias,
                packed_seq_params=packed_seq_params,
            )
        else:
            for l_no, layer in enumerate(self.layers):
                with self.offload_context:
                    layer.use_cudagraph = True
                    if (len(self.cuda_graphs) == 0) or (not self.training):
                        hidden_states, context = layer(
                            hidden_states=hidden_states,
                            attention_mask=attention_mask,
                            context=context,
                            context_mask=context_mask,
                            rotary_pos_emb=rotary_pos_emb,
                            rotary_pos_cos=rotary_pos_cos,
                            rotary_pos_sin=rotary_pos_sin,
                            attention_bias=attention_bias,
                            inference_params=inference_params,
                            packed_seq_params=packed_seq_params,
                            sequence_len_offset=sequence_len_offset,
                        )
                    else:
                        # CUDA graph replay for layer `l_no` and microbatch
                        # `self.current_microbatch`. TransformerEngine versions >= 1.10
                        # allow keyword arguments with CUDA graph. However, CUDA graph
                        # accepts only Tensor inputs and Tensor outputs. Hence,
                        # `inference_params` and `packed_seq_params` are excluded from
                        # the input list while the output is limited to `hidden_states`.
                        cg_index = self.current_microbatch % len(self.cuda_graphs[l_no])
                        assert not any(
                            [inference_params, packed_seq_params]
                        ), "CUDA graph accepts only Tensor inputs."
                        optional_inputs = self.get_cuda_graph_optional_args(
                            attention_mask,
                            context,
                            context_mask,
                            rotary_pos_emb,
                            attention_bias,
                            inference_params,
                            packed_seq_params,
                        )
                        hidden_states = self.cuda_graphs[l_no][cg_index](
                            hidden_states, **optional_inputs
                        )

                if (
                    torch.is_grad_enabled()
                    and self.config.cpu_offloading
                    and self.group_prefetch_offload_commit_async is not None
                ):
                    hidden_states = self.group_prefetch_offload_commit_async(hidden_states)

    # Final layer norm.
    if self.final_layernorm is not None:
        hidden_states = self.final_layernorm(hidden_states)

        # TENorm produces a "viewed" tensor. This will result in schedule.py's
        # deallocate_output_tensor() throwing an error, so a viewless tensor is
        # created to prevent this.
        hidden_states = make_viewless_tensor(
            inp=hidden_states, requires_grad=True, keep_graph=True
        )

    return hidden_states
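The rng/fp8 context handling above follows a common pattern: build either a real context manager or nullcontext() and enter both unconditionally. A minimal standalone sketch of that pattern (plain PyTorch on CPU, not the Megatron contexts themselves):

from contextlib import nullcontext
import torch

use_amp = False  # stands in for config.fp8 / config.sequence_parallel being enabled
amp_ctx = torch.autocast("cpu", dtype=torch.bfloat16) if use_amp else nullcontext()
rng_ctx = nullcontext()  # stands in for tensor_parallel.get_cuda_rng_tracker().fork()
with rng_ctx, amp_ctx:
    y = torch.ones(2) + 1
print(y)  # tensor([2., 2.])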
dcu_megatron/core/transformer/transformer_config.py
0 → 100644
import warnings
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch.nn.functional as F

from megatron.core.transformer.enums import AttnBackend
from megatron.core.model_parallel_config import ModelParallelConfig
from megatron.core.utils import (
    get_te_version,
    init_method_normal,
    is_te_min_version,
    scaled_init_method_normal,
)


@dataclass
class TransformerConfig(ModelParallelConfig):
    """Configuration object for megatron-core transformers.

    The initialization function has an argument for each parameter,
    including those in ModelParallelConfig.
    """

    ####################
    # model architecture
    ####################
    num_layers: int = 0
    """Number of transformer layers in a transformer block."""
    num_layers_in_first_pipeline_stage: Optional[int] = None
    """Number of transformer layers on first pipeline stage.
    None implies equal layer division across PP ranks."""
    num_layers_in_last_pipeline_stage: Optional[int] = None
    """Number of transformer layers on last pipeline stage.
    None implies equal layer division across PP ranks."""
    account_for_embedding_in_pipeline_split: bool = False
    """If set, the embedding layer will be treated as a standard transformer
    layer in the context of partition and placement for pipeline parallelism."""
    account_for_loss_in_pipeline_split: bool = False
    """If set, the loss layer will be treated as a standard transformer
    layer in the context of partition and placement for pipeline parallelism."""
    hidden_size: int = 0
    """Transformer hidden size."""
    num_attention_heads: int = 0
    """Number of transformer attention heads."""
    attention_backend: AttnBackend = AttnBackend.auto
    """Attention backend to run. By default we let transformer engine
    decide the best backend to run (except in the case of local).
    If attention backend is local we use the local pytorch implementation in mcore.
    Users can specify exact backend by changing this config."""
    softmax_scale: Optional[float] = None
    """Softmax scale for attention scaling."""
    num_query_groups: Optional[int] = None
    """Number of query groups for group query attention. If None, normal attention is used."""
    ffn_hidden_size: Optional[int] = None
    """Transformer Feed-Forward Network hidden size. This is set to 4*hidden_size
    if not provided."""
    kv_channels: Optional[int] = None
    """Projection weights dimension in multi-head attention. This is set to hidden_size //
    num_attention_heads if not provided."""
    hidden_dropout: float = 0.1
    """Dropout probability for transformer hidden state."""
    attention_dropout: float = 0.1
    """Post attention dropout probability."""
    fp32_residual_connection: bool = False
    """If true, move residual connections to fp32."""
    # @jcasper should we keep this option?
    apply_residual_connection_post_layernorm: bool = False
    """If True, uses the original BERT residual connection ordering."""
    layernorm_epsilon: float = 1e-5
    """Epsilon value for any LayerNorm operations."""
    layernorm_zero_centered_gamma: bool = False
    """If set to True, the LayerNorm is adjusted to center the gamma values around 0. This improves
    numerical stability."""
    add_bias_linear: bool = True
    """Include a bias term in all linear layers (QKV projections, after core attention, and two in
    MLP layer)."""
    add_qkv_bias: bool = False
    """Add a bias term only for QKV projections."""
    gated_linear_unit: bool = False
    """Use a gated linear unit for the first linear layer in the MLP."""
    activation_func: Callable = F.gelu
    """Activation function to use for the non-linearity in the MLP."""
    activation_func_fp8_input_store: bool = False
    """Store the input of MLP activation function in FP8 for backprop to save memory.
    The stored input is cast back to the original precision before backprop computation."""
    num_moe_experts: Optional[int] = None
    """Number of experts to use for MoE layer. When set, it replaces MLP with MoE layer. Set to None
    for no MoE."""
    rotary_interleaved: bool = False
    """True is rotate pairs of even and odd dimensions (RoFormer style), False is rotate pairs of
    first half and second half (LLaMa style). Default to False."""
    window_size: Optional[Tuple[int, int]] = None
    """If not None, then will use sliding window attention. The size of the window is specified by
    the numbers inside the tuple; -1 is special value meaning "infinite window size"."""
    normalization: str = "LayerNorm"
    """Which norm to use for normalization layers, valid options are `LayerNorm` and `RMSNorm`."""
    qk_layernorm: bool = False
    """Whether to apply LayerNorm to the query and key embeddings."""
    test_mode: bool = False
    """Whether to run real-time tests."""
    calculate_per_token_loss: bool = False
    """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the
    global batch, versus the default behavior of assuming all tokens are non-padded."""
    multi_latent_attention: bool = False
    """Whether to use multi-latent attention."""

    ####################
    # initialization
    ####################
    init_method: Optional[Callable] = None
    """Method to initialize weights. Note that bias is always set to zero. Should be a function that
    takes a single Tensor and initializes it. If None, will be set to
    megatron.core.utils.init_method_normal(init_method_std) which is torch nn init normal with
    mean=0.0 and std=init_method_std."""
    output_layer_init_method: Optional[Callable] = None
    """Method to initialize weights of the output layer of both attention and MLP blocks. If None,
    will be set to megatron.core.utils.scaled_init_method_normal(init_method_std) which is torch nn
    init normal with mean=0.0 and std=init_method_std / math.sqrt(2.0 * num_layers)."""
    init_method_std: float = 0.02
    """Standard deviation of the zero mean normal for the default initialization method, not used if
    init_method and output_layer_init_method are provided."""
    init_model_with_meta_device: bool = False
    """If True, initializes the model with the meta device. This is helpful for
    training of very large models. This feature only works when custom fsdp is turned on."""

    ####################
    # mixed-precision
    ####################
    apply_query_key_layer_scaling: bool = False
    """If true, scale Q * K^T by 1 / layer-number. This improves numeric stability when training
    with fp16."""
    attention_softmax_in_fp32: bool = True
    """If True, run attention masking and softmax in fp32. This should be True if
    apply_query_key_layer_scaling is True."""

    ####################
    # fusion
    ####################
    bias_activation_fusion: bool = False
    """If True, fuses bias addition and the activation function when possible."""
    masked_softmax_fusion: bool = False
    """If True, uses softmax fusion."""
    persist_layer_norm: bool = False
    """If True, uses the persistent fused layer norm kernel. This kernel only supports a fixed set
    of hidden sizes."""
    memory_efficient_layer_norm: bool = False
    """If True, and using local layers (not from TransformerEngine), tells Apex to use the memory
    efficient fused LayerNorm kernel. Ignored if not using LayerNorm."""
    bias_dropout_fusion: bool = False
    # TODO: this should be bias_dropout_add_fusion?
    """If True, uses bias dropout fusion."""
    apply_rope_fusion: bool = False
    """If True, use fused RoPE kernel."""

    ####################
    # activation recomputation
    ####################
    recompute_granularity: Optional[str] = None
    """Determines which type of activation recompute to use. Megatron-core supports 'selective'
    activation checkpointing where only the memory intensive part of attention is checkpointed.
    These memory intensive activations are also less compute intensive which makes activation
    checkpointing more efficient for LLMs (20B+). See Reducing Activation Recomputation in Large
    Transformer Models (https://arxiv.org/abs/2205.05198) for more details. 'full' will checkpoint
    the entire transformer layer. If None, no recompute is performed and all activations are saved.
    If set, must be 'selective' or 'full'. 'selective' always uses all layers.
    """
    recompute_method: Optional[str] = None
    """Determines which transformer layers will be recomputed. uniform will uniformly divide the
    total number of transformer layers in a transformer block and recompute the input activation of
    each divided chunk at the specified granularity. block will recompute the input activations for
    only a set number of transformer layers per pipeline stage. The rest of the layers in the
    pipeline stage will not have any activations recomputed. If None, and recompute is enabled, all
    layers will do recomputation. If set, must be 'uniform' or 'block'."""
    recompute_num_layers: Optional[int] = None
    """When recompute_method is uniform, recompute_num_layers is the number of transformer layers in
    each uniformly divided recompute unit. When recompute_method is block, recompute_num_layers is
    the number of transformer layers to recompute within each pipeline stage. Must be None for
    'selective' activation checkpointing."""
    distribute_saved_activations: Optional[bool] = None
    """If True, distribute recomputed activations across the model parallel group."""

    ####################
    # fp8 related
    ####################
    fp8: Optional[str] = None
    """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined
    choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8
    activation and weight tensors and e5m2 for all FP8 output activation gradient tensors."""
    fp8_margin: int = 0
    """Margin for the scaling factor computation."""
    fp8_interval: int = 1
    """DEPRECATED from TransformerEngine v1.8.0. This flag is ignored.
    Controls how often the scaling factor is recomputed.
    """
    fp8_amax_history_len: int = 1
    """The length of the amax history window used for scaling factor computation."""
    fp8_amax_compute_algo: str = "most_recent"
    """Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2
    predefined choices: `max` chooses the largest `amax` in the history window, while `most_recent`
    always chooses the most recently seen value.
    """
    fp8_wgrad: bool = True
    """When set to False, override FP8 config options and do the wgrad computation
    in higher precision."""
    fp8_dot_product_attention: bool = False
    """When set to True, use the FP8 implementation of Dot Product Attention."""
    fp8_multi_head_attention: bool = False
    """When set to True, use the FP8 implementation of Multi Head Attention."""
    tp_only_amax_red: bool = False
    """When set to True, reduce the FP8 AMAX only in the TP or TP-CP domain"""

    ####################
    # MoE related
    ####################
    moe_shared_expert_intermediate_size: Optional[int] = None
    """Shared expert total ffn hidden size.
    It should be equal to 'num_shared_experts * ffn_size_of_each_shared_expert' if
    there are multiple shared experts.
    None means no shared expert."""
    moe_shared_expert_overlap: bool = False
    """Enable overlapping between shared expert computations and dispatcher communications.
    Without this, the shared experts execute after the routed experts."""
    moe_layer_freq: Union[int, List[int]] = 1
    """Frequency between MoE layers and Dense layers. Accepts either:
    - An integer N: Represents a 1:N ratio, meaning one expert layer for every N-1 dense layers.
    - A list that defines a custom pattern, e.g.: [1,1,1,0,1,1,1,0,1,1,1,0]"""
    moe_ffn_hidden_size: Optional[int] = None
    """MoE Feed-Forward Network hidden size"""
    moe_router_load_balancing_type: str = "aux_loss"
    """The load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss
    used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the loss used in DeepSeekV2,
    which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing
    algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss"."""
    moe_router_topk: int = 2
    """Number of experts to route to for each token."""
    moe_router_topk_limited_devices: Optional[int] = None
    """Number of EP ranks to consider for each token in group-limited routing,
    DEPRECATED and replaced by moe_router_num_groups and moe_router_group_topk.
    """
    moe_router_num_groups: Optional[int] = None
    """Number of groups to divide experts into for group-limited routing.
    When using group-limited routing:
    1. Experts are divided into 'moe_router_num_groups' equal-sized groups
    2. For each token, 'moe_router_group_topk' groups are selected based on routing scores
       (specifically, the sum of top-2 expert scores within each group)
    3. From these selected groups, 'moe_router_topk' individual experts are chosen
    Two common use cases:
    - Device-limited routing: Set 'moe_router_num_groups' equal to expert parallel size (EP)
      to limit each token to experts on a subset of devices
      (See DeepSeek-V2: https://arxiv.org/pdf/2405.04434)
    - Node-limited routing: Set 'moe_router_num_groups' equal to number of nodes in EP group
      to limit each token to experts on a subset of nodes
      (See DeepSeek-V3: https://arxiv.org/pdf/2412.19437)
    """
    moe_router_group_topk: Optional[int] = None
    """Number of selected groups for group-limited routing."""
    moe_router_pre_softmax: bool = False
    """Enable pre-softmax routing for MoE, which means softmax is before the top-k selection.
    By default, softmax is done after top-k."""
    moe_router_topk_scaling_factor: Optional[float] = None
    """Scaling factor for routing score in top-k selection, only works when moe_router_pre_softmax
    enabled. Defaults to None, which means no scaling."""
    moe_router_score_function: str = "softmax"
    """Score function for MoE routing. Can be "softmax" or "sigmoid"."""
    moe_router_enable_expert_bias: bool = False
    """TopK routing with dynamic per-expert bias in the aux-loss-free load balancing strategy.
    The routing decision is based on the sum of the routing scores and the expert bias.
    See https://arxiv.org/abs/2408.15664 for details."""
    moe_router_bias_update_rate: float = 1e-3
    """The expert bias is updated based on the number of assigned tokens to each expert
    in a global batch, where the bias is increased for the experts with less assigned tokens
    and decreased for the experts with more assigned tokens.
    The default value 1e-3 is same as that used in DeepSeekV3."""
    moe_grouped_gemm: bool = False
    """When there are multiple experts per rank, compress multiple local (potentially small) gemms
    in a single kernel launch to improve the utilization and performance by leveraging the Grouped
    GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm).
    """
    moe_use_legacy_grouped_gemm: bool = False
    """Use legacy GroupedMLP rather than TEGroupedMLP.
    Note: The legacy one will be deprecated soon."""
    moe_aux_loss_coeff: float = 0
    # 1e-2 would be a good start value for load balance loss.
    """Scaling coefficient for the aux loss. A starting value of 1e-2 is recommended."""
    moe_z_loss_coeff: Optional[float] = None
    # 1e-3 would be a good start value for z-loss
    """Scaling coefficient for the z-loss. A starting value of 1e-3 is recommended."""
    moe_input_jitter_eps: Optional[float] = None
    """Add noise to the input tensor by applying jitter with a specified epsilon value."""
    moe_token_dropping: bool = False
    """This feature involves selectively dropping and padding tokens for each expert to achieve a
    specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Note that this is
    currently unsupported so should remain False."""
    moe_token_dispatcher_type: str = "allgather"
    """The type of token dispatcher to use. The default is 'allgather'.
    Options are 'allgather', 'alltoall' and 'flex'."""
    moe_enable_deepep: bool = False
    """[Experimental] Enable DeepEP for efficient token dispatching and combine in MoE models."""
    moe_per_layer_logging: bool = False
    """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss."""
    moe_expert_capacity_factor: Optional[float] = None
    """moe_expert_capacity_factor (float): The capacity factor for each expert, None means no token
    will be dropped. The default is None."""
    moe_pad_expert_input_to_capacity: bool = False
    """moe_pad_expert_input_to_capacity (bool): If True, pads the input for each expert to match
    the expert capacity length, effective only after the moe_expert_capacity_factor is set. The
    default setting is False."""
    moe_token_drop_policy: str = 'probs'
    """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with
    the lowest probabilities will be dropped. If "position", tokens at the end of each batch will
    be dropped.
    """
    moe_layer_recompute: bool = False
    """Memory optimization: checkpointing moe_layer to save activation memory."""
    moe_permute_fusion: bool = False
    """Fuse token rearrangement ops during token dispatching."""

    ##################
    # multi-token prediction
    ##################
    num_nextn_predict_layers: int = 0
    """The number of multi-token prediction layers"""
    mtp_loss_scale: float = 0.3
    """Multi-token prediction loss scale"""
    recompute_mtp_norm: bool = False
    """Whether to recompute mtp normalization"""
    recompute_mtp_layer: bool = False
    """Whether to recompute mtp layer"""
    share_mtp_embedding_and_output_weight: bool = False
    """Share embedding and output weight with the mtp layer."""

    ##################
    # Context Parallel
    ##################
    cp_comm_type: Optional[Union[str, List[str]]] = None
    """Inter-gpu communication type for context parallelism.
    str: all layers share same communication type.
    List[str]: each layer has its separate communication type.
    cp_comm_type of each layer can be "p2p" or "all_gather" or "a2a" or "a2a+p2p".
    "p2p": Exchange KV chunks with P2P communications in ring topology. P2P is async and can be
    overlapped with attention compute.
    "all_gather": All-gather to get full sequence of KV before attention. The all-gather is not
    async, and cannot be overlapped.
    "a2a": Like DeepSpeed Ulysses, scatter attention heads across the CP group, and gather to get
    full sequence of QKV.
    "a2a+p2p": A hierarchical implementation of context parallelism to attention.
    It uses A2A communications in low-level CP groups (e.g., via NVLink),
    and P2P communications in high-level CP groups (e.g., via IBLink).
    """

    ##################
    # Cuda Graphs
    ##################
    enable_cuda_graph: bool = False
    """When set to true, TransformerLayer layers are swapped with a CUDA graphed version."""
    cuda_graph_use_single_mempool: bool = False
    """When set to true, cudagraphs will be captured inside a single mempool, in which all
    cudagraphs may only be used once per step. If false, cudagraphs may be reused across
    microbatches. Enabling may reduce cudagraph memory overheads due to memory fragmentation,
    however may greatly increase the number of cudagraphs created when the number of microbatches
    is high."""
    cuda_graph_retain_backward_graph: bool = False
    """When set to true, cudagraph backward passes will be graph captured with 'retain_grad=True'.
    This may enable cudagraphs for certain modules that are not completely cudagraph safe. For
    more details, see: https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html."""
    cuda_graph_warmup_steps: int = 3
    """Number of warmup steps for CUDA graphs"""
    external_cuda_graph: bool = False
    """When set to true, TransformerLayer layers are swapped with user provided CUDA graphs."""

    ####################
    # miscellaneous
    ####################
    clone_scatter_output_in_embedding: bool = True
    """When set to True, clone the output of scatter_to_sequence_parallel_region in embedding layer
    to facilitate garbage collection of input."""
    disable_parameter_transpose_cache: bool = False
    """When set to true, the parameter transposes are not cached for subsequent iterations."""
    config_logger_dir: str = ""
    """When non-empty, dumps entry-point configs to config_logger_dir"""
    flash_decode: bool = False
    """Use the optimized flash decoding kernel during inference."""
    use_te_rng_tracker: bool = False
    """Whether to use the TE or MCore version of the RNG tracker."""
    inference_rng_tracker: bool = False
    """Whether we should instantiate a separate RNG tracker for inference."""
    use_custom_fsdp: bool = False
    """Whether to use custom fsdp for training."""

    def __post_init__(self):
        """Python dataclass method that is used to modify attributes after initialization.
        See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
        details.
        """
        super().__post_init__()
        if self.fp16 and self.bf16:
            raise ValueError(
                f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.'
            )

        if self.num_attention_heads % self.tensor_model_parallel_size != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_attention_heads}) must be a multiple of "
                f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
            )

        if self.ffn_hidden_size is None:
            self.ffn_hidden_size = 4 * self.hidden_size

        if self.kv_channels is None:
            self.kv_channels = self.hidden_size // self.num_attention_heads

        if self.num_query_groups is None:
            self.num_query_groups = self.num_attention_heads

        if self.num_query_groups % self.tensor_model_parallel_size != 0:
            raise ValueError(
                f"num_query_groups ({self.num_query_groups}) must be a multiple of "
                f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
            )

        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True

        if self.expert_model_parallel_size > 1 and self.num_moe_experts is None:
            raise ValueError('num_moe_experts must be non None to use expert-parallel.')

        if self.num_moe_experts is not None and self.num_moe_experts <= 0:
            raise ValueError('num_moe_experts must be a positive integer.')

        if self.moe_ffn_hidden_size is None:
            self.moe_ffn_hidden_size = self.ffn_hidden_size

        if self.moe_enable_deepep:
            if self.moe_token_dispatcher_type != "flex":
                raise ValueError("DeepEP backend is only supported with flex token dispatcher.")

        if self.moe_shared_expert_intermediate_size is not None:
            if self.moe_shared_expert_intermediate_size <= 0:
                raise ValueError(
                    f'moe_shared_expert_intermediate_size must be '
                    f'num_shared_experts * ffn_size_of_each_shared_expert, '
                    f'but got {self.moe_shared_expert_intermediate_size}'
                )
            if self.moe_shared_expert_overlap and self.moe_token_dispatcher_type not in [
                "alltoall"
            ]:
                raise ValueError(
                    'moe_shared_expert_overlap only works with alltoall token dispatcher.'
                )

        if self.moe_expert_capacity_factor is not None:
            if self.moe_expert_capacity_factor < 0:
                self.moe_expert_capacity_factor = None
            if self.moe_router_load_balancing_type not in ["aux_loss", "seq_aux_loss", "none"]:
                raise ValueError(
                    'moe_expert_capacity_factor only works with aux_loss or none load balancing'
                )

        if self.moe_pad_expert_input_to_capacity:
            if self.moe_expert_capacity_factor is None:
                raise ValueError(
                    'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity'
                )

        if self.cpu_offloading and (
            self.cpu_offloading_num_layers < 0 or self.cpu_offloading_num_layers >= self.num_layers
        ):
            raise ValueError(
                f'CPU offloading can be done only for layers less than {self.num_layers}'
            )

        if self.cpu_offloading and self.pipeline_model_parallel_size > 1:
            raise ValueError(
                'Currently there is no support for Pipeline parallelism with CPU offloading'
            )

        if self.cpu_offloading and self.recompute_granularity is not None:
            raise ValueError(
                'CPU offloading does not work when activation recomputation is enabled'
            )

        if self.recompute_granularity is not None:
            if self.recompute_granularity not in ['full', 'selective']:
                raise ValueError(
                    f'When using recompute_granularity: {self.recompute_granularity} must be '
                    '"full" or "selective".'
                )

            if self.recompute_method is not None:
                if self.recompute_method not in ['block', 'uniform']:
                    raise ValueError(
                        f'recompute_method: {self.recompute_method} must be "block" or "uniform".'
                    )
            elif self.recompute_granularity != 'selective':
                raise ValueError(
                    f'Using recompute_granularity: {self.recompute_granularity} so '
                    'recompute_method must be "block" or "uniform"'
                )

            if self.recompute_granularity != 'selective' and self.recompute_num_layers is None:
                raise ValueError(
                    f'When using recompute_granularity: {self.recompute_granularity} '
                    'recompute_num_layers must be between '
                    '1 and num_layers_per_pipeline_rank: '
                    f'{self.num_layers // self.pipeline_model_parallel_size}'
                )
            elif (
                self.recompute_granularity == 'selective' and self.recompute_num_layers is not None
            ):
                raise ValueError(
                    f'When using recompute_granularity: {self.recompute_granularity} '
                    'recompute_num_layers must be None.'
                )

            if self.distribute_saved_activations and self.sequence_parallel:
                raise ValueError(
                    f'distribute_saved_activations: {self.distribute_saved_activations} must be '
                    f'false when sequence parallel is enabled: {self.sequence_parallel}'
                )

        if (
            self.num_layers_in_first_pipeline_stage is not None
            or self.num_layers_in_last_pipeline_stage is not None
        ) and (
            self.account_for_embedding_in_pipeline_split or self.account_for_loss_in_pipeline_split
        ):
            raise ValueError(
                'num_layers_in_first_pipeline_stage and num_layers_in_last_pipeline_stage cannot '
                'be set at the same time with account_for_embedding_in_pipeline_split '
                'and account_for_loss_in_pipeline_split'
            )

        if (
            self.num_layers_in_first_pipeline_stage is not None
            or self.num_layers_in_last_pipeline_stage is not None
        ):
            pipeline_parallel_size = self.pipeline_model_parallel_size
            num_layers = self.num_layers

            if self.num_layers_in_first_pipeline_stage is not None:
                if self.num_layers_in_first_pipeline_stage <= 0:
                    raise ValueError('num_layers_in_first_pipeline_stage must be larger than 0')

                if self.virtual_pipeline_model_parallel_size is not None:
                    if (
                        self.num_layers_in_first_pipeline_stage
                        % self.virtual_pipeline_model_parallel_size
                        != 0
                    ):
                        raise ValueError(
                            f'number of layers at first stage: '
                            f'{self.num_layers_in_first_pipeline_stage} '
                            f'must be divisible by virtual pipeline '
                            f'parallel degree {self.virtual_pipeline_model_parallel_size}'
                        )
                num_layers -= self.num_layers_in_first_pipeline_stage
                pipeline_parallel_size -= 1

            if self.num_layers_in_last_pipeline_stage is not None:
                if self.num_layers_in_last_pipeline_stage <= 0:
                    raise ValueError('num_layers_in_last_pipeline_stage must be larger than 0')

                if self.virtual_pipeline_model_parallel_size is not None:
                    if (
                        self.num_layers_in_last_pipeline_stage
                        % self.virtual_pipeline_model_parallel_size
                        != 0
                    ):
                        raise ValueError(
                            f'number of layers at last stage: '
                            f'{self.num_layers_in_last_pipeline_stage} '
                            f'must be divisible by virtual pipeline '
                            f'parallel degree {self.virtual_pipeline_model_parallel_size}'
                        )
                num_layers -= self.num_layers_in_last_pipeline_stage
                pipeline_parallel_size -= 1

            if not num_layers % pipeline_parallel_size == 0:
                raise ValueError(
                    f'number of layers at middle stage: {num_layers} must be divisible by '
                    f'the middle pipeline model parallel size {pipeline_parallel_size}'
                )

            if self.virtual_pipeline_model_parallel_size is not None:
                num_layers_per_middle_pipeline_rank = num_layers // pipeline_parallel_size
                if (
                    not num_layers_per_middle_pipeline_rank
                    % self.virtual_pipeline_model_parallel_size
                    == 0
                ):
                    raise ValueError(
                        f'number of layers on each middle pipeline rank: '
                        f'{num_layers_per_middle_pipeline_rank} must be divisible by virtual '
                        f'pipeline parallel degree {self.virtual_pipeline_model_parallel_size}'
                    )

        if self.account_for_embedding_in_pipeline_split or self.account_for_loss_in_pipeline_split:
            if self.virtual_pipeline_model_parallel_size is None:
                pipeline_parallel_size = self.pipeline_model_parallel_size
                if self.account_for_embedding_in_pipeline_split:
                    pipeline_parallel_size -= 1
                if self.account_for_loss_in_pipeline_split:
                    pipeline_parallel_size -= 1

                if not self.num_layers % pipeline_parallel_size == 0:
                    raise ValueError(
                        f'number of middle layers: {self.num_layers} must be divisible by '
                        f'middle pipeline_model_parallel_size {pipeline_parallel_size}'
                    )
            else:
                num_layers = self.num_layers
                if self.account_for_embedding_in_pipeline_split:
                    num_layers += 1
                if self.account_for_loss_in_pipeline_split:
                    num_layers += 1

                if not num_layers % self.pipeline_model_parallel_size == 0:
                    raise ValueError(
                        f'num_layers: {num_layers} after enabling '
                        f'account_for_embedding_in_pipeline_split or '
                        f'account_for_loss_in_pipeline_split must be divisible '
                        f'by pipeline_model_parallel_size '
                        f'{self.pipeline_model_parallel_size}'
                    )

                num_layers_per_pipeline_rank = num_layers // self.pipeline_model_parallel_size
                if (
                    not num_layers_per_pipeline_rank % self.virtual_pipeline_model_parallel_size
                    == 0
                ):
                    raise ValueError(
                        f'number of layers on each pipeline rank: {num_layers_per_pipeline_rank} '
                        f'(after enabling account_for_embedding_in_pipeline_split or '
                        f'account_for_loss_in_pipeline_split) must be divisible by '
                        f'virtual_pipeline_model_parallel_size '
                        f'{self.virtual_pipeline_model_parallel_size}'
                    )

        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True

        if self.bias_activation_fusion:
            if self.activation_func not in [F.gelu, F.silu]:
                raise ValueError(
                    "When bias_activation_fusion is True, activation function should be either "
                    "gelu or swiglu"
                )
            if (
                self.activation_func == F.gelu
                and not self.gated_linear_unit
                and not self.add_bias_linear
            ):
                raise ValueError(
                    "When bias_activation_fusion is True, gated_linear_unit is False, "
                    "and activation function is gelu, add_bias_linear must also be True."
                )

        if self.activation_func_fp8_input_store:
            if self.activation_func != F.silu or not self.gated_linear_unit:
                raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.")

        if self.apply_rope_fusion:
            if self.rotary_interleaved:
                raise ValueError("rotary_interleaved does not work with apply_rope_fusion.")

            from megatron.core.models.common.embeddings.rope_utils import (
                fused_apply_rotary_pos_emb,
                fused_apply_rotary_pos_emb_thd,
            )

            if fused_apply_rotary_pos_emb is None and fused_apply_rotary_pos_emb_thd is None:
                raise ValueError(
                    "apply_rope_fusion is not available. Please install TE >= 1.4 or Apex."
                )

            if self.multi_latent_attention:
                raise ValueError("multi_latent_attention does not support apply_rope_fusion.")

        if self.multi_latent_attention and self.rotary_interleaved:
            raise ValueError("rotary_interleaved does not work with multi_latent_attention.")

        if self.init_method is None:
            self.init_method = init_method_normal(self.init_method_std)

        if self.output_layer_init_method is None:
            self.output_layer_init_method = scaled_init_method_normal(
                self.init_method_std, self.num_layers
            )

        if (
            self.moe_token_dispatcher_type == "alltoall_seq"
            and self.tensor_model_parallel_size != self.expert_tensor_parallel_size
        ):
            raise ValueError(
                "alltoall_seq dispatcher does not support different TP sizes for MoE and Dense layers."
            )

        if self.moe_router_enable_expert_bias and self.moe_router_score_function != "sigmoid":
            raise ValueError(
                "Expert bias for aux-loss-free routing only supports sigmoid score function. "
                "Please set --moe-router-score-function sigmoid for sigmoid score function."
            )

        if self.num_moe_experts and self.fp8:
            # TE versions below 1.7.0 will raise an error when handling zero tokens for an expert.
            if not is_te_min_version("1.7.0.dev0"):
                raise ValueError(
                    "Only transformer-engine>=1.7.0 supports MoE FP8 training, "
                    f"but your version is {get_te_version()}."
                )

            if self.moe_grouped_gemm and not is_te_min_version("1.11.0"):
                raise ValueError(
                    "Only transformer-engine>=1.11.0 supports FP8 grouped gemm, "
                    f"but your version is {get_te_version()}."
                )

        if (
            self.moe_router_topk == 1
            and self.moe_router_score_function == 'softmax'
            and not self.moe_router_pre_softmax
            and self.moe_router_load_balancing_type != 'sinkhorn'
        ):
            # Requires applying softmax before selecting the top-k when k is 1,
            # since softmax on a [num_tokens, 1] would yield a zero gradient.
            raise ValueError("Please use --moe-router-pre-softmax when topk is 1.")

        if self.moe_router_group_topk:
            if self.moe_router_topk_limited_devices:
                raise ValueError(
                    "moe_router_topk_limited_devices is deprecated and replaced by "
                    "moe_router_group_topk and moe_router_num_groups."
                )
            if not self.moe_router_num_groups:
                raise ValueError(
                    "When using group limited routing, moe_router_num_groups must be specified."
                )
            else:
                assert self.num_moe_experts % self.moe_router_num_groups == 0, (
                    f"num_moe_experts ({self.num_moe_experts}) should be divisible by "
                    f"moe_router_num_groups ({self.moe_router_num_groups})."
                )
                assert self.moe_router_group_topk <= self.moe_router_num_groups, (
                    f"moe_router_group_topk ({self.moe_router_group_topk}) should be smaller than "
                    f"moe_router_num_groups ({self.moe_router_num_groups})."
                )
        elif self.moe_router_topk_limited_devices:
            warnings.warn(
                "moe_router_topk_limited_devices is deprecated. Use moe_router_group_topk and "
                "moe_router_num_groups instead."
            )
            self.moe_router_group_topk = self.moe_router_topk_limited_devices
            self.moe_router_num_groups = self.expert_model_parallel_size

        if self.flash_decode and self.fp8:
            raise ValueError("FP8 inference is currently not supported with flash decoding.")

        if self.enable_cuda_graph:
            if self.cpu_offloading:
                raise ValueError("CUDA graphs not supported with CPU offloading.")
            if self.recompute_granularity:
                raise ValueError("CUDA graphs not supported with activation recomputation.")

        if self.moe_token_dispatcher_type in ['allgather', 'alltoall_seq']:
            if self.variable_seq_lengths is True:
                raise ValueError(
                    f"Token dispatcher type: {self.moe_token_dispatcher_type} does not support "
                    f"variable sequence length, please use alltoall dispatcher instead."
                )

        if self.moe_permute_fusion:
            from megatron.core.transformer.moe.moe_utils import (
                fused_permute,
                fused_sort_chunks_by_index,
                fused_unpermute,
            )

            if (
                fused_permute is None
                or fused_sort_chunks_by_index is None
                or fused_unpermute is None
            ):
                raise ValueError("fused permutation is not available. Please install TE >= 2.1.0.")

        if self.cp_comm_type is not None:
            if isinstance(self.cp_comm_type, list):
                assert len(self.cp_comm_type) == self.num_layers, (
                    f"Length of cp_comm_type ({len(self.cp_comm_type)}) should equal to "
                    f"the total number of transformer layers ({self.num_layers})!"
                )
            else:
                assert isinstance(
                    self.cp_comm_type, str
                ), "Unsupported communication type for context parallelism!"

        assert (
            self.pipeline_model_parallel_size > 0
        ), "Pipeline model parallel size must be larger than 0 " \
           "when enabling --standalone-embedding-stage and --standalone-loss-stage"
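A minimal construction sketch (assuming megatron-core is importable and the upstream ModelParallelConfig defaults apply); it only exercises the __post_init__ derivations above, not training:

from dcu_megatron.core.transformer.transformer_config import TransformerConfig

cfg = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4,
                        num_nextn_predict_layers=1, mtp_loss_scale=0.3)
# __post_init__ fills in the derived sizes:
print(cfg.ffn_hidden_size, cfg.kv_channels, cfg.num_query_groups)  # 256 16 4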
@dataclass
class MLATransformerConfig(TransformerConfig):
    """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers.

    The initialization function has an argument for each parameter, including those in
    ModelParallelConfig. Includes the YaRN RoPE parameters that are fused in MLA.
    """

    multi_latent_attention: bool = True
    """Whether to use Multi-Latent Attention."""
    q_lora_rank: int = 512
    """Rank of Query tensor's low rank representation."""
    kv_lora_rank: int = 512
    """Rank of Key and Value tensors' low rank representation."""
    qk_head_dim: int = 128
    """Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim"""
    qk_pos_emb_head_dim: int = 64
    """Dimension of the position embedding in the QK projection."""
    v_head_dim: int = 128
    """Dimension of the head in the V projection."""
    normalization: str = "RMSNorm"
    """Default normalization layer for MLA models is RMSNorm."""
    rope_type: str = "yarn"
    """Type of RoPE to use. Default to yarn, options are rope and yarn."""
    rotary_base: float = 10000
    """Rotary base for the rotary embeddings, used by rope and yarn."""
    rotary_percent: float = 1.0
    """Rotary percent for the rotary embeddings, used by rope."""
    rotary_scaling_factor: float = 40
    """Rotary scaling factor for the rotary embeddings, used by yarn."""
    max_position_embeddings: int = 4096
    """Maximum position embeddings for the original model, used by yarn."""
    beta_fast: float = 32
    """Beta fast for YaRN RoPE, used by yarn."""
    beta_slow: float = 1
    """Beta slow for YaRN RoPE, used by yarn."""
    mscale: float = 0.707
    """Mscale for YaRN RoPE in Multi-Latent Attention, used by yarn."""
    mscale_all_dim: float = 0.707
    """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention, used by yarn."""
dcu_megatron/core/utils.py
0 → 100644
import torch
from typing import List, Optional, Union


def tensor_slide(
    tensor: Optional[torch.Tensor],
    num_slice: int,
    dims: Union[int, List[int]] = -1,
    step: int = 1,
    return_first=False,
) -> List[Union[torch.Tensor, None]]:
    """Generic sliding-window helper that supports arbitrary dimensions."""
    if tensor is None:
        # return `List[None]` to avoid NoneType errors downstream
        return [None] * (num_slice + 1)
    if num_slice == 0:
        return [tensor]

    window_size = tensor.shape[-1] - num_slice
    dims = [dims] if isinstance(dims, int) else sorted(dims, reverse=True)

    # Slide over the given dimensions together.
    slices = []
    for i in range(0, tensor.size(dims[-1]) - window_size + 1, step):
        slice_obj = [slice(None)] * tensor.dim()
        for dim in dims:
            slice_obj[dim] = slice(i, i + window_size)
        slices.append(tensor[tuple(slice_obj)])
        if return_first:
            # Only the first window is requested.
            return slices

    return slices
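Illustrative call on a toy tensor, showing the window count and width produced for an MTP-style shift of 2 (assumes tensor_slide above is in scope):

import torch

x = torch.arange(10).unsqueeze(0)          # shape [1, 10]
windows = tensor_slide(x, num_slice=2)     # windows of width 10 - 2 = 8 along the last dim
print([w.shape for w in windows])          # [torch.Size([1, 8]), torch.Size([1, 8]), torch.Size([1, 8])]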
dcu_megatron/training/arguments.py
0 → 100644
import argparse

from megatron.training.arguments import (
    _add_network_size_args,
    _add_regularization_args,
    _add_training_args,
    _add_initialization_args,
    _add_learning_rate_args,
    _add_checkpointing_args,
    _add_mixed_precision_args,
    _add_distributed_args,
    _add_validation_args,
    _add_data_args,
    _add_autoresume_args,
    _add_biencoder_args,
    _add_vision_args,
    _add_moe_args,
    _add_mla_args,
    _add_logging_args,
    _add_straggler_detector_args,
    _add_inference_args,
    _add_transformer_engine_args,
    _add_retro_args,
    _add_experimental_args,
    _add_one_logger_args,
    _add_ft_package_args,
    _add_config_logger_args,
    _add_rerun_machine_args,
)


def parse_args(extra_args_provider=None, ignore_unknown_args=False):
    """Parse all arguments."""
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments', allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_tokenizer_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_biencoder_args(parser)
    parser = _add_vision_args(parser)
    parser = _add_moe_args(parser)
    parser = _add_mla_args(parser)
    parser = _add_mtp_args(parser)
    parser = _add_logging_args(parser)
    parser = _add_straggler_detector_args(parser)
    parser = _add_inference_args(parser)
    parser = _add_transformer_engine_args(parser)
    parser = _add_retro_args(parser)
    parser = _add_experimental_args(parser)
    parser = _add_one_logger_args(parser)
    parser = _add_ft_package_args(parser)
    parser = _add_config_logger_args(parser)
    parser = _add_rerun_machine_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Experimental yaml
    if args.yaml_cfg is not None:
        from megatron.training.yaml_arguments import load_yaml

        assert args.yaml_cfg and not args.use_legacy_models, \
            "Yaml config is not supported with legacy models."
        args = load_yaml(args.yaml_cfg)

    # Args from environment
    # args.rank = int(os.getenv('RANK', '0'))
    # args.world_size = int(os.getenv("WORLD_SIZE", '1'))

    return args


def _add_tokenizer_args(parser):
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--vocab-size', type=int, default=None,
                       help='Size of vocab before EOD or padding.')
    group.add_argument('--extra-vocab-size', type=int, default=0,
                       help='Number of extra vocabulary entries appended on top of the '
                            'tokenizer vocabulary.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file.')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file.')
    group.add_argument('--vocab-extra-ids', type=int, default=0,
                       help='Number of additional vocabulary tokens. '
                            'They are used for span masking in the T5 model')
    group.add_argument('--tokenizer-type', type=str, default=None,
                       choices=['BertWordPieceLowerCase',
                                'BertWordPieceCase',
                                'GPT2BPETokenizer',
                                'SentencePieceTokenizer',
                                'GPTSentencePieceTokenizer',
                                'HuggingFaceTokenizer',
                                'Llama2Tokenizer',
                                'TikTokenizer',
                                'MultimodalTokenizer',
                                'NullTokenizer',
                                'DeepSeekV2Tokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--tokenizer-model', type=str, default=None,
                       help='Sentencepiece tokenizer model.')
    group.add_argument('--tiktoken-pattern', type=str, default=None,
                       help='Which tiktoken pattern to use. Options: [v1, v2]')
    group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
                       help='Number of special tokens in tiktoken tokenizer')
    group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
                       help='List of tiktoken special tokens, needs to have ["<unk>", "<s>", "</s>"]')
    return parser


def _add_mtp_args(parser):
    group = parser.add_argument_group(title='multi token prediction')
    group.add_argument('--num-nextn-predict-layers', type=int, default=0,
                       help='Number of multi-token prediction layers')
    group.add_argument('--mtp-loss-scale', type=float, default=0.3,
                       help='Multi-token prediction loss scale')
    group.add_argument('--recompute-mtp-norm', action='store_true', default=False,
                       help='Recompute the multi-token prediction normalization')
    group.add_argument('--recompute-mtp-layer', action='store_true', default=False,
                       help='Recompute the multi-token prediction layer')
    group.add_argument('--share-mtp-embedding-and-output-weight', action='store_true', default=False,
                       help='Main model shares embedding and output weight with the MTP layer.')
    return parser
\ No newline at end of file
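A quick standalone check of the MTP argument group above (plain argparse, outside the full Megatron parser; assumes _add_mtp_args is in scope):

import argparse

p = argparse.ArgumentParser(allow_abbrev=False)
p = _add_mtp_args(p)
ns = p.parse_args(['--num-nextn-predict-layers', '1', '--mtp-loss-scale', '0.1'])
print(ns.num_nextn_predict_layers, ns.mtp_loss_scale, ns.recompute_mtp_layer)  # 1 0.1 False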
dcu_megatron/training/tokenizer/__init__.py
0 → 100644
from .tokenizer import build_tokenizer
\ No newline at end of file
dcu_megatron/training/tokenizer/tokenizer.py
0 → 100644
from transformers import AutoTokenizer

from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from megatron.training.tokenizer.tokenizer import (
    _BertWordPieceTokenizer,
    _GPT2BPETokenizer,
    _SentencePieceTokenizer,
    _GPTSentencePieceTokenizer,
    _HuggingFaceTokenizer,
    _Llama2Tokenizer,
    CustomTikTokenizer,
    _NullTokenizer,
    _vocab_size_with_padding,
    # Needed by the TikTokenizer branch below; they were referenced but not imported originally.
    PATTERN_TIKTOKEN,
    PATTERN_TIKTOKEN_V2,
)


def build_tokenizer(args, **kwargs):
    """Initialize tokenizer."""
    if args.rank == 0:
        print('> building {} tokenizer ...'.format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type == 'BertWordPieceLowerCase':
        assert args.vocab_file is not None
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, lower_case=True,
                                            vocab_extra_ids=args.vocab_extra_ids)
    elif args.tokenizer_type == 'BertWordPieceCase':
        assert args.vocab_file is not None
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, lower_case=False,
                                            vocab_extra_ids=args.vocab_extra_ids)
    elif args.tokenizer_type == 'GPT2BPETokenizer':
        assert args.vocab_file is not None
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type == 'SentencePieceTokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _SentencePieceTokenizer(args.tokenizer_model,
                                            vocab_extra_ids=args.vocab_extra_ids)
    elif args.tokenizer_type == 'GPTSentencePieceTokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'HuggingFaceTokenizer':
        tokenizer = _HuggingFaceTokenizer(args.tokenizer_model, **kwargs)
    elif args.tokenizer_type == 'Llama2Tokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _Llama2Tokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'TikTokenizer':
        assert args.tokenizer_model is not None
        assert args.tiktoken_pattern is not None
        assert args.tiktoken_pattern in {"v1", "v2"}
        pattern = PATTERN_TIKTOKEN if args.tiktoken_pattern == "v1" else PATTERN_TIKTOKEN_V2
        tokenizer = CustomTikTokenizer(
            path=args.tokenizer_model,
            pattern=pattern,
            vocab_size=args.vocab_size,
            num_special_tokens=args.tiktoken_num_special_tokens,
            special_tokens=args.tiktoken_special_tokens,
        )
    elif args.tokenizer_type == 'NullTokenizer':
        assert args.vocab_size is not None
        tokenizer = _NullTokenizer(args.vocab_size)
    elif args.tokenizer_type == "MultimodalTokenizer":
        try:
            import transformers
        except ImportError:
            raise ImportError(
                "MultimodalTokenizer currently requires transformers library to be installed"
            )

        kwargs = dict()
        if args.tokenizer_prompt_format == "nvlm-yi-34b":
            kwargs = {"from_slow": True, "legacy": False, "add_bos_token": True}

        # Currently, only HuggingFace tokenizers are supported.
        underlying_tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=args.tokenizer_model, **kwargs
        )
        # Note: MultimodalTokenizer is expected to come from Megatron's multimodal tokenizer
        # module; it is not imported in this file.
        tokenizer = MultimodalTokenizer(
            underlying_tokenizer,
            args.tokenizer_prompt_format,
            args.special_tokens,
            args.image_tag_type,
        )
    elif args.tokenizer_type == "DeepSeekV2Tokenizer":
        tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model, args.extra_vocab_size)
        args.padded_vocab_size = tokenizer.vocab_size
    else:
        raise NotImplementedError('{} tokenizer is not implemented.'.format(args.tokenizer_type))

    # Add vocab size (if not already set from a checkpoint).
    if getattr(args, "padded_vocab_size", None) is None:
        args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


class _DeepSeekV2Tokenizer(MegatronTokenizer):
    def __init__(self, tokenizer_path, extra_vocab_size):
        super().__init__(tokenizer_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, padding_side="right", trust_remote_code=True
        )
        self.extra_vocab_size = extra_vocab_size
        if self.tokenizer.chat_template is None:
            self.tokenizer.chat_template = (
                "{% if not add_generation_prompt is defined %}"
                "{% set add_generation_prompt = false %}{% endif %}"
                "{{ bos_token }}{% for message in messages %}"
                "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}"
                "{% elif message['role'] == 'assistant' %}"
                "{{ 'Assistant: ' + message['content'] + eos_token }}"
                "{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}"
                "{% endif %}{% endfor %}"
                "{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
            )
        try:
            test_conversation = [{'role': 'user', 'content': 'hello world'}]
            self.apply_chat_template(test_conversation)
        except Exception:
            # The default chat_template is invalid; assume the user will not do SFT.
            self.tokenizer.chat_template = None

    def __call__(self, text, return_tensors=None, padding=None, max_length=None,
                 truncation=None, add_special_tokens=None):
        return self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            add_special_tokens=add_special_tokens,
        )

    def apply_chat_template(self, conversations, tokenize: bool = True, **kwargs):
        return self.tokenizer.apply_chat_template(conversations, tokenize=tokenize, **kwargs)

    @property
    def vocab_size(self):
        return len(self.tokenizer) + self.extra_vocab_size - 2

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.tokenizer.eos_token_id

    @property
    def eos_token(self):
        return self.tokenizer.eos_token

    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id
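Hypothetical usage sketch of the DeepSeek-V2 path; the checkpoint path is illustrative and the call needs the corresponding HuggingFace tokenizer files on disk, so it is left as comments:

# from types import SimpleNamespace
# args = SimpleNamespace(rank=0, tokenizer_type='DeepSeekV2Tokenizer',
#                        tokenizer_model='/path/to/deepseek-v2-tokenizer', extra_vocab_size=0)
# tok = build_tokenizer(args)
# print(tok.vocab_size, tok.eod)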
dcu_megatron/training/utils.py
0 → 100644
import torch

from megatron.core import mpu
from megatron.training import get_args


def get_batch_on_this_tp_rank(data_iterator):

    args = get_args()

    def _broadcast(item):
        if item is not None:
            torch.distributed.broadcast(
                item,
                mpu.get_tensor_model_parallel_src_rank(),
                group=mpu.get_tensor_model_parallel_group(),
            )

    if mpu.get_tensor_model_parallel_rank() == 0:

        if data_iterator is not None:
            data = next(data_iterator)
        else:
            data = None

        batch = {
            'tokens': data["tokens"].cuda(non_blocking=True),
            'labels': data["labels"].cuda(non_blocking=True),
            'loss_mask': data["loss_mask"].cuda(non_blocking=True),
            'attention_mask': None if "attention_mask" not in data
                              else data["attention_mask"].cuda(non_blocking=True),
            'position_ids': data["position_ids"].cuda(non_blocking=True),
        }

        if args.pipeline_model_parallel_size == 1:
            _broadcast(batch['tokens'])
            _broadcast(batch['labels'])
            _broadcast(batch['loss_mask'])
            _broadcast(batch['attention_mask'])
            _broadcast(batch['position_ids'])

        elif mpu.is_pipeline_first_stage():
            _broadcast(batch['tokens'])
            _broadcast(batch['attention_mask'])
            _broadcast(batch['position_ids'])

        elif mpu.is_pipeline_last_stage():
            # Multi-token prediction needs the raw tokens on the last stage as well.
            if args.num_nextn_predict_layers:
                _broadcast(batch['tokens'])
            _broadcast(batch['labels'])
            _broadcast(batch['loss_mask'])
            _broadcast(batch['attention_mask'])
            if args.reset_position_ids or args.num_nextn_predict_layers:
                _broadcast(batch['position_ids'])

    else:

        tokens = torch.empty(
            (args.micro_batch_size, args.seq_length + args.num_nextn_predict_layers),
            dtype=torch.int64,
            device=torch.cuda.current_device(),
        )
        labels = torch.empty(
            (args.micro_batch_size, args.seq_length + args.num_nextn_predict_layers),
            dtype=torch.int64,
            device=torch.cuda.current_device(),
        )
        loss_mask = torch.empty(
            (args.micro_batch_size, args.seq_length + args.num_nextn_predict_layers),
            dtype=torch.float32,
            device=torch.cuda.current_device(),
        )
        if args.create_attention_mask_in_dataloader:
            attention_mask = torch.empty(
                (
                    args.micro_batch_size,
                    1,
                    args.seq_length + args.num_nextn_predict_layers,
                    args.seq_length + args.num_nextn_predict_layers,
                ),
                dtype=torch.bool,
                device=torch.cuda.current_device(),
            )
        else:
            attention_mask = None
        position_ids = torch.empty(
            (args.micro_batch_size, args.seq_length + args.num_nextn_predict_layers),
            dtype=torch.int64,
            device=torch.cuda.current_device(),
        )

        if args.pipeline_model_parallel_size == 1:
            _broadcast(tokens)
            _broadcast(labels)
            _broadcast(loss_mask)
            _broadcast(attention_mask)
            _broadcast(position_ids)

        elif mpu.is_pipeline_first_stage():
            labels = None
            loss_mask = None

            _broadcast(tokens)
            _broadcast(attention_mask)
            _broadcast(position_ids)

        elif mpu.is_pipeline_last_stage():
            if args.num_nextn_predict_layers:
                _broadcast(tokens)
            else:
                tokens = None
            _broadcast(labels)
            _broadcast(loss_mask)
            _broadcast(attention_mask)
            if args.reset_position_ids or args.num_nextn_predict_layers:
                _broadcast(position_ids)
            else:
                position_ids = None

        batch = {
            'tokens': tokens,
            'labels': labels,
            'loss_mask': loss_mask,
            'attention_mask': attention_mask,
            'position_ids': position_ids,
        }

    return batch
pretrain_gpt.py
0 → 100644
View file @
d05234e0
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Pretrain GPT."""

import os
import torch
from functools import partial
from contextlib import nullcontext
import inspect
from typing import List, Optional, Tuple, Union

from megatron.training import get_args
from megatron.training import print_rank_0
from megatron.training import get_timers
from megatron.training import get_tokenizer
from megatron.core import mpu
from megatron.core.enums import ModelType
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset
from megatron.core.rerun_state_machine import get_rerun_state_machine
import megatron.legacy.model
from megatron.core.models.gpt import GPTModel
from megatron.training import pretrain
from megatron.core.utils import StragglerDetector
from megatron.core.transformer.spec_utils import import_module
from megatron.training.utils import (
    get_batch_on_this_cp_rank,
    get_batch_on_this_tp_rank,
    get_blend_and_blend_per_split,
)
from megatron.training.arguments import core_transformer_config_from_args
from megatron.training.yaml_arguments import core_transformer_config_from_yaml
from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_decoder_block_spec,
    get_gpt_layer_local_spec,
    get_gpt_layer_with_transformer_engine_spec,
)
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules

from dcu_megatron.core.transformer.mtp.mtp_spec import get_mtp_spec
from dcu_megatron.core.utils import tensor_slide
from dcu_megatron import megatron_adaptor


stimer = StragglerDetector()
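# NOTE: model_provider below follows the structure of upstream Megatron-LM's
# pretrain_gpt.py. The customization in this repository appears to be the
# multi-token prediction (MTP) wiring: the last decoder layer spec is reused to
# build an MTP layer spec via get_mtp_spec(), and the resulting mtp_spec is
# passed to GPTModel as an extra constructor argument provided by the
# dcu_megatron patches imported above.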
def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
    """Builds the model.

    If use_legacy_models is set to True, this returns the legacy GPT model; otherwise it returns the mcore GPT model.

    Args:
        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
        post_process (bool, optional): Set to true if you want to compute output logits/loss. Defaults to True.

    Returns:
        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
    """
    args = get_args()
    use_te = args.transformer_impl == "transformer_engine"

    if args.record_memory_history:
        torch.cuda.memory._record_memory_history(
            True,
            # keep 100,000 alloc/free events from before the snapshot
            trace_alloc_max_entries=100000,
            # record stack information for the trace events
            trace_alloc_record_context=True,
        )

        def oom_observer(device, alloc, device_alloc, device_free):
            # snapshot right after an OOM happened
            print('saving allocated state during OOM')
            snapshot = torch.cuda.memory._snapshot()
            from pickle import dump
            dump(snapshot, open(f"oom_rank-{torch.distributed.get_rank()}_{args.memory_snapshot_path}", 'wb'))

        torch._C._cuda_attach_out_of_memory_observer(oom_observer)

    print_rank_0('building GPT model ...')

    # Experimental loading arguments from yaml
    if args.yaml_cfg is not None:
        config = core_transformer_config_from_yaml(args, "language_model")
    else:
        config = core_transformer_config_from_args(args)

    print_rank_0(f"config: {config}")

    if args.use_legacy_models:
        model = megatron.legacy.model.GPTModel(
            config,
            num_tokentypes=0,
            parallel_output=True,
            pre_process=pre_process,
            post_process=post_process,
        )
    else:
        # using core models
        if args.spec is not None:
            transformer_layer_spec = import_module(args.spec)
        else:
            if args.num_experts:
                # Define the decoder block spec
                transformer_layer_spec = get_gpt_decoder_block_spec(config, use_transformer_engine=use_te)
            else:
                # Define the decoder layer spec
                if use_te:
                    transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
                        args.num_experts,
                        args.moe_grouped_gemm,
                        args.qk_layernorm,
                        args.multi_latent_attention,
                        args.moe_use_legacy_grouped_gemm,
                    )
                else:
                    transformer_layer_spec = get_gpt_layer_local_spec(
                        args.num_experts,
                        args.moe_grouped_gemm,
                        args.qk_layernorm,
                        args.multi_latent_attention,
                        args.moe_use_legacy_grouped_gemm,
                    )

        build_model_context = nullcontext
        build_model_context_args = {}
        if args.fp8_param_gather:
            try:
                from transformer_engine.pytorch import fp8_model_init

                build_model_context = fp8_model_init
                build_model_context_args["enabled"] = True

                # Check if fp8_model_init supports preserve_high_precision_init_val
                if "preserve_high_precision_init_val" in inspect.signature(fp8_model_init).parameters:
                    build_model_context_args["preserve_high_precision_init_val"] = True
            except:
                raise RuntimeError(
                    "--fp8-param-gather requires `fp8_model_init` from TransformerEngine, but not found."
                )

        # Define the mtp layer spec
        if isinstance(transformer_layer_spec, TransformerBlockSubmodules):
            mtp_transformer_layer_spec = transformer_layer_spec.layer_specs[-1]
        else:
            mtp_transformer_layer_spec = transformer_layer_spec
        mtp_spec = get_mtp_spec(mtp_transformer_layer_spec, use_te=use_te)

        with build_model_context(**build_model_context_args):
            model = GPTModel(
                config=config,
                transformer_layer_spec=transformer_layer_spec,
                vocab_size=args.padded_vocab_size,
                max_sequence_length=args.max_position_embeddings,
                pre_process=pre_process,
                post_process=post_process,
                fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
                parallel_output=True,
                share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
                position_embedding_type=args.position_embedding_type,
                rotary_percent=args.rotary_percent,
                rotary_base=args.rotary_base,
                rope_scaling=args.use_rope_scaling,
                mtp_spec=mtp_spec,
            )

    # model = torch.compile(model, mode='max-autotune-no-cudagraphs')
    print_rank_0(model)

    return model
def get_batch(data_iterator):
    """Generate a batch."""

    # TODO: this is pretty hacky, find a better way
    if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
        return None, None, None, None, None

    # get batches based on the TP rank you are on
    batch = get_batch_on_this_tp_rank(data_iterator)

    # slice batch along sequence dimension for context parallelism
    batch = get_batch_on_this_cp_rank(batch)

    return batch.values()
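# NOTE: loss_func below differs from upstream Megatron-LM in one place: when
# args.num_nextn_predict_layers > 0 the loss mask is passed through
# tensor_slide() (from dcu_megatron.core.utils) so it lines up with the model
# output once the extra multi-token-prediction target tokens are accounted for;
# return_first=True presumably keeps only the first (un-shifted) window. The
# loss is then packed as a 2-element tensor [masked loss sum, token count] so a
# single all-reduce can aggregate both values across ranks.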
# define spiky loss as a loss that's 10x the max loss observed
SPIKY_LOSS_FACTOR = 10


def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
    """Loss function.

    Args:
        loss_mask (torch.Tensor): Used to mask out some portions of the loss
        output_tensor (torch.Tensor): The tensor with the losses

    Returns:
        the loss scalar for this micro-batch
        the number of non-padded tokens in this microbatch
        a dict containing reporting metrics on the loss and number of tokens across
            the data parallel ranks
    """
    args = get_args()

    losses = output_tensor.float()
    if args.num_nextn_predict_layers > 0:
        loss_mask = tensor_slide(loss_mask, args.num_nextn_predict_layers, return_first=True)[0]
    loss_mask = loss_mask.view(-1).float()
    total_tokens = loss_mask.sum()
    loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])

    if args.context_parallel_size > 1:
        torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())

    # Check individual rank losses are not NaN prior to DP all-reduce.
    rerun_state_machine = get_rerun_state_machine()
    if args.check_for_nan_in_loss_and_grad:
        rerun_state_machine.validate_result(
            result=loss[0],
            rejection_func=torch.isnan,
            message="found NaN in local forward loss calculation",
            tolerance=0.0,  # forward pass calculations are deterministic
            fatal=True,
        )
        rerun_state_machine.validate_result(
            result=loss[0],
            rejection_func=torch.isinf,
            message="found Inf in local forward loss calculation",
            tolerance=0.0,  # forward pass calculations are deterministic
            fatal=True,
        )
    # Check for spiky loss
    if args.check_for_spiky_loss:
        rerun_state_machine.validate_result(
            result=loss[0],
            rejection_func=partial(
                rerun_state_machine.is_unexpectedly_large,
                threshold=SPIKY_LOSS_FACTOR,
                context="loss",
            ),
            message="Spiky loss",
            tolerance=0.0,  # forward pass calculations are deterministic
            fatal=False,
        )
    # Reduce loss for logging.
    reporting_loss = loss.clone().detach()
    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

    local_num_tokens = loss[1].clone().detach().to(torch.int)
    return (
        loss[0] * args.context_parallel_size,
        local_num_tokens,
        {'lm loss': (reporting_loss[0], reporting_loss[1])},
    )
def forward_step(data_iterator, model: GPTModel):
    """Forward training step.

    Args:
        data_iterator : Input data iterator
        model (GPTModel): The GPT Model
    """
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch-generator', log_level=2).start()
    global stimer
    with stimer(bdata=True):
        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator)
    timers('batch-generator').stop()

    with stimer:
        output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

    return output_tensor, partial(loss_func, loss_mask)
def is_dataset_built_on_rank():
    return (
        mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()
    ) and mpu.get_tensor_model_parallel_rank() == 0
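# NOTE: compared with upstream, the dataset config below extends
# sequence_length by args.num_nextn_predict_layers, so each sample carries the
# extra trailing tokens needed to build the shifted labels for the multi-token
# prediction heads; get_batch_on_this_tp_rank() in dcu_megatron/training/utils.py
# allocates its receive buffers with the same extended length.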
def core_gpt_dataset_config_from_args(args):
    tokenizer = get_tokenizer()

    # Sometimes --data-path is too long, instead we parse it from a file.
    blend: Optional[Tuple[List[str], Optional[List[float]]]]
    blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]
    blend, blend_per_split = get_blend_and_blend_per_split(args)

    return GPTDatasetConfig(
        random_seed=args.seed,
        sequence_length=args.seq_length + args.num_nextn_predict_layers,
        blend=blend,
        blend_per_split=blend_per_split,
        split=args.split,
        num_dataset_builder_threads=args.num_dataset_builder_threads,
        path_to_cache=args.data_cache_path,
        mmap_bin_files=args.mmap_bin_files,
        tokenizer=tokenizer,
        reset_position_ids=args.reset_position_ids,
        reset_attention_mask=args.reset_attention_mask,
        eod_mask_loss=args.eod_mask_loss,
        create_attention_mask=args.create_attention_mask_in_dataloader,
        s3_cache_path=args.s3_cache_path,
    )
def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build the train, test, and validation datasets.

    Args:
        train_val_test_num_samples : A list containing the number of samples in the train, test, and validation sets.
    """
    args = get_args()

    config = core_gpt_dataset_config_from_args(args)

    if args.mock_data:
        dataset_type = MockGPTDataset
    else:
        dataset_type = GPTDataset

    print_rank_0("> building train, validation, and test datasets for GPT ...")

    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
        dataset_type,
        train_val_test_num_samples,
        is_dataset_built_on_rank,
        config
    ).build()

    print_rank_0("> finished creating GPT datasets ...")

    return train_ds, valid_ds, test_ds
if __name__ == "__main__":

    # Temporary for transition to core datasets
    train_valid_test_datasets_provider.is_distributed = True

    pretrain(
        train_valid_test_datasets_provider,
        model_provider,
        ModelType.encoder_or_decoder,
        forward_step,
        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
    )
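# A hypothetical launch, for illustration only -- the exact flags depend on the
# Megatron-LM version and on the extra arguments registered by dcu_megatron
# (in particular, the flag name for num_nextn_predict_layers is assumed here):
#
#   torchrun --nproc_per_node=8 pretrain_gpt.py \
#       --tensor-model-parallel-size 2 \
#       --pipeline-model-parallel-size 2 \
#       --seq-length 4096 --micro-batch-size 1 \
#       --tokenizer-type GPT2BPETokenizer \
#       --data-path <path-to-preprocessed-data> \
#       --num-nextn-predict-layers 1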