Commit 4b097dee authored by liangjing

update to core_v0.9

parent 3aca1415
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
class InferenceParams:
"""Inference parameters that are passed to the main model in order
to efficiently calculate and store the context during inference."""
...@@ -25,3 +26,6 @@ class InferenceParams:
new_inference_key_memory,
new_inference_value_memory,
)
def __str__(self):
return f"InferenceParams(max_seq_len = {self.max_sequence_length}, max_batch_size = {self.max_batch_size}, sequence_len_offset = {self.sequence_len_offset}, batch_size_offset = {self.batch_size_offset}, key_value_memory_dict = {self.key_value_memory_dict.keys()})"
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
jit_fuser = torch.jit.script
# nvFuser is deprecated in PyTorch JIT starting from 2.2
if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2):
jit_fuser = torch.compile
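A short, hedged sketch of how the jit_fuser alias is typically applied; the decorated function is only an example and is not part of the diff. On PyTorch < 2.2 it resolves to torch.jit.script, on >= 2.2 to torch.compile.
@jit_fuser
def bias_gelu_example(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # tanh approximation of GELU applied to (bias + y); purely illustrative
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))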
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Callable, ContextManager, Optional
import torch
...@@ -10,152 +10,298 @@ import torch
class ModelParallelConfig:
"""Base configuration for Megatron Core
The initialization function has an argument for each parameter.
"""
###################
# Model parallelism
###################
tensor_model_parallel_size: int = 1
"""Intra-layer model parallelism. Splits tensors across GPU ranks."""
pipeline_model_parallel_size: int = 1
"""Inter-layer model parallelism. Splits transformer layers across GPU ranks."""
virtual_pipeline_model_parallel_size: Optional[int] = None
"""Interleaved pipeline parallelism is used to improve performance by reducing the pipeline
bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks.
The number of virtual blocks per pipeline model parallel rank is the virtual model parallel
size. See Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM:
arxiv.org/pdf/2104.04473.pdf for more details.
"""
sequence_parallel: bool = False
"""Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms
and dropout sequentially. See Reducing Activation Recomputation in Large Transformer Models
(https://arxiv.org/abs/2205.05198) for more details.
"""
context_parallel_size: int = 1
"""Splits network input along sequence dimension across GPU ranks."""
expert_model_parallel_size: int = 1
"""Distributes MoE experts across the sub data parallel dimension."""
moe_extended_tp: bool = False
"""Alternative parallelization strategy for expert parallelism. Instead of distributing experts
across expert_model_parallel_size, each expert is sharded along the extended tensor parallel
domain (tensor_model_parallel_size * expert_model_parallel_size). It avoids the load balancing
problem with MoE training.
"""
###################
# Initialization
###################
perform_initialization: bool = True
"""If true, weights are initialized. This option can be useful when you know you are going to
load values from a checkpoint.
"""
use_cpu_initialization: bool = False
"""When set to False, we initialize the weights directly on the GPU. CPU initialization is the
same regardless of tensor model parallelism, but GPU initialization is not. Transferring
weights from CPU to GPU can take a significant amount of time for large models.
"""
###################
# Training
###################
fp16: bool = False
"""If true, train with fp16 mixed precision training."""
bf16: bool = False
"""If true, train with bf16 mixed precision training."""
params_dtype: torch.dtype = torch.float32
"""dtype used when initializing the weights."""
timers: Callable = None
"""Timers object to call for various timing functions. See megatron.core.timers.Timers"""
finalize_model_grads_func: Callable = None
"""Function that finalizes gradients on all workers. Could include ensuring that grads are
all-reduced across data parallelism, pipeline parallelism, and sequence parallelism
dimensions.
"""
grad_scale_func: Callable = None
"""If using loss scaling, this function should take the loss and return the scaled loss. If
None, no function is called on the loss.
"""
no_sync_func: Callable = None
"""Function that creates a context that suppresses asynchronous data-parallel communication. If
the model is an instance of core.distributed.DistributedDataParallel, the default is to use
core.distributed.DistributedDataParallel.no_sync.
"""
grad_sync_func: Callable = None
"""Function that launches asynchronous gradient reductions (e.g. distributed optimizer gradient
reduce-scatters). The function should take one argument: an iterable of parameters whose
gradients are to be synchronized.
"""
param_sync_func: Callable = None
"""Function that launches asynchronous parameter synchronizations (e.g. distributed optimizer
parameter all-gathers). The function should take one argument: an iterable of parameters to
be synchronized.
"""
deterministic_mode: bool = False
"""If true, code that has deterministic execution will be chosen. This usually
means slower execution, but is good for debugging and testing. Defaults to False."""
enable_autocast: bool = False
"""If true, runs the forward step function inside a torch.autocast context."""
autocast_dtype: torch.dtype = None
"""dtype to pass to torch.amp.autocast when enabled. If None, is set to pipeline_dtype."""
num_microbatches_with_partial_activation_checkpoints: Optional[int] = None
"""If int, set the number of microbatches where not all of the layers will be checkpointed and
recomputed. The rest of the microbatches within the window of maximum outstanding
microbatches will recompute all layers (either full recompute or selective recompute). If
None, the checkpoint and recompute will be left up to the forward_step function.
"""
###################
# Optimizations
###################
gradient_accumulation_fusion: bool = False
"""If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA extension
fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install
APEX with --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\"
--global-option=\"--cuda_ext\" ". Note that the extension requires CUDA>=11. Otherwise, you
must turn off gradient accumulation fusion.
"""
async_tensor_model_parallel_allreduce: bool = False
"""NOTE: Deprecated. This flag is ignored."""
use_te_rng_tracker: bool = False
"""If true, uses the RNG state tracker in TransformerEngine if it exists.
"""
tp_comm_overlap: bool = False
"""If true, allows overlapping of Linear layer execution with tensor parallel communication
collectives like AllGather/ReduceScatter. Overlapping is done for the linear layers wherever
possible during the forward and the backward pass.
"""
tp_comm_bulk_wgrad: bool = True
"""If true, allows All-Gather overlap with Bprop activation gradient GEMM. Don't care if
tp_comm_overlap is False.
"""
tp_comm_bulk_dgrad: bool = True
"""If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't care if
tp_comm_overlap is False.
"""
tp_comm_overlap_ag: bool = True
"""If true, allows All-Gather overlap with GEMM by pipelining the GEMM and All-Gather.
Don't care if tp_comm_overlap is False.
"""
tp_comm_overlap_rs: bool = True
"""If true, allows Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter.
Don't care if tp_comm_overlap is False.
"""
tp_comm_overlap_rs_dgrad: bool = False
"""If true, allows Reduce-Scatter overlap with DGRAD GEMM by pipelining the
GEMM and Reduce-Scatter splits. Don't care if tp_comm_overlap is False.
"""
tp_comm_split_ag: bool = True
"""Deprecated from TransformerEngine v1.6.0.
If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather
splits. Don't care if tp_comm_overlap is False.
"""
tp_comm_atomic_ag: bool = False
"""Deprecated from TransformerEngine v1.6.0.
If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather both
done atomically. Don't care if tp_comm_overlap is False.
"""
tp_comm_split_rs: bool = True
"""Deprecated from TransformerEngine v1.6.0.
If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
Reduce-Scatter splits. Don't care if tp_comm_overlap is False.
"""
tp_comm_atomic_rs: bool = False
"""Deprecated from TransformerEngine v1.6.0.
If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
Reduce-Scatter both done atomically. Don't care if tp_comm_overlap is False.
"""
cross_entropy_loss_fusion: bool = False
"""If this is enabled, the fused cross entropy implementation would be used.
Defaults to False.
"""
tp_comm_overlap_disable_qkv: bool = False
"""
If true, the AllGather -> Gemm overlap for QKV gets disabled
"""
tp_comm_overlap_disable_fc1: bool = False
"""
If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled
"""
###################
# Pipeline Parallel
###################
pipeline_dtype: torch.dtype = None
"""dtype used in p2p communication, usually params_dtype"""
variable_seq_lengths: bool = False
"""Support for variable sequence lengths across microbatches. Setting this communicates the size
of tensors during pipeline parallelism communication, because of this extra overhead it
should only be set if the sequence length varies by microbatch within a global batch.
"""
overlap_p2p_comm: bool = False
"""When True, some of the peer to peer communication for pipeline parallelism will overlap with
computation. Must be False if batch_p2p_comm is true.
"""
batch_p2p_comm: bool = True
"""Use batch_isend_irecv instead of individual isend/irecv calls. Must be False if
overlap_p2p_comm is True.
"""
batch_p2p_sync: bool = True
"""When using batch_isend_irecv, do a cuda.device.synchronize afterward to work around a bug in
older versions of PyTorch.
"""
use_ring_exchange_p2p: bool = False
"""Use custom ring_exchange kernel instead of torch.distributed.batch_isend_irecv(). Requires
custom built torch with torch.distributed.ring_exchange.
"""
deallocate_pipeline_outputs: bool = False
"""If True, output data is deallocated after the tensor is sent to the next pipeline stage.
Helps with saving memory, does nothing when pipeline parallel is not used.
"""
defer_embedding_wgrad_compute: bool = False
"""If true, defers the embedding WGRAD GEMMs while pipeline flush is
taking place enabling us to hide pipeline flush latency. Defaults to False.
"""
wgrad_deferral_limit: int = 0
"""This value tunes the number of micro-batches for which the embedding weight gradient compute
needs to be deferred to pipeline flush, this argument is invalid if `defer_embedding_wgrad_compute` is False.
Defaults to 0, which means all micro-batches are deferred.
"""
pipeline_model_parallel_split_rank: Optional[int] = None
"""If int, rank where encoder and decoder should be split in cases where the model has both an
encoder and decoder (e.g., T5). Ignored if None.
"""
###################
# CPU Offloading
###################
cpu_offloading: bool = False
"""When set to True, all the activations are offloaded to the CPU asynchronously."""
cpu_offloading_num_layers: int = 0
"""Tells the number of transformer layers for which activations has to be offloaded."""
_cpu_offloading_context: ContextManager = (
None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible.
)
"""For internal use only, do not set."""
cpu_offloading_activations: bool = True
"""If True, offloads the activations to CPU."""
cpu_offloading_weights: bool = True
"""If True, offloads the weights to CPU."""
###################
# Timing
###################
barrier_with_L1_time: bool = True
"""If true, use barrier with level 1 time measurements. It is up to the user to make sure
calling barrier with their timers will not result in hangs. This can happen if for example
the user adds a level 1 timer that is not called by all ranks.
"""
def __post_init__(self):
"""Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
if self.sequence_parallel:
if self.tensor_model_parallel_size <= 1:
raise ValueError("Can not use sequence parallelism without tensor parallelism")
if self.pipeline_model_parallel_size > 1:
if self.pipeline_dtype is None:
...@@ -165,3 +311,24 @@ class ModelParallelConfig:
if self.autocast_dtype is None:
self.autocast_dtype = self.params_dtype
if self.defer_embedding_wgrad_compute and self.pipeline_model_parallel_size == 1:
raise ValueError(
"Cannot defer embedding wgrad compute when pipeline model parallel is not used"
)
if self.defer_embedding_wgrad_compute and not self.gradient_accumulation_fusion:
raise ValueError(
"Cannot defer embedding wgrad compute when gradient accumulation fusion is not used"
)
if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0:
raise ValueError(
"Wgrad deferral limit should be greater than or equal to 0 when this optimization is enabled!"
)
if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1:
if self.sequence_parallel is False:
raise ValueError(
"When using expert parallelism and tensor parallelism, sequence parallelism must be used"
)
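For orientation, here is a brief, hedged construction sketch (not part of the commit); the parallel sizes and dtypes are arbitrary placeholders chosen to satisfy the validation logic shown above.
# Illustrative sketch only: a config that passes the visible __post_init__ checks.
example_config = ModelParallelConfig(
    tensor_model_parallel_size=2,
    pipeline_model_parallel_size=2,
    sequence_parallel=True,         # requires tensor_model_parallel_size > 1
    pipeline_dtype=torch.bfloat16,  # required when pipeline_model_parallel_size > 1
    bf16=True,
)
# Expert parallelism combined with tensor parallelism additionally requires
# sequence_parallel=True, and defer_embedding_wgrad_compute requires both
# pipeline parallelism and gradient_accumulation_fusion.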
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from .t5_model import T5Model
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from typing import List, Literal, Optional
import torch
from torch import Tensor
from megatron.core import InferenceParams, tensor_parallel
from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.models.common.language_module.language_module import LanguageModule
from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_config import TransformerConfig
class T5LMHead(MegatronModule):
"""Masked LM head for T5
Args:
config (TransformerConfig): transformer config
parallel_output (bool): whether output logits are distributed or not.
vocab_size (int): vocabulary size
pre_process (bool): Include embedding layer
share_embeddings_and_output_weights (bool): When True, input
embeddings and output logit weights are shared.
"""
def __init__(
self,
config: TransformerConfig,
parallel_output: bool,
vocab_size: int,
pre_process: bool = True,
share_embeddings_and_output_weights: bool = False,
):
super(T5LMHead, self).__init__(config=config)
if has_config_logger_enabled(config):
log_config_to_disk(config, locals(), prefix=type(self).__name__)
self.parallel_output = parallel_output
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
vocab_size,
config=config,
init_method=config.init_method,
bias=share_embeddings_and_output_weights,
skip_bias_add=not share_embeddings_and_output_weights,
gather_output=not self.parallel_output,
skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
)
def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor:
"""Forward pass.
Args:
hidden_states (Tensor): output hidden states from decoder
word_embeddings_weight (Tensor): word embedding weight
Returns:
Tensor: logits tensor
"""
logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight)
return logits
class T5Model(LanguageModule):
"""T5 Language model.
Args:
config (TransformerConfig): transformer config
encoder_config (TransformerConfig): encoder transformer config
transformer_encoder_layer_spec (ModuleSpec): transformer layer
customization specs for encoder
transformer_decoder_layer_spec (ModuleSpec): transformer layer
customization specs for decoder
vocab_size (int): vocabulary size
max_sequence_length (int): maximum size of sequence. This is used for positional embedding
pre_process (bool): Include embedding layer (used with pipeline parallelism)
post_process (bool): Include an output layer (used with pipeline parallelism)
fp16_lm_cross_entropy (bool, optional): Defaults to False
parallel_output (bool): Do not gather the outputs,
keep them split across tensor parallel ranks
share_embeddings_and_output_weights (bool): When True,
input embeddings and output logit weights are shared. Defaults to False.
position_embedding_type (string): Position embedding type.
Options ['learned_absolute', 'rope'].
Default is 'learned_absolute'.
rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
seq_len_interpolation_factor (float): scale of linearly interpolating
RoPE for longer sequences. The value must be a float larger than 1.0.
Defaults to None.
add_encoder (bool): Create the encoder (used with pipeline parallelism).
When using pipelining, the encoder will only be created on a subset
of the pipeline ranks.
add_decoder (bool): Include an output layer (used with pipeline parallelism).
As with `add_encoder`, when using this model and pipelining,
the decoder will only be created on a subset of the pipeline ranks.
"""
def __init__(
self,
config: TransformerConfig,
encoder_config: TransformerConfig,
transformer_encoder_layer_spec: ModuleSpec,
transformer_decoder_layer_spec: ModuleSpec,
vocab_size: int,
max_sequence_length: int,
pre_process: bool = True,
post_process: bool = True,
fp16_lm_cross_entropy: bool = False,
parallel_output: bool = True,
share_embeddings_and_output_weights: bool = False,
position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
rotary_percent: float = 1.0,
seq_len_interpolation_factor: Optional[float] = None,
add_encoder: bool = True,
add_decoder: bool = True,
):
super(T5Model, self).__init__(config=config)
self.config: TransformerConfig = config
self.encoder_config: TransformerConfig = encoder_config
self.transformer_encoder_layer_spec: ModuleSpec = transformer_encoder_layer_spec
self.transformer_decoder_layer_spec: ModuleSpec = transformer_decoder_layer_spec
self.vocab_size = vocab_size
self.max_sequence_length = max_sequence_length
self.pre_process = pre_process
self.post_process = post_process
self.add_encoder = add_encoder
self.add_decoder = add_decoder
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
self.parallel_output = parallel_output
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.position_embedding_type = position_embedding_type
self.encoder_hidden_state = None
self.model_type = ModelType.encoder_and_decoder
# Tells schedules.py that this model has a skip connection
# between the encoder's output and the decoder
# (and hence both the encoder and decoder's tensors are required for correct backprop).
self.xattn_needed = True
# specify the position embeddings as a member
# variable in the T5 class so that they are easy to
# find for `finalize_model_grads._allreduce_position_embedding_grads`
self.position_embeddings = None
if self.pre_process:
self.embedding = LanguageModelEmbedding(
config=self.config,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
position_embedding_type=self.position_embedding_type,
)
self.position_embeddings = self.embedding.position_embeddings
# Rotary Position Embeddings
if self.position_embedding_type == 'rope':
self.rotary_pos_emb = RotaryEmbedding(
kv_channels=self.config.kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=self.config.rotary_interleaved,
seq_len_interpolation_factor=seq_len_interpolation_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
# Transformer encoder
encoder_spec, decoder_spec = (
self.transformer_encoder_layer_spec,
self.transformer_decoder_layer_spec,
)
if self.add_encoder:
self.encoder = TransformerBlock(
config=self.encoder_config,
spec=encoder_spec,
pre_process=self.pre_process,
post_process=self.post_process,
)
else:
self.encoder = None
if self.add_decoder:
# Transformer decoder
self.decoder = TransformerBlock(
config=self.config,
spec=decoder_spec,
pre_process=self.pre_process,
post_process=self.post_process,
)
else:
self.decoder = None
# Output
if post_process:
self.lm_head = T5LMHead(
config,
parallel_output,
self.vocab_size,
self.pre_process,
self.share_embeddings_and_output_weights,
)
self.output_layer = self.lm_head.output_layer
if self.pre_process or self.post_process:
self.setup_embeddings_and_output_layer()
def forward(
self,
encoder_input_ids: Tensor,
decoder_input_ids: Tensor,
encoder_attn_mask: Tensor,
decoder_attn_mask: Tensor,
encoder_decoder_attn_mask: Tensor,
lm_labels: Tensor = None,
encoder_hidden_states: Tensor = None,
output_encoder_hidden_only: bool = False,
inference_params: InferenceParams = None,
) -> Tensor:
"""Forward pass.
Args:
encoder_input_ids (Tensor): input ids for encoder
decoder_input_ids (Tensor): input ids for decoder
encoder_attn_mask (Tensor): self-attention mask for encoder
decoder_attn_mask (Tensor): self-attention mask for decoder
encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder
lm_labels (Tensor): labels for decoder output
inference_params (InferenceParams): relevant arguments for inferencing
Returns:
Tensor: loss tensor
"""
(encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask) = (
t5_extended_attention_mask(
[encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask]
)
)
## Encoder forward
if encoder_hidden_states is None:
# Encoder position ids
encoder_position_ids = t5_position_ids(encoder_input_ids)
# Encoder embedding.
if self.pre_process:
encoder_input = self.embedding(
input_ids=encoder_input_ids, position_ids=encoder_position_ids
)
else:
# intermediate stage of pipeline
encoder_input = None
# Rotary positional embeddings
rotary_pos_emb = None
if self.position_embedding_type == 'rope':
rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
inference_params, self.encoder, encoder_input, self.config
)
rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
# Run encoder.
if self.add_encoder:
encoder_hidden_states = self.encoder(
hidden_states=encoder_input,
attention_mask=encoder_attn_mask,
inference_params=inference_params,
rotary_pos_emb=rotary_pos_emb,
)
else:
encoder_hidden_states = self.encoder_hidden_state
if not self.add_decoder or output_encoder_hidden_only:
return encoder_hidden_states
## Decoder forward
# Decoder position ids
decoder_position_ids = t5_position_ids(decoder_input_ids)
# Decoder embedding.
if self.pre_process:
decoder_input = self.embedding(
input_ids=decoder_input_ids, position_ids=decoder_position_ids
)
else:
# intermediate stage of pipeline
decoder_input = None ### should it take encoder_hidden_states
# Rotary positional embeddings
rotary_pos_emb = None
if self.position_embedding_type == 'rope':
rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
inference_params, self.decoder, decoder_input, self.config
)
rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
# Run decoder.
decoder_hidden_states = self.decoder(
hidden_states=decoder_input,
attention_mask=decoder_attn_mask,
context=encoder_hidden_states,
context_mask=encoder_decoder_attn_mask,
inference_params=inference_params,
rotary_pos_emb=rotary_pos_emb,
)
if self.post_process:
lm_logits = self.lm_head(
decoder_hidden_states, self.shared_embedding_or_output_weight()
)
if lm_labels is None:
# [s b h] => [b s h]
return lm_logits.transpose(0, 1).contiguous()
else:
# [b s] => [s b]
lm_loss = self.compute_language_model_loss(lm_labels, lm_logits)
return lm_loss
else:
return decoder_hidden_states
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
# This is usually handled in schedules.py but some inference code still
# gives us non-lists or None
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
if self.add_encoder and self.add_decoder:
assert (
len(input_tensor) == 1
), 'input_tensor should only be length 1 for stage with both encoder and decoder'
self.encoder.set_input_tensor(input_tensor[0])
elif self.add_encoder:
assert (
len(input_tensor) == 1
), 'input_tensor should only be length 1 for stage with only encoder'
self.encoder.set_input_tensor(input_tensor[0])
elif self.add_decoder:
if len(input_tensor) == 2:
self.decoder.set_input_tensor(input_tensor[0])
self.encoder_hidden_state = input_tensor[1]
elif len(input_tensor) == 1:
self.decoder.set_input_tensor(None)
self.encoder_hidden_state = input_tensor[0]
else:
raise Exception('input_tensor must have either length 1 or 2')
else:
raise Exception('Stage must have at least either encoder or decoder')
def shared_embedding_or_output_weight(self) -> Tensor:
"""Function to share the input embeddings and output logit weights."""
if self.pre_process:
return self.embedding.word_embeddings.weight
elif self.post_process:
return self.lm_head.output_layer.weight
return None
def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> List[Tensor]:
"""Creates the extended attention mask
Converts the attention mask of dimension [batch size, seq_len, seq_len]
to [batch size, 1, seq_len, seq_len]
Args:
attention_mask_list (List[Tensor]): The input attention masks
Returns:
List[Tensor]: The extended attention masks
"""
def attn_mask_postprocess(attn_mask):
# [b, 1, s, s]
extended_attention_mask = attn_mask.unsqueeze(1)
return extended_attention_mask
return [
(attn_mask_postprocess(attn_mask) if attn_mask is not None else None)
for attn_mask in attention_mask_list
]
def t5_position_ids(token_ids: Tensor) -> Tensor:
"""Calculate position ids from token ids
Args:
token_ids (Tensor): input tokens
Returns:
Tensor: position ids
"""
seq_length = token_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
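A toy illustration of the two helpers above (not part of the commit); the tensors are made up and only the shapes matter.
# Illustrative sketch only: shape behaviour of the T5 mask/position helpers.
tokens = torch.zeros(2, 5, dtype=torch.long)          # [batch, seq]
pos_ids = t5_position_ids(tokens)                     # [2, 5], each row is 0..4
enc_mask = torch.ones(2, 5, 5, dtype=torch.bool)      # [b, s, s]
extended, = t5_extended_attention_mask([enc_mask])    # [b, 1, s, s] after unsqueeze(1)
assert pos_ids.shape == (2, 5) and extended.shape == (2, 1, 5, 5)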
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import (
CrossAttention,
CrossAttentionSubmodules,
SelfAttention,
SelfAttentionSubmodules,
)
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex # pylint: disable=unused-import
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
warnings.warn('Apex is not installed. Falling back to Torch LayerNorm')
LNImpl = WrappedTorchLayerNorm
def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
"""T5 encoder TE spec (uses Transformer Engine components)."""
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.arbitrary},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
),
)
def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
"""T5 decoder TE spec (uses Transformer Engine components)."""
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_cross_attn_layernorm=TENorm,
cross_attention=ModuleSpec(
module=CrossAttention,
params={"attn_mask_type": AttnMaskType.arbitrary},
submodules=CrossAttentionSubmodules(
linear_q=TEColumnParallelLinear,
linear_kv=TEColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
),
),
cross_attn_bda=get_bias_dropout_add,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
),
)
def encoder_model_with_local_spec() -> ModuleSpec:
"""T5 encoder local spec (uses Megatron-Core components)."""
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=LNImpl,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.arbitrary},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=LNImpl,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
sharded_state_dict_keys_map={
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
},
),
)
def decoder_model_with_local_spec() -> ModuleSpec:
"""T5 decoder local spec (uses Megatron-Core components)."""
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=LNImpl,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_cross_attn_layernorm=LNImpl,
cross_attention=ModuleSpec(
module=CrossAttention,
params={"attn_mask_type": AttnMaskType.arbitrary},
submodules=CrossAttentionSubmodules(
linear_q=ColumnParallelLinear,
linear_kv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
),
),
cross_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=LNImpl,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
sharded_state_dict_keys_map={
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
},
),
)
def get_t5_encoder_with_transformer_engine_block_spec(
num_layers: int,
) -> TransformerBlockSubmodules:
"""T5 encoder block spec for Transformer Engine
Args:
num_layers (int): number of encoder layers
"""
layer_spec = encoder_model_with_transformer_engine_default_spec()
block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm)
return block_spec
def get_t5_decoder_with_transformer_engine_block_spec(
num_layers: int,
) -> TransformerBlockSubmodules:
"""T5 decoder block spec for Transformer Engine
Args:
num_layers (int): number of decoder layers
"""
layer_spec = decoder_model_with_transformer_engine_default_spec()
block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm)
return block_spec
def get_t5_encoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules:
"""T5 encoder block spec for local (uses Megatron-Core components)
Args:
num_layers (int): number of encoder layers
"""
layer_spec = encoder_model_with_local_spec()
block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm)
return block_spec
def get_t5_decoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules:
"""T5 decoder block spec for local (uses Megatron-Core components)
Args:
num_layers (int): number of decoder layers
"""
layer_spec = decoder_model_with_local_spec()
block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm)
return block_spec
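A hedged sketch of how these helpers are typically consumed (not part of the commit); the layer count is a placeholder.
# Illustrative sketch only: build 12-layer encoder/decoder block specs.
encoder_block_spec = get_t5_encoder_with_transformer_engine_block_spec(num_layers=12)
decoder_block_spec = get_t5_decoder_with_transformer_engine_block_spec(num_layers=12)
# The resulting TransformerBlockSubmodules are then passed as the
# transformer_encoder_layer_spec / transformer_decoder_layer_spec arguments of T5Model.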
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex # pylint: disable=unused-import
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
warnings.warn('Apex is not installed. Falling back to Torch LayerNorm')
LNImpl = WrappedTorchLayerNorm
# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
bert_layer_with_transformer_engine_spec = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.padding},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
),
)
# Use this spec for an implementation using only modules in megatron core
bert_layer_local_spec = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=LNImpl,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.padding},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=LNImpl,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear),
),
mlp_bda=get_bias_dropout_add,
sharded_state_dict_keys_map={
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
},
),
)
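A hedged sketch of how a caller might pick between the two layer specs (not part of the commit); the selection logic is illustrative only.
# Illustrative sketch only: the specs are plain ModuleSpec trees, so callers can
# inspect or tweak them before building a TransformerBlock.
spec = bert_layer_with_transformer_engine_spec if HAVE_TE else bert_layer_local_spec
print(spec.submodules.self_attention.params["attn_mask_type"])  # AttnMaskType.padding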
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
from torch import Tensor
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.utils import get_linear_layer
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
warnings.warn('Apex is not installed. Falling back to Torch LayerNorm')
LNImpl = WrappedTorchLayerNorm
class BertLMHead(MegatronModule):
"""Masked LM head for Bert.
Args:
hidden_size: hidden size
config (TransformerConfig): TransformerConfig object
"""
def __init__(self, hidden_size: int, config: TransformerConfig):
super().__init__(config=config)
# TODO: Should switch this to TE ?
self.dense = get_linear_layer(
hidden_size, hidden_size, config.init_method, config.perform_initialization
)
setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel)
setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel)
self.layer_norm = LNImpl(
config=config, hidden_size=hidden_size, eps=config.layernorm_epsilon
)
self.gelu = torch.nn.functional.gelu
def forward(self, hidden_states: Tensor) -> Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.gelu(hidden_states)
hidden_states = self.layer_norm(hidden_states)
return hidden_states
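A brief, hedged usage sketch (not part of the commit); the TransformerConfig values are arbitrary placeholders.
# Illustrative sketch only: running the head over a [s, b, h] tensor.
example_config = TransformerConfig(num_layers=1, hidden_size=16, num_attention_heads=1)
head = BertLMHead(hidden_size=16, config=example_config)
out = head(torch.randn(8, 2, 16))  # dense -> gelu -> layer norm, shape preserved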
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import os
import warnings
from typing import Literal, Optional
import torch
from torch import Tensor
from megatron.core import parallel_state, tensor_parallel
from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec
from megatron.core.models.bert.bert_lm_head import BertLMHead
from megatron.core.models.bert.pooler import Pooler
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
from megatron.core.models.common.language_module.language_module import LanguageModule
from megatron.core.transformer.enums import AttnMaskType, ModelType
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.utils import get_linear_layer
from megatron.core.utils import get_te_version as _get_te_version
from megatron.core.utils import is_te_min_version
def get_te_version():
"""Included for backwards compatibility."""
warnings.warn("`get_te_version` will be deprecated in a future release")
return _get_te_version()
# pylint: disable=line-too-long
class BertModel(LanguageModule):
"""Transformer language model.
Args:
config (TransformerConfig): transformer config
num_tokentypes (int) : Set to 2 when args.bert_binary_head is True, and 0 otherwise. Defaults to 0.
transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers
vocab_size (int): vocabulary size
max_sequence_length (int): maximum size of sequence. This is used for positional embedding
pre_process (bool): Include embedding layer (used with pipeline parallelism)
post_process (bool): Include an output layer (used with pipeline parallelism)
parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks
share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False.
position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope'].
Default is 'learned_absolute'.
rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
"""
def __init__(
self,
config: TransformerConfig,
num_tokentypes: int,
transformer_layer_spec: ModuleSpec,
vocab_size: int,
max_sequence_length: int,
pre_process: bool = True,
post_process: bool = True,
fp16_lm_cross_entropy: bool = False,
parallel_output: bool = True,
share_embeddings_and_output_weights: bool = False,
position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
rotary_percent: float = 1.0,
seq_len_interpolation_factor: Optional[float] = None,
add_binary_head=True,
return_embeddings=False,
):
super(BertModel, self).__init__(config=config)
if has_config_logger_enabled(config):
log_config_to_disk(config, locals(), prefix=type(self).__name__)
if return_embeddings:
assert self.post_process and self.add_binary_head
self.config: TransformerConfig = config
self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
self.vocab_size = vocab_size
self.max_sequence_length = max_sequence_length
self.pre_process = pre_process
self.post_process = post_process
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
self.parallel_output = parallel_output
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.position_embedding_type = position_embedding_type
self.add_binary_head = add_binary_head
self.return_embeddings = return_embeddings
# megatron core pipelining currently depends on model type
self.model_type = ModelType.encoder_or_decoder
self.attn_mask_dimensions = self._sanity_check_attention_and_get_attn_mask_dimension()
# Embeddings.
if self.pre_process:
self.embedding = LanguageModelEmbedding(
config=self.config,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
position_embedding_type=position_embedding_type,
num_tokentypes=num_tokentypes,
)
if self.position_embedding_type == 'rope':
self.rotary_pos_emb = RotaryEmbedding(
kv_channels=self.config.kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=self.config.rotary_interleaved,
seq_len_interpolation_factor=seq_len_interpolation_factor,
use_cpu_initialization=self.config.use_cpu_initialization,
)
# Transformer.
self.encoder = TransformerBlock(
config=self.config,
spec=self.transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process,
)
# Output
if post_process:
# TODO: Make sure you are passing in the mpu_vocab_size properly
self.lm_head = BertLMHead(config.hidden_size, config)
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
config=config,
init_method=config.init_method,
bias=True,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
)
self.binary_head = None
if self.add_binary_head:
# TODO: Should switch this to TE?
self.binary_head = get_linear_layer(
config.hidden_size, 2, config.init_method, config.perform_initialization
)
self.pooler = Pooler(
config.hidden_size, config.init_method, config, config.sequence_parallel
)
if self.pre_process or self.post_process:
self.setup_embeddings_and_output_layer()
# pylint: disable=line-too-long
def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str:
"""We do some checks and return attention mask dimensions for self attention
The Transformer Engine library has changed a lot over time, so we need to change the
dimensions of the attention mask depending on the TE version. We also sanity check some arguments.
1. If we use local version of attention dimension of the mask is [b,1,s,s]
2. If we use transformer engine > 1.10 we support all 3 backends with padding mask and [b,1,s,s]
3. If we use transformer engine >= 1.7 but less than 1.10
a ) Flash and Fused attention uses padding mask with [b,1,1,s]
b ) Unfused attention works with arbitrary mask with [b,1,s,s]
4. If we use transformer engine < 1.7
Flash and fused attention are not supported. Unfused attention will work with a padding mask [b,1,s,s]
By default, if you don't set any NVTE attention flags, the fused path is used for Transformer Engine versions >= 1.7 and the unfused path otherwise.
Args:
transformer_layer_spec (ModuleSpec): The transformer layer spec
Returns:
str: A string showing the format of the attn mask dimensions
"""
attn_mask_dimensions = None
# For local layer spec we just use b1ss
if self.transformer_layer_spec == bert_layer_local_spec:
attn_mask_dimensions = "b1ss"
else:
attn_mask_type = self.transformer_layer_spec.submodules.self_attention.params[
'attn_mask_type'
]
flash_attention_enabled = os.getenv('NVTE_FLASH_ATTN') == '1'
fused_attention_enabled = os.getenv('NVTE_FUSED_ATTN') == '1'
# For TE >= 1.10 (We always use padding mask and use b11s)
if is_te_min_version("1.10.0"):
attn_mask_dimensions = "b11s"
if attn_mask_type != AttnMaskType.padding:
warnings.warn(
f'For TE versions >= 1.10, flash/fused/unfused support padding mask. Setting attention mask from {attn_mask_type} to padding'
)
self.transformer_layer_spec.submodules.self_attention.params[
'attn_mask_type'
] = AttnMaskType.padding
# For 1.7 >= TE < 1.10 flash and fused path use padding mask with b11s and unfused path uses arbitrary mask with b1ss
elif is_te_min_version("1.7.0"):
if flash_attention_enabled or fused_attention_enabled:
attn_mask_dimensions = "b11s"
else:
if attn_mask_type != AttnMaskType.arbitrary:
warnings.warn(
f'For TE versions >= 1.7 but < 1.10, unfused path supports only arbitrary mask. Setting attention mask from {attn_mask_type} to arbitrary'
)
self.transformer_layer_spec.submodules.self_attention.params[
'attn_mask_type'
] = AttnMaskType.arbitrary
attn_mask_dimensions = "b1ss"
# For TE < 1.7 we only support unfused attention with b1ss and padding mask
else:
attn_mask_dimensions = "b1ss"
assert not flash_attention_enabled and not fused_attention_enabled, (
"Flash and fused attention is not supported with transformer engine version "
"< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer "
"engine >= 1.7"
)
return attn_mask_dimensions
def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
"""Creates the extended attention mask
Converts the attention mask of dimension [batch size, 1, seq len] to [batch size, 1, seq len, seq len] or [batch size, 1, 1, seq_len] and makes it binary
Args:
attention_mask (Tensor): The input attention mask
Returns:
Tensor: The extended binary attention mask
"""
# We create a 3D attention mask from a 2D tensor mask.
if self.attn_mask_dimensions == "b1ss":
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
else:
# [b, 1, 1, s]
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
# Convert attention mask to binary:
extended_attention_mask = extended_attention_mask < 0.5
return extended_attention_mask
def bert_position_ids(self, token_ids):
"""Position ids for bert model"""
# Create position ids
seq_length = token_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
def set_input_tensor(self, input_tensor: Tensor) -> None:
"""Sets input tensor to the model.
See megatron.model.transformer.set_input_tensor()
Args:
input_tensor (Tensor): Sets the input tensor for the model.
"""
# This is usually handled in schedules.py but some inference code still
# gives us non-lists or None
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert'
self.encoder.set_input_tensor(input_tensor[0])
def forward(
self,
input_ids: Tensor,
attention_mask: Tensor,
tokentype_ids: Tensor = None,
lm_labels: Tensor = None,
inference_params=None,
):
"""Forward function of BERT model
Forward function of the BERT Model This function passes the input tensors
through the embedding layer, and then the encoder and finally into the post
processing layer (optional).
It either returns the Loss values if labels are given or the final hidden units
"""
extended_attention_mask = self.bert_extended_attention_mask(attention_mask)
if parallel_state.is_pipeline_first_stage():
input_ids = input_ids
position_ids = self.bert_position_ids(input_ids)
else:
position_ids = None
input_ids = None
# Encoder embedding.
if self.pre_process:
encoder_input = self.embedding(
input_ids=input_ids, position_ids=position_ids, tokentype_ids=tokentype_ids
)
else:
# intermediate stage of pipeline
# encoder will get hidden_states from encoder.input_tensor
encoder_input = None
# Rotary positional embeddings (Why not move this into BERT/GPTEmberdding ?)
rotary_pos_emb = None
if self.position_embedding_type == 'rope':
rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
inference_params, self.encoder, encoder_input, self.config
)
rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
# Run encoder.
hidden_states = self.encoder(
hidden_states=encoder_input,
attention_mask=extended_attention_mask,
inference_params=inference_params,
rotary_pos_emb=rotary_pos_emb,
)
if not self.post_process:
return hidden_states
if self.add_binary_head:
pooled_output = self.pooler(hidden_states, 0)
if self.return_embeddings:
embeddings = torch.transpose(hidden_states, 0, 1)
masks = torch.sum(attention_mask, dim=1)
# Collect masked embeddings.
output = torch.zeros(
size=(embeddings.shape[0], embeddings.shape[2]),
dtype=torch.float32,
device=torch.cuda.current_device(),
)
for i, (embedding, mask) in enumerate(zip(embeddings, masks)):
output[i, :] = torch.mean(embedding[1 : mask - 1], dim=0)
return output
# logits and loss
output_weight = None
if self.share_embeddings_and_output_weights:
output_weight = self.shared_embedding_or_output_weight()
hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states)
logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight)
binary_logits = None
if self.binary_head is not None:
binary_logits = self.binary_head(pooled_output)
if lm_labels is None:
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous(), binary_logits
loss = self.compute_language_model_loss(lm_labels, logits)
return loss, binary_logits
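For readers unfamiliar with the mask convention, here is a minimal, standalone sketch (plain torch only, re-implementing just the "b1ss" branch of bert_extended_attention_mask with made-up sizes) of how a [batch, seq] padding mask becomes the binary extended mask consumed by the encoder:

import torch

# Toy padding mask: batch of 2, seq len 4; 1 = real token, 0 = padding.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# Same steps as the "b1ss" branch of bert_extended_attention_mask.
attention_mask_b1s = attention_mask.unsqueeze(1)                    # [b, 1, s]
attention_mask_bs1 = attention_mask.unsqueeze(2)                    # [b, s, 1]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1        # [b, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1) < 0.5     # [b, 1, s, s]

print(extended_attention_mask.shape)   # torch.Size([2, 1, 4, 4])
print(extended_attention_mask[0, 0])   # row/column of the padded position are True (masked)

The `< 0.5` comparison flips the convention so that True marks positions the encoder should not attend to.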
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
from torch import Tensor
from megatron.core import tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.utils import get_linear_layer
class Pooler(MegatronModule):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Args:
hidden_size (int): The hidden size
init_method (callable): Weight initialization method for the linear layer. Bias is set to zero.
config (TransformerConfig): The transformer configuration
sequence_parallel (bool): Whether sequence parallelism is used. Defaults to False
"""
def __init__(
self,
hidden_size: int,
init_method: callable,
config: TransformerConfig,
sequence_parallel: bool = False,
):
super(Pooler, self).__init__(config)
# TODO: Should we switch this to TE?
self.dense = get_linear_layer(
hidden_size, hidden_size, init_method, config.perform_initialization
)
self.sequence_parallel = sequence_parallel
def forward(self, hidden_states: Tensor, sequence_index=0):
# hidden_states: [s, b, h]
# sequence_index: index of the token to pool.
# gather data along sequence dimensions
# same pooler is run on all tensor parallel nodes
if self.sequence_parallel:
hidden_states = tensor_parallel.gather_from_sequence_parallel_region(
hidden_states, tensor_parallel_output_grad=False
)
pooled = hidden_states[sequence_index, :, :]
pooled = self.dense(pooled)
pooled = torch.tanh(pooled)
return pooled
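The pooling itself is just an index-select followed by a dense layer and a tanh. A minimal sketch with plain torch stand-ins (hypothetical sizes; the real Pooler needs a TransformerConfig and, with sequence parallelism, a gather across ranks):

import torch

s, b, h = 8, 2, 16                      # [seq, batch, hidden], hypothetical sizes
hidden_states = torch.randn(s, b, h)

dense = torch.nn.Linear(h, h)           # stand-in for get_linear_layer(...)
pooled = hidden_states[0, :, :]         # first token of every sequence -> [b, h]
pooled = torch.tanh(dense(pooled))

print(pooled.shape)                     # torch.Size([2, 16])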
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from typing import Literal
import torch
from torch import Tensor
from megatron.core import tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import (
make_sharded_tensor_for_checkpoint,
make_tp_sharded_tensor_for_checkpoint,
)
class LanguageModelEmbedding(MegatronModule):
"""Language model embeddings.
Args:
config (TransformerConfig): config object with all necessary configs for TransformerBlock
vocab_size (int): vocabulary size
max_sequence_length (int): maximum size of sequence. This
is used for positional embedding
add_position_embedding (bool): Add a position embedding.
embedding_dropout_prob (float): dropout probability for embeddings
num_tokentypes (int): Set to 0 without binary head, and 2 with a binary head. Defaults to 0.
"""
def __init__(
@@ -28,20 +28,28 @@ class GPTEmbedding(MegatronModule):
config: TransformerConfig,
vocab_size: int,
max_sequence_length: int,
position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
num_tokentypes: int = 0,
):
super().__init__(config=config)
self.config: TransformerConfig = config
self.vocab_size: int = vocab_size
self.max_sequence_length: int = max_sequence_length
self.add_position_embedding: bool = position_embedding_type == 'learned_absolute'
self.num_tokentypes = num_tokentypes
self.reduce_scatter_embeddings = (
(not self.add_position_embedding)
and self.num_tokentypes <= 0
and self.config.sequence_parallel
)
# Word embeddings (parallel).
self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
num_embeddings=self.vocab_size,
embedding_dim=self.config.hidden_size,
init_method=self.config.init_method,
reduce_scatter_embeddings=self.reduce_scatter_embeddings,
config=self.config,
)
@@ -55,6 +63,16 @@ class GPTEmbedding(MegatronModule):
if self.config.perform_initialization:
self.config.init_method(self.position_embeddings.weight)
if self.num_tokentypes > 0:
self.tokentype_embeddings = torch.nn.Embedding(
self.num_tokentypes, self.config.hidden_size
)
# Initialize the token-type embeddings.
if self.config.perform_initialization:
self.config.init_method(self.tokentype_embeddings.weight)
else:
self.tokentype_embeddings = None
# Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)
@@ -64,9 +82,21 @@ class GPTEmbedding(MegatronModule):
self.word_embeddings.weight.shared = True
self.position_embeddings.weight.data.fill_(0)
self.position_embeddings.weight.shared = True
if self.num_tokentypes > 0:
self.tokentype_embeddings.weight.data.fill_(0)
self.tokentype_embeddings.weight.shared = True
def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = None) -> Tensor:
"""Forward pass of the embedding module.
Args:
input_ids (Tensor): The input tokens
position_ids (Tensor): The position ids used to calculate position embeddings
tokentype_ids (int): The token type ids. Used when args.bert_binary_head is set to True. Defaults to None
Returns:
Tensor: The output embeddings
"""
word_embeddings = self.word_embeddings(input_ids)
if self.add_position_embedding:
position_embeddings = self.position_embeddings(position_ids)
@@ -74,8 +104,17 @@ class GPTEmbedding(MegatronModule):
else:
embeddings = word_embeddings
if not self.reduce_scatter_embeddings:
# Data format change to avoid explicit transposes : [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous()
if tokentype_ids is not None:
assert self.tokentype_embeddings is not None
# [b s h] -> [s b h] (So that it can be added with embeddings)
tokentype_embedding = self.tokentype_embeddings(tokentype_ids).permute(1, 0, 2)
embeddings = embeddings + tokentype_embedding
else:
assert self.tokentype_embeddings is None
# If the input flag for fp32 residual connection is set, convert for float.
if self.config.fp32_residual_connection:
@@ -83,41 +122,16 @@ class GPTEmbedding(MegatronModule):
# Dropout.
if self.config.sequence_parallel:
if not self.reduce_scatter_embeddings:
embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
# `scatter_to_sequence_parallel_region` returns a view, which prevents
# the original tensor from being garbage collected. Clone to facilitate GC.
# Has a small runtime cost (~0.5%).
if self.config.clone_scatter_output_in_embedding:
embeddings = embeddings.clone()
with tensor_parallel.get_cuda_rng_tracker().fork():
embeddings = self.embedding_dropout(embeddings)
else:
embeddings = self.embedding_dropout(embeddings)
return embeddings
def sharded_state_dict(self, prefix=''):
sharded_state_dict = {}
word_embeddings_prefix = f'{prefix}word_embeddings.'
word_embeddings_state_dict = self.word_embeddings.state_dict(
prefix=word_embeddings_prefix, keep_vars=True
)
sharded_word_embeddings_key = f'{word_embeddings_prefix}weight'
sharded_word_embeddings_tensor = make_tp_sharded_tensor_for_checkpoint(
tensor=word_embeddings_state_dict[sharded_word_embeddings_key],
key=sharded_word_embeddings_key,
allow_shape_mismatch=True,
)
sharded_state_dict[sharded_word_embeddings_key] = sharded_word_embeddings_tensor
if self.add_position_embedding:
position_embeddings_prefix = f'{prefix}position_embeddings.'
position_embeddings_state_dict = self.position_embeddings.state_dict(
prefix=position_embeddings_prefix, keep_vars=True
)
sharded_position_embeddings_key = f'{position_embeddings_prefix}weight'
sharded_position_embeddings_tensor = make_sharded_tensor_for_checkpoint(
tensor=position_embeddings_state_dict[sharded_position_embeddings_key],
key=sharded_position_embeddings_key,
)
sharded_state_dict[sharded_position_embeddings_key] = sharded_position_embeddings_tensor
return sharded_state_dict
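A shape-level sketch of the embedding forward path for the common case (learned absolute positions, no token types, no sequence parallelism), using ordinary torch modules as stand-ins for the tensor-parallel classes; all sizes are made up:

import torch

b, s, h, vocab = 2, 6, 16, 100          # hypothetical sizes

word_embeddings = torch.nn.Embedding(vocab, h)       # stand-in for VocabParallelEmbedding
position_embeddings = torch.nn.Embedding(s, h)

input_ids = torch.randint(0, vocab, (b, s))
position_ids = torch.arange(s).unsqueeze(0).expand_as(input_ids)

embeddings = word_embeddings(input_ids) + position_embeddings(position_ids)  # [b, s, h]
embeddings = embeddings.transpose(0, 1).contiguous()                         # [s, b, h]
embeddings = torch.nn.Dropout(0.1)(embeddings)

print(embeddings.shape)                 # torch.Size([6, 2, 16])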
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from __future__ import annotations
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_block import TransformerBlock
import logging
import torch
from torch import Tensor, nn
from megatron.core import parallel_state
logger = logging.getLogger(__name__)
try:
from apex.transformer.functional import (
fused_apply_rotary_pos_emb,
fused_apply_rotary_pos_emb_thd,
)
HAVE_APPLY_ROPE_FUSION = True
except ImportError:
HAVE_APPLY_ROPE_FUSION = False
__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim):
"""Slice the positional embedding along the sequence dimension and keep only the
two chunks assigned to this context-parallel rank (load-balanced split)."""
cp_size = parallel_state.get_context_parallel_world_size()
cp_rank = parallel_state.get_context_parallel_rank()
cp_idx = torch.tensor(
[cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True
).cuda(non_blocking=True)
pos_emb = pos_emb.view(
*pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :]
)
pos_emb = pos_emb.index_select(seq_dim, cp_idx)
pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :])
return pos_emb
class RotaryEmbedding(nn.Module):
"""Rotary Embedding for language model.
Args:
kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config
rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
rotary_interleaved (bool, optional): If True, rotate pairs of even and odd dimensions (interleaved) instead of the first and second halves. Defaults to False.
seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None
rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000.
use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False
"""
def __init__(
self,
kv_channels: int,
rotary_percent: float,
rotary_interleaved: bool = False,
seq_len_interpolation_factor: float = None,
rotary_base: int = 10000,
use_cpu_initialization: bool = False,
) -> None:
super().__init__()
dim = kv_channels
if rotary_percent < 1.0:
dim = int(dim * rotary_percent)
self.rotary_interleaved = rotary_interleaved
self.seq_len_interpolation_factor = seq_len_interpolation_factor
device = 'cpu' if use_cpu_initialization else torch.cuda.current_device()
self.inv_freq = 1.0 / (
rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
)
def forward(self, max_seq_len: int, offset: int = 0) -> Tensor:
"""Forward pass of RoPE embedding.
Args:
max_seq_len (int): Maximum size of sequence
offset (int, optional): Offset added to the sequence positions. Defaults to 0.
Returns:
Tensor: Embeddings after applying RoPE.
"""
if self.inv_freq.device.type == 'cpu':
# move `inv_freq` to GPU once at the first micro-batch forward pass
self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device())
seq = (
torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+ offset
)
if self.seq_len_interpolation_factor is not None:
seq *= 1 / self.seq_len_interpolation_factor
freqs = torch.outer(seq, self.inv_freq)
# first part even vector components, second part odd vector components,
# 2 * dim in dimension size
if not self.rotary_interleaved:
emb = torch.cat((freqs, freqs), dim=-1)
else:
emb = torch.stack((freqs.view(-1, 1), freqs.view(-1, 1)), dim=-1).view(
freqs.shape[0], -1
)
# emb [seq_length, .., dim]
emb = emb[:, None, None, :]
if parallel_state.get_context_parallel_world_size() > 1:
# slice rotary_pos_emb along sequence dimension and select the partition of the current CP rank
emb = get_pos_emb_on_this_cp_rank(emb, 0)
return emb
def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
state_dict.pop(f'{prefix}inv_freq', None)
return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
def get_rotary_seq_len(
self,
inference_params,
transformer: TransformerBlock,
transformer_input: Tensor,
transformer_config: TransformerConfig,
) -> float:
"""Function to get the rotary sequence length.
Args:
inference_params : Used during Inference time
transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model
transformer_input (Tensor): Input tensor to the transformer block
transformer_config (TransformerConfig): Transformer config used by the model
Returns:
float: The rotary sequence length
"""
if inference_params is not None:
rotary_seq_len = inference_params.max_sequence_length
else:
if transformer.input_tensor is not None:
rotary_seq_len = transformer.input_tensor.size(0)
else:
rotary_seq_len = transformer_input.size(0)
if transformer_config.sequence_parallel:
rotary_seq_len *= transformer_config.tensor_model_parallel_size
rotary_seq_len *= transformer_config.context_parallel_size
return rotary_seq_len
def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
"""Change sign so the last dimension becomes [-odd, +even]
Args:
x (Tensor): Input tensor
Returns:
Tensor: Tensor rotated half
"""
if not rotary_interleaved:
x1, x2 = torch.chunk(x, 2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
else:
x1 = x[:, :, :, ::2]
x2 = x[:, :, :, 1::2]
x_new = torch.stack((-x2, x1), dim=-1)
return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1)
def apply_rotary_pos_emb_bshd(t: Tensor, freqs: Tensor, rotary_interleaved: bool = False) -> Tensor:
"""Apply rotary positional embedding to input tensor T.
check https://kexue.fm/archives/8265 for detailed formulas
Args:
t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
Returns:
Tensor: The input tensor after applying RoPE
"""
rot_dim = freqs.shape[-1]
# ideally t_pass is empty so rotary pos embedding is applied to all tensor t
t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
# first part is cosine component
# second part is sine component, need to change signs with _rotate_half method
cos_ = torch.cos(freqs).to(t.dtype)
sin_ = torch.sin(freqs).to(t.dtype)
t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_)
return torch.cat((t, t_pass), dim=-1)
def apply_rotary_pos_emb_thd(
t: Tensor, cu_seqlens: Tensor, freqs: Tensor, rotary_interleaved: bool = False
) -> Tensor:
"""A baseline implementation of applying RoPE for `thd` format.
Args:
t (Tensor): Input tensor T is of shape [t, h, d]
cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`,
with shape [b + 1] and dtype torch.int32.
freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d]
Returns:
Tensor: Shape [t, h, d]. The input tensor after applying RoPE.
"""
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
return torch.cat(
[
apply_rotary_pos_emb_bshd(x.unsqueeze(1), freqs[: x.size(0)], rotary_interleaved=rotary_interleaved)
for x in torch.split(t, seqlens)
]
).squeeze(1)
def apply_rotary_pos_emb(
t: Tensor, freqs: Tensor, config: TransformerConfig, cu_seqlens: Optional[Tensor] = None
):
"""
Reroute to the appropriate apply_rotary_pos_emb function depending on
fused/unfused kernels, or bshd (conventional) / thd (packed seq) format
"""
if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION:
# setting apply_rope_fusion in config to False so that subsequent queries to this config also return False
config.apply_rope_fusion = False
if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False):
logger.warning(
"Setting apply_rope_fusion to false because its implementation"
" is not included in Apex. Try upgrading to the latest version"
)
apply_rotary_pos_emb.printed_fused_warning = True
if config.apply_rope_fusion:
if cu_seqlens is None:
return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True)
else:
return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs)
else:
if cu_seqlens is None:
return apply_rotary_pos_emb_bshd(t, freqs, rotary_interleaved=config.rotary_interleaved)
else:
return apply_rotary_pos_emb_thd(
t, cu_seqlens, freqs, rotary_interleaved=config.rotary_interleaved
)
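To make the bshd vs. thd dispatch concrete, the following standalone sketch exercises the packed-sequence (thd) path with the helpers defined above; the sequence lengths, head count and head size are made up, and freqs is built by hand in the same [max_s, 1, 1, d] layout that RotaryEmbedding.forward returns, so the sketch needs neither an initialized parallel state nor a GPU:

import torch

# Packed batch: two sequences of lengths 3 and 5 flattened into t = 8 tokens.
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32)
t, h, d = 8, 4, 16                      # total tokens, heads, head dim (hypothetical)
x = torch.randn(t, h, d)

# Build freqs in the same [max_s, 1, 1, d] layout that RotaryEmbedding.forward produces.
max_s = 5
inv_freq = 1.0 / (10000 ** (torch.arange(0, d, 2, dtype=torch.float32) / d))
freqs = torch.outer(torch.arange(max_s, dtype=torch.float32), inv_freq)
freqs = torch.cat((freqs, freqs), dim=-1)[:, None, None, :]

out = apply_rotary_pos_emb_thd(x, cu_seqlens, freqs)
print(out.shape)                        # torch.Size([8, 4, 16])

The baseline thd path simply re-applies the bshd kernel per sequence after splitting on cu_seqlens, which is why the fused Apex kernel is preferred when it is available.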
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import logging
from typing import Optional, Tuple
import torch
from torch import Tensor
from megatron.core import parallel_state, tensor_parallel
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint
class LanguageModule(MegatronModule):
"""Base language module that has common helper functions used across GPT, BERT etc.
Args:
config (TransformerConfig): Input transformer config for the model
"""
def __init__(self, config: TransformerConfig) -> None:
super().__init__(config=config)
def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor:
"""Computes the language model loss (Cross entropy across vocabulary)
Args:
labels (Tensor): The labels of dimension [batch size, seq length]
logits (Tensor): The final logits returned by the output layer of the transformer model
Returns:
Tensor: Loss tensor of dimensions [batch size, sequence_length]
"""
# [b s] => [s b]
labels = labels.transpose(0, 1).contiguous()
if self.config.cross_entropy_loss_fusion:
loss = fused_vocab_parallel_cross_entropy(logits, labels)
else:
loss = tensor_parallel.vocab_parallel_cross_entropy(logits, labels)
# [s b] => [b, s]
loss = loss.transpose(0, 1).contiguous()
return loss
def setup_embeddings_and_output_layer(self) -> None:
"""Sets up embedding layer in first stage and output layer in last stage.
This function initializes word embeddings in the final stage when we are
using pipeline parallelism and sharing word embeddings, and sets up param
attributes on the embedding and output layers.
"""
# Set `is_embedding_or_output_parameter` attribute.
if self.pre_process:
self.embedding.word_embeddings.weight.is_embedding_or_output_parameter = True
if self.post_process and self.output_layer.weight is not None:
self.output_layer.weight.is_embedding_or_output_parameter = True
if not self.share_embeddings_and_output_weights:
return
if parallel_state.get_pipeline_model_parallel_world_size() == 1:
# Zero out wgrad if sharing embeddings between two layers on same
# pipeline stage to make sure grad accumulation into main_grad is
# correct and does not include garbage values (e.g., from torch.empty).
self.shared_embedding_or_output_weight().zero_out_wgrad = True
return
if parallel_state.is_pipeline_first_stage() and self.pre_process and not self.post_process:
self.shared_embedding_or_output_weight().shared_embedding = True
if self.post_process and not self.pre_process:
assert not parallel_state.is_pipeline_first_stage()
# set word_embeddings weights to 0 here, then copy first
# stage's weights using all_reduce below.
self.output_layer.weight.data.fill_(0)
self.output_layer.weight.shared = True
self.output_layer.weight.shared_embedding = True
# Parameters are shared between the word embeddings layers, and the
# heads at the end of the model. In a pipelined setup with more than
# one stage, the initial embedding layer and the head are on different
# workers, so we do the following:
# 1. Create a second copy of word_embeddings on the last stage, with
# initial parameters of 0.0.
# 2. Do an all-reduce between the first and last stage to ensure that
# the two copies of word_embeddings start off with the same
# parameter values.
# 3. In the training loop, all-reduce the grads of the two
# word_embeddings layers to ensure that every applied weight
# update is the same on both stages.
# Ensure that first and last stages have the same initial parameter
# values.
if torch.distributed.is_initialized():
if parallel_state.is_rank_in_embedding_group():
weight = self.shared_embedding_or_output_weight()
weight.data = weight.data.cuda()
torch.distributed.all_reduce(
weight.data, group=parallel_state.get_embedding_group()
)
elif not getattr(LanguageModule, "embedding_warning_printed", False):
logging.getLogger(__name__).warning(
"Distributed processes aren't initialized, so the output layer "
"is not initialized with weights from the word embeddings. "
"If you are just manipulating a model this is fine, but "
"this needs to be handled manually. If you are training "
"something is definitely wrong."
)
LanguageModule.embedding_warning_printed = True
def shared_embedding_or_output_weight(self) -> Tensor:
"""Gets the emedding weight or output logit weights when share embedding and output weights set to True.
Returns:
Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight
"""
if self.pre_process:
return self.embedding.word_embeddings.weight
elif self.post_process:
return self.output_layer.weight
return None
def sharded_state_dict(
self,
prefix: str = '',
sharded_offsets: Tuple[Tuple[int, int, int]] = (),
metadata: Optional[dict] = None,
) -> ShardedStateDict:
"""Sharded state dict implementation that handles the output layer weights tying.
Args:
prefix (str): Module name prefix.
sharded_offsets (tuple): PP related offsets, expected to be empty at this module level.
metadata (Optional[Dict]): metadata controlling sharded state dict creation.
Returns:
ShardedStateDict: sharded state dict for the LanguageModel
"""
assert not sharded_offsets, "Unexpected sharded offsets"
sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight'
output_layer_weight_key = f'{prefix}output_layer.weight'
output_layer_bias_key = f'{prefix}output_layer.bias'
if self.share_embeddings_and_output_weights:
self.tie_embeddings_and_output_weights_state_dict(
sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key
)
elif self.post_process:
# Make sure the output layer follows the embeddings padding logic
sharded_state_dict[output_layer_weight_key].allow_shape_mismatch = True
# Regardless of sharing the output weights with embeddings, we must handle the bias padding
if self.post_process and output_layer_bias_key in sharded_state_dict:
sharded_state_dict[output_layer_bias_key].allow_shape_mismatch = True
return sharded_state_dict
def tie_embeddings_and_output_weights_state_dict(
self,
sharded_state_dict: ShardedStateDict,
output_layer_weight_key: str,
first_stage_word_emb_key: str,
) -> None:
"""Ties the embedding and output weights in a given sharded state dict.
Args:
sharded_state_dict (ShardedStateDict): state dict with the weight to tie
output_layer_weight_key (str): key of the output layer weight in the state dict.
This entry will be replaced with a tied version
first_stage_word_emb_key (str): this must be the same as the
ShardedTensor.key of the first stage word embeddings.
Returns: None, acts in-place
"""
if not self.post_process:
# No output layer
assert output_layer_weight_key not in sharded_state_dict, sharded_state_dict.keys()
return
if self.pre_process:
# Output layer is equivalent to the embedding already
return
# Replace the default output layer with a one sharing the weights with the embedding
del sharded_state_dict[output_layer_weight_key]
tensor = self.shared_embedding_or_output_weight()
last_stage_word_emb_replica_id = (
1, # copy of first stage embedding
0,
parallel_state.get_data_parallel_rank(with_context_parallel=True),
)
sharded_state_dict[output_layer_weight_key] = make_tp_sharded_tensor_for_checkpoint(
tensor=tensor,
key=first_stage_word_emb_key,
replica_id=last_stage_word_emb_replica_id,
allow_shape_mismatch=True,
)
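compute_language_model_loss above expects [s, b, v] logits and [b, s] labels. A minimal shape sketch using torch's own cross entropy in place of the vocab-parallel kernels (which require an initialized tensor-parallel group); sizes are hypothetical:

import torch
import torch.nn.functional as F

s, b, v = 6, 2, 100                      # hypothetical sizes
logits = torch.randn(s, b, v)            # [s, b, v], as produced by the output layer
labels = torch.randint(0, v, (b, s))     # [b, s]

labels_sb = labels.transpose(0, 1).contiguous()                    # [b, s] -> [s, b]
loss = F.cross_entropy(
    logits.reshape(-1, v), labels_sb.reshape(-1), reduction='none'
).view(s, b)
loss = loss.transpose(0, 1).contiguous()                           # [s, b] -> [b, s]

print(loss.shape)                        # torch.Size([2, 6])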
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import importlib.util
import torch
from torch import einsum, nn
__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
class RotaryEmbedding(nn.Module):
def __init__(self, dim, seq_len_interpolation_factor=None):
super().__init__()
self.seq_len_interpolation_factor = seq_len_interpolation_factor
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq, persistent=False)
def forward(self, max_seq_len, offset=0):
seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
if self.seq_len_interpolation_factor is not None:
seq = seq.type_as(self.inv_freq)
seq *= 1 / self.seq_len_interpolation_factor
freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
# first part even vector components, second part odd vector components,
# 2 * dim in dimension size
emb = torch.cat((freqs, freqs), dim=-1)
# emb [seq_length, .., dim]
return emb[:, None, None, :]
def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
state_dict.pop(f'{prefix}inv_freq', None)
return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
def _rotate_half(x):
"""
change sign so the last dimension becomes [-odd, +even]
"""
x1, x2 = torch.chunk(x, 2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(t, freqs):
"""
input tensor t is of shape [seq_length, ..., dim]
rotary positional embedding tensor freqs is of shape [seq_length, ..., dim]
check https://kexue.fm/archives/8265 for detailed formulas
"""
rot_dim = freqs.shape[-1]
# ideally t_pass is empty so rotary pos embedding is applied to all tensor t
t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
# first part is cosine component
# second part is sine component, need to change signs with _rotate_half method
t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin())
return torch.cat((t, t_pass), dim=-1)
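A small numeric sanity check of the rotation implemented by the legacy apply_rotary_pos_emb defined directly above: RoPE preserves vector norms, and position 0 is left unchanged (cos 0 = 1, sin 0 = 0). Sizes are made up:

import torch

dim, seq_len = 8, 4
rope = RotaryEmbedding(dim)              # the legacy class defined above
freqs = rope(seq_len)                    # [seq_len, 1, 1, dim]

t = torch.randn(seq_len, 1, 1, dim)      # [s, b, h, d] with b = h = 1
out = apply_rotary_pos_emb(t, freqs)

print(torch.allclose(out.norm(dim=-1), t.norm(dim=-1), atol=1e-5))  # True: norms preserved
print(torch.allclose(out[0], t[0], atol=1e-6))                       # True: position 0 unrotated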
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Megatron Vision Module."""
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
# Note: This is only a stub at the moment. This will be expanded in follow-up changes.
class VisionModule(MegatronModule):
"""Base vision module that has common helper functions used across CLIP, ViT, etc.
Args:
config (TransformerConfig): Input transformer config for the model
"""
def __init__(self, config: TransformerConfig) -> None:
super().__init__(config=config)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from .gpt_model import GPTModel