Commit 7c19b3a8 authored by wangsen

Initial commit
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from typing import Optional, Tuple
import torch
from megatron.core.jit import jit_fuser
def _bias_dropout_add_func(x_with_bias, residual, prob, training):
# type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor
# NOTE: Previously, the argument `bias` used to be passed as
# `bias.expand_as(residual)` when the `bias_dropout_func` is called from the
# transformer layer but broadcasting should automatically take care of that.
# Also, looking at broadcasting semantics, `expand_as` and broadcasting
# seem to be identical performance-wise (both just change the view).
x, bias = x_with_bias # unpack
# If we want to train mixed precision, then the output of this function
# should be half precision. However, in AMP O1, the input (residual) is
# in fp32, and it will up-cast the result to fp32, causing pipeline parallel
# GPU communication to hang. Therefore, we need to cast residual to the same
# dtype as x.
residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
# The Dropout operation, Residual Addition and the tensor returning can be
# done generically outside the if statement, but that stops fusing of Bias
# Addition-Dropout-Residual Addition operation. So doing it together inside
# the conditional branch to improve performance
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out
return out
else:
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out
return out
def bias_dropout_add_unfused(training):
def _bias_dropout_add(x_with_bias, residual, prob):
return _bias_dropout_add_func(x_with_bias, residual, prob, training)
return _bias_dropout_add
@jit_fuser
def bias_dropout_add_fused_train(
x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float,
) -> torch.Tensor:
return _bias_dropout_add_func(x_with_bias, residual, prob, True)
@jit_fuser
def bias_dropout_add_fused_inference(
x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float,
) -> torch.Tensor:
return _bias_dropout_add_func(x_with_bias, residual, prob, False)
def get_bias_dropout_add(training, fused):
if fused:
# jit scripting for a nn.module (with dropout) is not
# triggering the fusion kernel. For now, we use two
# different nn.functional routines to account for varying
# dropout semantics during training and inference phases.
if training:
return bias_dropout_add_fused_train
else:
return bias_dropout_add_fused_inference
else:
return bias_dropout_add_unfused(training)
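# Illustrative usage sketch (not part of the original file): how a transformer layer
# might call the closure returned by get_bias_dropout_add. The shapes, names, and the
# dropout probability below are assumptions for demonstration; fused=False keeps the
# example CPU-friendly and avoids jit scripting.
def _example_bias_dropout_add():
    x = torch.randn(128, 2, 1024)        # [sequence_length, batch_size, hidden_size]
    bias = torch.randn(1024)             # broadcasts over the leading dimensions
    residual = torch.randn(128, 2, 1024)
    bias_dropout_add = get_bias_dropout_add(training=True, fused=False)
    out = bias_dropout_add((x, bias), residual, 0.1)
    return out.shape                     # torch.Size([128, 2, 1024])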
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
from megatron.core.jit import jit_fuser
###### BIAS GEGLU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@jit_fuser
def geglu(y):
y_1, y_2 = torch.chunk(y, 2, -1)
return (y_1 * 0.5 * (1.0 + torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)))) * y_2
@jit_fuser
def bias_geglu(bias, y):
y = y + bias
return geglu(y)
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@jit_fuser
def geglu_back(g, y):
y_1, y_2 = torch.chunk(y, 2, -1)
tanh_out = torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * y_1 * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * y_1 * y_1)) + 0.5 * (
1 + tanh_out
)
return torch.cat(((g * y_2) * ff, g * (y_1 * 0.5 * (1.0 + tanh_out))), -1)
@jit_fuser
def bias_geglu_back(g, y, bias):
y = y + bias
return geglu_back(g, y)
class BiasGeGLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
ctx.save_for_backward(input, bias)
return bias_geglu(input, bias)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
tmp = bias_geglu_back(grad_output, input, bias)
return tmp, tmp
class GeGLUFunction(torch.autograd.Function):
@staticmethod
    # no bias in this variant
def forward(ctx, input):
ctx.save_for_backward(input)
return geglu(input)
@staticmethod
def backward(ctx, grad_output):
input = ctx.saved_tensors
tmp = geglu_back(grad_output, input[0])
return tmp
def bias_geglu_impl(input, bias):
ori_shape = input.shape
assert len(ori_shape) in [2, 3]
input = input.view(-1, ori_shape[-1])
if bias is not None:
output = BiasGeGLUFunction.apply(input, bias)
else:
output = GeGLUFunction.apply(input)
return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1)
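# Illustrative sketch (not part of the original file): bias_geglu_impl applied to a
# [sq, b, 2 * h] projection, cross-checked against an unfused reference built from
# torch.nn.functional.gelu(..., approximate='tanh'). Shapes and names are assumptions
# for demonstration only.
def _example_bias_geglu():
    import torch.nn.functional as F
    y = torch.randn(16, 4, 2 * 32)                 # gated projection: last dim is 2 * h
    bias = torch.randn(2 * 32)
    out = bias_geglu_impl(y, bias)                 # -> [16, 4, 32]
    y_1, y_2 = torch.chunk(y + bias, 2, dim=-1)
    ref = F.gelu(y_1, approximate='tanh') * y_2
    return torch.allclose(out, ref, atol=1e-5)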
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
from megatron.core.jit import jit_fuser
###### BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@jit_fuser
def bias_gelu(bias, y):
x = bias + y
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@jit_fuser
def bias_gelu_back(g, bias, y):
x = bias + y
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
1 + tanh_out
)
return ff * g
class GeLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
ctx.save_for_backward(input, bias)
return bias_gelu(bias, input)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp
bias_gelu_impl = GeLUFunction.apply
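# Illustrative sketch (not part of the original file): the fused tanh approximation above
# should track torch.nn.functional.gelu(..., approximate='tanh') closely. The shapes
# below are assumptions for demonstration only.
def _example_bias_gelu():
    import torch.nn.functional as F
    y = torch.randn(64, 1024)
    bias = torch.randn(1024)
    out = bias_gelu_impl(y, bias)                  # GeLUFunction.apply(input, bias)
    ref = F.gelu(y + bias, approximate='tanh')
    return torch.allclose(out, ref, atol=1e-5)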
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn.functional as F
from megatron.core.jit import jit_fuser
###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################
@jit_fuser
def swiglu(y):
y_1, y_2 = torch.chunk(y, 2, -1)
return F.silu(y_1) * y_2
@jit_fuser
def bias_swiglu(y, bias):
y = y + bias
return swiglu(y)
# Gradient of the SiLU-gated (SwiGLU) activation above:
# d/dy_1 [silu(y_1) * y_2] = sigmoid(y_1) * (1 + y_1 * (1 - sigmoid(y_1))) * y_2
# d/dy_2 [silu(y_1) * y_2] = silu(y_1)
@jit_fuser
def swiglu_back(g, y):
y_1, y_2 = torch.chunk(y, 2, -1)
return torch.cat(
(g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1
)
@jit_fuser
def bias_swiglu_back(g, y, bias):
y = y + bias
return swiglu_back(g, y)
class BiasSwiGLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias, fp8_input_store):
input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input
ctx.save_for_backward(input_for_backward, bias)
ctx.ori_input_dtype = input.dtype
ctx.fp8_input_store = fp8_input_store
return bias_swiglu(input, bias)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input
tmp = bias_swiglu_back(grad_output, input, bias)
return tmp, tmp, None
class SwiGLUFunction(torch.autograd.Function):
@staticmethod
    # no bias in this variant
def forward(ctx, input, fp8_input_store):
input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input
ctx.save_for_backward(input_for_backward)
ctx.ori_input_dtype = input.dtype
ctx.fp8_input_store = fp8_input_store
return swiglu(input)
@staticmethod
def backward(ctx, grad_output):
input = ctx.saved_tensors[0]
input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input
tmp = swiglu_back(grad_output, input)
return tmp, None
def bias_swiglu_impl(input, bias, fp8_input_store=False):
ori_shape = input.shape
assert len(ori_shape) in [2, 3]
input = input.view(-1, ori_shape[-1])
if bias is not None:
output = BiasSwiGLUFunction.apply(input, bias, fp8_input_store)
else:
output = SwiGLUFunction.apply(input, fp8_input_store)
return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1)
# bias_swiglu_impl = BiasSwiGLUFunction.apply
# swiglu_impl = SwiGLUFunction.apply
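# Illustrative sketch (not part of the original file): bias_swiglu_impl on a
# [sq, b, 2 * h] projection, cross-checked against an unfused SiLU-gated reference.
# Passing fp8_input_store=True additionally keeps the saved forward activation in
# torch.float8_e4m3fn, which needs a PyTorch build that exposes that dtype. The
# names and shapes below are assumptions for demonstration.
def _example_bias_swiglu():
    y = torch.randn(16, 4, 2 * 32)
    bias = torch.randn(2 * 32)
    out = bias_swiglu_impl(y, bias)                # fp8_input_store defaults to False
    y_1, y_2 = torch.chunk(y + bias, 2, dim=-1)
    ref = F.silu(y_1) * y_2
    return torch.allclose(out, ref, atol=1e-5)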
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Tuple
import torch
from megatron.core.jit import jit_fuser
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
@jit_fuser
def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max(
vocab_parallel_logits
)
return vocab_parallel_logits, logits_max
#@jit_fuser
def calculate_predicted_logits(
vocab_parallel_logits: torch.Tensor, target: torch.Tensor, logits_max: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
(
target_mask,
masked_target_1d,
predicted_logits,
sum_exp_logits,
exp_logits,
) = VocabParallelCrossEntropy.calculate_predicted_logits(
vocab_parallel_logits, target, logits_max
)
predicted_logits_sum_exp_logits = torch.cat((predicted_logits, sum_exp_logits))
return target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits
@jit_fuser
def calculate_cross_entropy_loss(
exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
split_val = predicted_logits_sum_exp_logits.size()[0] // 2
predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, split_val)
exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss(
exp_logits, predicted_logits, sum_exp_logits
)
return exp_logits, loss
#@jit_fuser
def calculate_gradients(
softmax: torch.Tensor,
grad_output: torch.Tensor,
target_mask: torch.Tensor,
masked_target_1d: torch.Tensor,
) -> torch.Tensor:
(
grad_2d,
arange_1d,
softmax_update,
grad_input,
) = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask)
grad_input = VocabParallelCrossEntropy.calculate_gradients(
grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output
)
grad_input = grad_input.bfloat16()
return grad_input
class _VocabParallelCrossEntropy(torch.autograd.Function):
@staticmethod
def forward(ctx, vocab_parallel_logits, target):
vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits)
torch.distributed.all_reduce(
logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()
)
(
target_mask,
masked_target_1d,
predicted_logits_sum_exp_logits,
exp_logits,
) = calculate_predicted_logits(vocab_parallel_logits, target, logits_max)
        # All-reduce is needed to get the contributions from other GPUs.
        # In the fused case, tensors are batched so that a single
        # all-reduce call is issued.
torch.distributed.all_reduce(
predicted_logits_sum_exp_logits,
op=torch.distributed.ReduceOp.SUM,
group=get_tensor_model_parallel_group(),
)
exp_logits, loss = calculate_cross_entropy_loss(exp_logits, predicted_logits_sum_exp_logits)
# Store softmax, target-mask and masked-target for backward pass.
ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
return loss
@staticmethod
def backward(ctx, grad_output):
        # Retrieve tensors from the forward path.
softmax, target_mask, masked_target_1d = ctx.saved_tensors
grad_input = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d)
return grad_input, None
def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target):
"""
    Performs cross entropy loss when logits are split across tensor parallel ranks.

    Args:
        vocab_parallel_logits: logits split across tensor parallel ranks;
            dimension is [sequence_length, batch_size, vocab_size // tp_size]
        target: correct vocab ids of dimension [sequence_length, micro_batch_size]
"""
return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
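# Illustrative usage sketch (not part of the original file): this loss only makes sense
# with an initialized tensor-model-parallel group, so the outline below is hedged rather
# than runnable standalone. Note that calculate_gradients() above casts the backward
# result to bfloat16, so the logits are expected to be bf16 in practice. Names below
# are assumptions.
#
#   # after torch.distributed.init_process_group(...) and
#   # parallel_state.initialize_model_parallel(tensor_model_parallel_size=tp)
#   logits = lm_head_output          # [sequence_length, micro_batch_size, vocab_size // tp]
#   labels = tokens                  # [sequence_length, micro_batch_size]
#   loss = fused_vocab_parallel_cross_entropy(logits, labels)  # per-token loss, same leading dims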
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import importlib
import inspect
import numbers
import torch
from torch import Tensor
from torch.nn import init
from torch.nn.parameter import Parameter
from megatron.core.transformer import TransformerConfig
from megatron.core.utils import make_viewless_tensor
try:
from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
    # Persistent layer norm is disabled here even when apex provides FastLayerNormFN.
    HAVE_PERSIST_LAYER_NORM = False
except:
HAVE_PERSIST_LAYER_NORM = False
try:
from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
HAVE_FUSED_LAYER_NORM = True
except:
HAVE_FUSED_LAYER_NORM = False
class FusedLayerNorm(torch.nn.Module):
"""Layer Norm, fused into a single CUDA kernel.
Args:
hidden_size (int): Transformer hidden dimension.
eps (float): Epsilon added to denominator, for numerical stability.
persist_layer_norm (bool): Use persistent fused layer norm kernel.
This kernel supports only a set of hidden sizes. Please
check persist_ln_hidden_sizes if your hidden size is supported.
zero_centered_gamma (bool): Adjust LayerNorm weights such that they are
centered around zero. This improves numerical stability.
config (TransformerConfig): Transformer config. Include to match custom
layer norm interfaces.
normalization (str): Normalization type, used for Transformer Engine.
Must equal 'LayerNorm' here.
"""
def __init__(
self,
config: TransformerConfig,
hidden_size: int,
eps: float = 1e-5,
persist_layer_norm: bool = True,
zero_centered_gamma: bool = False,
normalization: str = "LayerNorm", # included to match TE interface
):
super().__init__()
self.config = config
self.zero_centered_gamma = self.config.layernorm_zero_centered_gamma
assert (
self.config.normalization == "LayerNorm"
), f'({self.config.normalization}) is not supported in FusedLayerNorm'
        # List of hidden sizes supported by the persistent layer norm kernel.
# If the hidden size is not supported, fall back to the non-persistent
# kernel.
persist_ln_hidden_sizes = [
1024,
1536,
2048,
2304,
3072,
3840,
4096,
5120,
6144,
8192,
10240,
12288,
12800,
15360,
16384,
18432,
20480,
24576,
25600,
30720,
32768,
40960,
49152,
65536,
]
persist_layer_norm = self.config.persist_layer_norm
if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
persist_layer_norm = False
if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM:
# TODO: Add pytorch only layer norm
            raise ValueError('Apex must currently be installed to use megatron core.')
if isinstance(hidden_size, numbers.Integral):
hidden_size = (hidden_size,)
self.hidden_size = torch.Size(hidden_size)
self.eps = eps
# Parameters need to be initialized with torch.empty rather than torch.Tensor for correct device placement with nemo2.
self.weight = Parameter(torch.empty(*hidden_size))
self.bias = Parameter(torch.empty(*hidden_size))
self.reset_parameters()
self.persist_layer_norm = persist_layer_norm
self.sequence_parallel = self.config.sequence_parallel
# set sequence parallelism flag on weight and bias parameters
setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
def reset_parameters(self):
if self.zero_centered_gamma:
init.zeros_(self.weight)
init.zeros_(self.bias)
else:
init.ones_(self.weight)
init.zeros_(self.bias)
def forward(self, input: Tensor) -> Tensor:
weight = self.weight + 1 if self.zero_centered_gamma else self.weight
if self.persist_layer_norm:
if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args:
output = FastLayerNormFN.apply(
input, weight, self.bias, self.eps, self.config.memory_efficient_layer_norm
)
else:
output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
# Apex's fast layer norm function outputs a 'view' tensor (i.e., has
# a populated '_base' field). This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
# created to prevent this.
output = make_viewless_tensor(
inp=output, requires_grad=input.requires_grad, keep_graph=True
)
else:
if (
'memory_efficient'
in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args
):
return FusedLayerNormAffineFunction.apply(
input,
weight,
self.bias,
self.hidden_size,
self.eps,
self.config.memory_efficient_layer_norm,
)
else:
return FusedLayerNormAffineFunction.apply(
input, weight, self.bias, self.hidden_size, self.eps
)
return output
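# Illustrative usage sketch (not part of the original file): FusedLayerNorm needs an
# apex installation and a CUDA tensor, so this is left as a hedged outline. The
# TransformerConfig arguments shown are the usual required ones; treat the exact names
# and values as assumptions if your mcore version differs.
#
#   config = TransformerConfig(num_layers=1, hidden_size=1024, num_attention_heads=16)
#   ln = FusedLayerNorm(config=config, hidden_size=1024, eps=1e-5).cuda()
#   x = torch.randn(128, 2, 1024, device='cuda')
#   y = ln(x)                       # same shape as x, normalized over the last dimension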
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from typing import Optional
import torch
import torch.nn as nn
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.utils import get_default_causal_mask
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
Fused operation which performs following three operations in sequence
1. Scale the tensor.
2. Apply upper triangular mask (typically used in gpt models).
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_upper_triang_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_upper_triang_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
class ScaledMaskedSoftmax(torch.autograd.Function):
"""
Fused operation which performs following three operations in sequence
1. Scale the tensor.
2. Apply the mask.
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, mask, scale):
import scaled_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
class ScaledSoftmax(torch.autograd.Function):
"""
Fused operation which performs following two operations in sequence
1. Scale the tensor.
2. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
class FusedScaleMaskSoftmax(nn.Module):
"""
fused operation: scaling + mask + softmax
Args:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate whether the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed in fp32 precision.
        scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super(FusedScaleMaskSoftmax, self).__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
assert not (
self.input_in_fp16 and self.input_in_bf16
), "both fp16 and bf16 flags cannot be active at the same time."
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
def forward(self, input: torch.Tensor, mask: Optional[torch.Tensor]):
"""Forward pass of softmax with masked input.
        When attn_mask_type is causal, the mask is generated internally and None can be passed.
A user-defined mask is only needed when attn_mask_type is not causal.
"""
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
            and 16 < sk <= 4096  # sk must be in (16, 4096]
            and sq % 4 == 0  # sq must be divisible by 4
            and sk % 4 == 0  # sk must be divisible by 4
            and attn_batches % 4 == 0  # np * b must be divisible by 4
):
if 0 <= sk <= 4096:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
b, np, sq, sk = input.size()
scale = self.scale if self.scale is not None else 1.0
if self.attn_mask_type == AttnMaskType.causal:
assert sq == sk, "causal mask is only for self attention"
# input is 3D tensor (attn_batches, sq, sk)
input = input.view(-1, sq, sk)
probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
return probs.view(b, np, sq, sk)
else:
# input is 4D tensor (b, np, sq, sk)
if mask is not None:
return ScaledMaskedSoftmax.apply(input, mask, scale)
else:
return ScaledSoftmax.apply(input, scale)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
# Generate causal mask if not given
sq, sk = input.size(2), input.size(3)
if self.attn_mask_type == AttnMaskType.causal and mask is None and sq > 1:
# If sq == 1 then either KV cache is used or one-element context is passed
# so keeping mask=None in this case; subsequent code should handle it
assert sq == sk, "causal mask is only for self attention"
mask = get_default_causal_mask(sq)
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
import scaled_masked_softmax_cuda
return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
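# Illustrative usage sketch (not part of the original file): with
# scaled_masked_softmax_fusion=False the module takes the pure-PyTorch path, so this
# runs without the fused CUDA extensions. The shapes, the mask, and the mask_func
# below are assumptions for demonstration.
def _example_fused_scale_mask_softmax():
    softmax = FusedScaleMaskSoftmax(
        input_in_fp16=False,
        input_in_bf16=False,
        attn_mask_type=AttnMaskType.padding,
        scaled_masked_softmax_fusion=False,        # force forward_torch_softmax
        mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
        softmax_in_fp32=True,
        scale=None,
    )
    scores = torch.randn(2, 4, 8, 8)               # [b, np, sq, sk]
    mask = torch.zeros(2, 1, 8, 8, dtype=torch.bool)
    return softmax(scores, mask).shape             # torch.Size([2, 4, 8, 8])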
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.custom_layers.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(
remap_te_layernorm: bool = False, qk_layernorm: bool = False
) -> ModuleSpec:
"""Mix the native spec with TENorm.
    This is essentially the native local spec except that the layernorm implementation
    uses TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex
has stopped supporting RMSNorm needed by llama.
"""
sharded_state_dict_keys_map = {}
if remap_te_layernorm:
sharded_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=TENorm if qk_layernorm else IdentityOp,
k_layernorm=TENorm if qk_layernorm else IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
# Map TE-layernorm-fusion keys back
sharded_state_dict_keys_map=sharded_state_dict_keys_map,
),
)
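# Illustrative usage sketch (not part of the original file): the returned ModuleSpec is
# meant to be handed to megatron.core.models.gpt.GPTModel as its transformer_layer_spec.
# The keyword names below follow the mcore GPTModel constructor; treat them as an
# assumption if your mcore version differs.
#
#   layer_spec = get_gpt_layer_modelopt_spec(remap_te_layernorm=True, qk_layernorm=False)
#   model = GPTModel(
#       config=transformer_config,
#       transformer_layer_spec=layer_spec,
#       vocab_size=vocab_size,
#       max_sequence_length=seq_length,
#   )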
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from logging import getLogger
import torch
logger = getLogger(__name__)
def mcore_gpt_load_legacy_state_dict_pre_hook(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs,
):
"""Register a pre-hook to fix the state_dict key difference.
This prehook is used when trying to load the legacy Megatron-LM GPTModel into its
megatron/core variant that uses native ParallelLinear and Transformer-Engine Norm.
Only this particular spec supports post-training quantization and TensorRT-LLM
config export through `nvidia-modelopt` package.
Args:
state_dict: state dictionary
prefix: module name prefix
        local_metadata: local metadata
        strict: whether loading is in strict mode
missing_keys: missing state dict keys
unexpected_keys: unexpected state dict keys
error_msgs: error messages
"""
if "modelopt_state" in state_dict:
state_dict.pop("modelopt_state")
if "language_model" in state_dict:
language_model_state_dict = state_dict.pop("language_model")
if "embedding" in language_model_state_dict:
if "word_embeddings" in language_model_state_dict["embedding"]:
for key, param in language_model_state_dict["embedding"]["word_embeddings"].items():
state_dict.update({"embedding.word_embeddings." + key: param})
if "position_embeddings" in language_model_state_dict["embedding"]:
for key, param in language_model_state_dict["embedding"][
"position_embeddings"
].items():
state_dict.update({"embedding.position_embeddings." + key: param})
if "transformer" in language_model_state_dict:
for key, param in language_model_state_dict["transformer"].items():
state_dict.update({"decoder." + key: param})
else:
for key, param in language_model_state_dict["encoder"].items():
state_dict.update({"decoder." + key: param})
if "output_layer" in language_model_state_dict:
for key, param in language_model_state_dict["output_layer"].items():
state_dict.update({"output_layer." + key: param})
if torch.distributed.get_rank() == 0:
logger.info("ModelOptGPTModel {}".format(state_dict.keys()))
module_name_rewrite_list = [
("input_norm", "input_layernorm"),
(".attention.query_key_value", ".self_attention.linear_qkv"),
(".attention.dense", ".self_attention.linear_proj"),
("self_attention.query_key_value", "self_attention.linear_qkv"),
("self_attention.dense", "self_attention.linear_proj"),
("post_attention_layernorm", "pre_mlp_layernorm"),
("post_attention_norm", "pre_mlp_layernorm"),
("dense_h_to_4h", "linear_fc1"),
("dense_4h_to_h", "linear_fc2"),
("final_norm", "final_layernorm"),
]
key_rewrite_list = []
for key, _ in state_dict.items():
for old_name, new_name in module_name_rewrite_list:
if old_name in key:
key_rewrite_list += [(key, key.replace(old_name, new_name))]
for old_key, new_key in key_rewrite_list:
if torch.distributed.get_rank() == 0:
logger.info("replace {} with {}".format(old_key, new_key))
state_dict[new_key] = state_dict[old_key]
state_dict.pop(old_key)
def mcore_gpt_load_te_state_dict_pre_hook(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs,
):
"""Register a pre-hook to fix the state_dict key difference of.
This prehook is used when trying to load the megatron/core GPTModel that uses a
fused Transformer-Engine ParallelLinear into the variant that uses native ParallelLinear
    and Transformer-Engine Norm (effectively undoing the layernorm fusion).
Only this particular spec supports post-training quantization and TensorRT-LLM
config export through `nvidia-modelopt` package.
Args:
state_dict: state dictionary
prefix: module name prefix
        local_metadata: local metadata
        strict: whether loading is in strict mode
missing_keys: missing state dict keys
unexpected_keys: unexpected state dict keys
error_msgs: error messages
"""
if "modelopt_state" in state_dict:
state_dict.pop("modelopt_state")
key_with_te_extra_state_to_pop = []
for key, _ in state_dict.items():
if "_extra_state" in key:
key_with_te_extra_state_to_pop += [key]
for key in key_with_te_extra_state_to_pop:
state_dict.pop(key)
module_name_rewrite_list = [
("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
("mlp.linear_fc1.layer_norm_weight", "pre_mlp_layernorm.weight"),
("mlp.linear_fc1.layer_norm_bias", "pre_mlp_layernorm.bias"),
]
key_rewrite_list = []
for key, _ in state_dict.items():
for old_name, new_name in module_name_rewrite_list:
if old_name in key:
key_rewrite_list += [(key, key.replace(old_name, new_name))]
for old_key, new_key in key_rewrite_list:
if torch.distributed.get_rank() == 0:
logger.info("replace {} with {}".format(old_key, new_key))
state_dict[new_key] = state_dict[old_key]
state_dict.pop(old_key)
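# Illustrative usage sketch (not part of the original file): the hooks above follow the
# signature of torch.nn.Module's load_state_dict pre-hooks, so a typical (assumed) way
# to use them is to register one on the mcore GPTModel before calling load_state_dict.
#
#   model = GPTModel(config=..., transformer_layer_spec=get_gpt_layer_modelopt_spec())
#   model._register_load_state_dict_pre_hook(mcore_gpt_load_legacy_state_dict_pre_hook)
#   model.load_state_dict(legacy_checkpoint["model"], strict=False)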
class InferenceParams:
"""Inference parameters that are passed to the main model in order
    to efficiently calculate and store the context during inference."""
def __init__(self, max_batch_size, max_sequence_length):
self.max_sequence_length = max_sequence_length
self.max_batch_size = max_batch_size
self.sequence_len_offset = 0
self.batch_size_offset = 0
self.key_value_memory_dict = {}
def swap_key_value_dict(self, batch_idx):
"swap between batches"
if len(self.key_value_memory_dict) == 0:
raise ValueError("should not swap when dict in empty")
for layer_number in self.key_value_memory_dict.keys():
inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
assert (
len(batch_idx) == inference_key_memory.shape[1]
) # make sure batch size is the same
new_inference_key_memory = inference_key_memory[:, batch_idx]
new_inference_value_memory = inference_value_memory[:, batch_idx]
self.key_value_memory_dict[layer_number] = (
new_inference_key_memory,
new_inference_value_memory,
)
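# Illustrative usage sketch (not part of the original file): the per-layer KV cache layout
# assumed below ([max_seq_len, max_batch_size, num_heads, head_dim]) matches how
# swap_key_value_dict indexes the batch dimension; the concrete sizes are made up.
def _example_inference_params():
    params = InferenceParams(max_batch_size=4, max_sequence_length=256)
    params.key_value_memory_dict[1] = (
        torch.zeros(256, 4, 8, 64),                # key cache for layer 1
        torch.zeros(256, 4, 8, 64),                # value cache for layer 1
    )
    params.swap_key_value_dict([1, 0, 3, 2])       # reorder the batch dimension
    return params.key_value_memory_dict[1][0].shape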
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
jit_fuser = torch.jit.script
# nvFuser is deprecated in PyTorch JIT starting from 2.2
#if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2):
# jit_fuser = torch.compile
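# Illustrative sketch (not part of the original file): jit_fuser is used as a plain
# decorator on small pointwise helpers. The helper below is hypothetical, added only to
# show the pattern; with the default above it is compiled via torch.jit.script.
@jit_fuser
def _example_scale_and_shift(x: torch.Tensor, scale: float, shift: float) -> torch.Tensor:
    return x * scale + shift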
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Callable, ContextManager, Optional
import torch
@dataclass
class ModelParallelConfig:
"""Base configuration for Megatron Core
The initialization function has an argument for each parameter.
"""
###################
# Model parallelism
###################
tensor_model_parallel_size: int = 1
"""Intra-layer model parallelism. Splits tensors across GPU ranks."""
pipeline_model_parallel_size: int = 1
"""Inter-layer model parallelism. Splits transformer layers across GPU ranks."""
virtual_pipeline_model_parallel_size: Optional[int] = None
"""Interleaved pipeline parallelism is used to improve performance by reducing the pipeline
bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks.
The number of virtual blocks per pipeline model parallel rank is the virtual model parallel
size. See Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM:
arxiv.org/pdf/2104.04473.pdf for more details.
"""
sequence_parallel: bool = False
"""Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms
and dropout sequentially. See Reducing Activation Recomputation in Large Transformer Models
(https://arxiv.org/abs/2205.05198) for more details.
"""
context_parallel_size: int = 1
"""Splits network input along sequence dimension across GPU ranks."""
expert_model_parallel_size: int = 1
"""Distributes Moe Experts across sub data parallel dimension."""
moe_extended_tp: bool = False
"""Alternative parallelization strategy for expert parallelism. Instead of distributing experts
    across expert_model_parallel_size, each expert is sharded along the extended tensor parallel
    domain (tensor_model_parallel_size * expert_model_parallel_size). This avoids the load-balancing
    problem of MoE training.
"""
###################
# Initialization
###################
perform_initialization: bool = True
"""If true, weights are initialized. This option can be useful when you know you are going to
load values from a checkpoint.
"""
use_cpu_initialization: bool = False
"""When set to False, we initialize the weights directly on the GPU. CPU initialization is the
same regardless of tensor model parallelism, but GPU initialization is not. Transferring
weights from CPU to GPU can take a significant amount of time for large models.
"""
###################
# Training
###################
fp16: bool = False
"""If true, train with fp16 mixed precision training."""
bf16: bool = False
"""If true, train with bf16 mixed precision training."""
params_dtype: torch.dtype = torch.float32
"""dtype used when intializing the weights."""
timers: Callable = None
"""Timers object to call for various timing functions. See megatron.core.timers.Timers"""
finalize_model_grads_func: Callable = None
"""Function that finalizes gradients on all workers. Could include ensuring that grads are
all-reduced across data parallelism, pipeline parallelism, and sequence parallelism
dimensions.
"""
grad_scale_func: Callable = None
"""If using loss scaling, this function should take the loss and return the scaled loss. If
None, no function is called on the loss.
"""
no_sync_func: Callable = None
"""Function that creates a context that suppresses asynchronous data-parallel communication. If
the model is an instance of core.distributed.DistributedDataParallel, the default is to use
core.distributed.DistributedDataParallel.no_sync.
"""
grad_sync_func: Callable = None
"""Function that launches asynchronous gradient reductions (e.g. distributed optimizer gradient
reduce-scatters). The function should take one argument: an iterable of parameters whose
gradients are to be synchronized.
"""
param_sync_func: Callable = None
"""Function that launches asynchronous parameter synchronizations (e.g. distributed optimizer
parameter all-gathers). The function should take one argument: an iterable of parameters to
be synchronized.
"""
deterministic_mode: bool = False
"""If true, code that has deterministic execution will be chosen. This usually
means slower execution, but is good for debugging and testing. Defaults to False."""
enable_autocast: bool = False
"""If true runs the forward step function inside torch.autocast context."""
autocast_dtype: torch.dtype = None
"""dtype to pass to torch.amp.autocast when enabled. If None, is set to pipeline_dtype."""
num_microbatches_with_partial_activation_checkpoints: Optional[int] = None
"""If int, set the number of microbatches where not all of the layers will be checkpointed and
recomputed. The rest of the microbatches within the window of maximum outstanding
microbatches will recompute all layers (either full recompute or selective recompute). If
None, the checkpoint and recompute will be left up to the forward_step function.
"""
###################
# Optimizations
###################
gradient_accumulation_fusion: bool = False
"""If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA extension
fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install
APEX with --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\"
--global-option=\"--cuda_ext\" ". Note that the extension requires CUDA>=11. Otherwise, you
must turn off gradient accumulation fusion.
"""
async_tensor_model_parallel_allreduce: bool = False
"""NOTE: Deprecated. This flag is ignored."""
use_te_rng_tracker: bool = False
"""If true, uses RNG state tracker in TransformerEngine if exists.
"""
tp_comm_overlap: bool = False
"""If true, allows overlapping of Linear layer execution with tensor parallel communication
collectives like AllGather/ReduceScatter. Overlapping is done for the linear layers wherever
possible during the forward and the backward pass.
"""
    tp_comm_bulk_wgrad: bool = True
    """If true, allows All-Gather overlap with Bprop activation gradient GEMM. Ignored if
    tp_comm_overlap is False.
    """
    tp_comm_bulk_dgrad: bool = True
    """If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Ignored if
    tp_comm_overlap is False.
    """
    tp_comm_overlap_ag: bool = True
    """If true, allows All-Gather overlap with GEMM by pipelining the GEMM and All-Gather.
    Ignored if tp_comm_overlap is False.
    """
    tp_comm_overlap_rs: bool = True
    """If true, allows Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter.
    Ignored if tp_comm_overlap is False.
    """
    tp_comm_overlap_rs_dgrad: bool = False
    """If true, allows Reduce-Scatter overlap with DGRAD GEMM by pipelining the
    GEMM and Reduce-Scatter splits. Ignored if tp_comm_overlap is False.
    """
    tp_comm_split_ag: bool = True
    """Deprecated from TransformerEngine v1.6.0.
    If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather
    splits. Ignored if tp_comm_overlap is False.
    """
    tp_comm_atomic_ag: bool = False
    """Deprecated from TransformerEngine v1.6.0.
    If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather,
    both done atomically. Ignored if tp_comm_overlap is False.
    """
    tp_comm_split_rs: bool = True
    """Deprecated from TransformerEngine v1.6.0.
    If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
    Reduce-Scatter splits. Ignored if tp_comm_overlap is False.
    """
    tp_comm_atomic_rs: bool = False
    """Deprecated from TransformerEngine v1.6.0.
    If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and
    Reduce-Scatter, both done atomically. Ignored if tp_comm_overlap is False.
    """
cross_entropy_loss_fusion: bool = False
"""If this is enabled, the fused cross entropy implementation would be used.
Defaults to False.
"""
###################
# Pipeline Parallel
###################
pipeline_dtype: torch.dtype = None
"""dtype used in p2p communication, usually params_dtype"""
variable_seq_lengths: bool = False
"""Support for variable sequence lengths across microbatches. Setting this communicates the size
of tensors during pipeline parallelism communication, because of this extra overhead it
should only be set if the sequence length varies by microbatch within a global batch.
"""
overlap_p2p_comm: bool = False
"""When True some of the peer to peer communication for pipeline parallelism will overlap with
computation. Must be False if batch_p2p_comm is true.
"""
batch_p2p_comm: bool = True
"""Use batch_isend_irecv instead of individual isend/irecv calls. Must be False if
overlap_p2p_comm is True.
"""
batch_p2p_sync: bool = True
"""When using batch_isend_irecv, do a cuda.device.synchronize afterward to work around a bug in
    older versions of PyTorch.
"""
use_ring_exchange_p2p: bool = False
"""Use custom ring_exchange kernel instead of torch.distributed.batch_isend_irecv(). Requires
custom built torch with torch.distributed.ring_exchange.
"""
deallocate_pipeline_outputs: bool = False
"""If True, output data is deallocated after the tensor is sent to the next pipeline stage.
    Helps with saving memory; does nothing when pipeline parallelism is not used.
"""
defer_embedding_wgrad_compute: bool = False
"""If true, defers the embedding WGRAD GEMMs while pipeline flush is
taking place enabling us to hide pipeline flush latency. Defaults to False.
"""
pipeline_model_parallel_split_rank: Optional[int] = None
"""If int, rank where encoder and decoder should be split in cases where the model has both an
encoder and decoder (e.g., T5). Ignored if None.
"""
###################
# CPU Offloading
###################
cpu_offloading: bool = False
"""When set to True, all the activations are offloaded to the CPU asynchronously."""
cpu_offloading_num_layers: int = 0
"""Tells the number of transformer layers for which activations has to be offloaded."""
_cpu_offloading_context: ContextManager = None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible.
"""For internal use only, do not set."""
cpu_offloading_activations: bool = True
"""If True, offloads the activations to CPU."""
cpu_offloading_weights: bool = True
"""If True, offloads the weights to CPU."""
###################
# Timing
###################
barrier_with_L1_time: bool = True
"""If true, use barrier with level 1 time measurements. It is up to the user to make sure
calling barrier with their timers will not result in hangs. This can happen if for example
the user adds a level 1 timer that is not called by all ranks.
"""
def __post_init__(self):
""" Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
if self.sequence_parallel:
if self.tensor_model_parallel_size <= 1:
raise ValueError("Can not use sequence paralllelism without tensor parallelism")
if self.pipeline_model_parallel_size > 1:
if self.pipeline_dtype is None:
raise ValueError(
"When using pipeline parallelism, pipeline_dtype must be specified"
)
if self.autocast_dtype is None:
self.autocast_dtype = self.params_dtype
if self.defer_embedding_wgrad_compute and self.pipeline_model_parallel_size == 1:
raise ValueError(
"Cannot defer embedding wgrad compute when pipeline model parallel is not used"
)
if self.defer_embedding_wgrad_compute and not self.gradient_accumulation_fusion:
raise ValueError(
"Cannot defer embedding wgrad compute when gradient accumulation fusion is not used"
)
if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1:
if self.sequence_parallel is False:
raise ValueError(
"When using expert parallelism and tensor parallelism, sequence parallelism must be used"
)
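# Illustrative usage sketch (not part of the original file): constructing the config
# triggers the __post_init__ validation above. The parallel sizes and dtypes below are
# assumptions for demonstration only.
def _example_model_parallel_config():
    config = ModelParallelConfig(
        tensor_model_parallel_size=2,
        pipeline_model_parallel_size=2,
        pipeline_dtype=torch.bfloat16,   # required once pipeline_model_parallel_size > 1
        sequence_parallel=True,          # allowed because tensor_model_parallel_size > 1
    )
    return config.autocast_dtype         # defaults to params_dtype (torch.float32) here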
from .t5_model import T5Model