首次上传

d444a97a · yangzhong · d444a97a · d444a97a · d444a97a · d444a97a
Commit d444a97a authored Oct 30, 2025 by yangzhong
20 changed files
--- a/megatron/core/fusions/__pycache__/fused_bias_gelu.cpython-310.pyc
+++ b/megatron/core/fusions/__pycache__/fused_bias_gelu.cpython-310.pyc
--- a/megatron/core/fusions/__pycache__/fused_bias_swiglu.cpython-310.pyc
+++ b/megatron/core/fusions/__pycache__/fused_bias_swiglu.cpython-310.pyc
--- a/megatron/core/fusions/__pycache__/fused_cross_entropy.cpython-310.pyc
+++ b/megatron/core/fusions/__pycache__/fused_cross_entropy.cpython-310.pyc
--- a/megatron/core/fusions/__pycache__/fused_layer_norm.cpython-310.pyc
+++ b/megatron/core/fusions/__pycache__/fused_layer_norm.cpython-310.pyc
--- a/megatron/core/fusions/__pycache__/fused_softmax.cpython-310.pyc
+++ b/megatron/core/fusions/__pycache__/fused_softmax.cpython-310.pyc
--- a/megatron/core/fusions/fused_bias_dropout.py
+++ b/megatron/core/fusions/fused_bias_dropout.py
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+
+from megatron.core.jit import jit_fuser
+
+
+def _bias_dropout_add_func(x_with_bias, residual, prob, training):
+    # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor
+    """Add the optional bias to ``x``, apply dropout, then add the residual.
+
+    Args:
+        x_with_bias: ``(x, bias)`` pair; ``bias`` may be ``None``.
+        residual: Tensor added to the dropout output (cast to ``x.dtype`` first).
+        prob: Dropout probability.
+        training: Forwarded to ``torch.nn.functional.dropout``.
+
+    Returns:
+        ``residual + dropout(x + bias)``, or ``residual + dropout(x)`` without bias.
+    """
+    # NOTE: `bias` is not expanded with `expand_as(residual)`; plain broadcasting
+    # in `x + bias` takes care of that (both only change the view).
+    x, bias = x_with_bias
+
+    # For mixed-precision training the output must stay in half precision.
+    # Under AMP O1 the residual arrives in fp32 and would up-cast the result,
+    # which makes pipeline-parallel GPU communication hang, so align it to x.
+    if residual.dtype != x.dtype:
+        residual = residual.to(x.dtype)
+
+    # Dropout and the residual add are repeated in both branches on purpose:
+    # hoisting them out of the conditional stops the bias-add / dropout /
+    # residual-add sequence from being fused.
+    if bias is not None:
+        biased = x + bias
+        dropped = torch.nn.functional.dropout(biased, p=prob, training=training)
+        return residual + dropped
+    else:
+        dropped = torch.nn.functional.dropout(x, p=prob, training=training)
+        return residual + dropped
+
+
+def bias_dropout_add_unfused(training):
+    """Return a non-fused bias-dropout-add callable bound to ``training``.
+
+    The returned function takes ``(x_with_bias, residual, prob)`` and forwards
+    them, together with the captured ``training`` flag, to
+    ``_bias_dropout_add_func``.
+    """
+
+    def _bias_dropout_add(x_with_bias, residual, prob):
+        result = _bias_dropout_add_func(x_with_bias, residual, prob, training)
+        return result
+
+    return _bias_dropout_add
+
+
+@jit_fuser
+def bias_dropout_add_fused_train(
+    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float
+) -> torch.Tensor:
+    """Fused bias-dropout-add with dropout in training mode (``training=True``)."""
+    fused_out = _bias_dropout_add_func(x_with_bias, residual, prob, True)
+    return fused_out
+
+
+@jit_fuser
+def bias_dropout_add_fused_inference(
+    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float
+) -> torch.Tensor:
+    """Fused bias-dropout-add with dropout in inference mode (``training=False``)."""
+    fused_out = _bias_dropout_add_func(x_with_bias, residual, prob, False)
+    return fused_out
+
+
+def get_bias_dropout_add(training, fused):
+    """Select the bias-dropout-add implementation for the given mode.
+
+    Args:
+        training: Whether dropout should run in training mode.
+        fused: Whether to return one of the ``jit_fuser``-decorated variants.
+
+    Returns:
+        A callable taking ``(x_with_bias, residual, prob)``.
+    """
+    if not fused:
+        return bias_dropout_add_unfused(training)
+
+    # jit scripting for a nn.module (with dropout) is not triggering the
+    # fusion kernel. For now, two different nn.functional routines are used
+    # to account for varying dropout semantics during training and inference.
+    if training:
+        return bias_dropout_add_fused_train
+    return bias_dropout_add_fused_inference
--- a/megatron/core/fusions/fused_bias_geglu.py
+++ b/megatron/core/fusions/fused_bias_geglu.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from megatron.core.jit import jit_fuser
+
+###### BIAS GELU FUSION/ NO AUTOGRAD ################
+# 1/sqrt(2*pi)-> 0.3989423
+# 1/sqrt(2)   -> 0.70710678
+# sqrt(2/pi)  -> 0.79788456
+# this function is tanh approximation of gelu
+# actual gelu is:
+# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
+
+
+@jit_fuser
+def geglu(y):
+    """Split ``y`` in two along the last dim and return ``gelu_tanh(y_1) * y_2``."""
+    y_1, y_2 = torch.chunk(y, 2, -1)
+    tanh_arg = 0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)
+    gelu_y_1 = y_1 * 0.5 * (1.0 + torch.tanh(tanh_arg))
+    return gelu_y_1 * y_2
+
+
+@jit_fuser
+def bias_geglu(bias, y):
+    """Add ``bias`` to ``y`` and apply ``geglu`` to the sum."""
+    biased = y + bias
+    return geglu(biased)
+
+
+# gradient of tanh approximation of gelu
+# gradient of actual gelu is:
+# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
+@jit_fuser
+def geglu_back(g, y):
+    """Backward of ``geglu``: gradient w.r.t. ``y`` given upstream gradient ``g``.
+
+    Returns the gradients for the two halves of ``y`` concatenated along the
+    last dimension, in the same order as ``torch.chunk`` produced them.
+    """
+    y_1, y_2 = torch.chunk(y, 2, -1)
+    tanh_out = torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1))
+    one_minus_tanh_sq = 1 - tanh_out * tanh_out
+    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
+    inner_derivative = 0.79788456 + 0.1070322243 * y_1 * y_1
+    ff = 0.5 * y_1 * (one_minus_tanh_sq * inner_derivative) + 0.5 * (1 + tanh_out)
+    grad_y_1 = (g * y_2) * ff
+    grad_y_2 = g * (y_1 * 0.5 * (1.0 + tanh_out))
+    return torch.cat((grad_y_1, grad_y_2), -1)
+
+
+@jit_fuser
+def bias_geglu_back(g, y, bias):
+    """Backward of ``bias_geglu``: re-add ``bias`` to ``y`` and call ``geglu_back``."""
+    biased = y + bias
+    return geglu_back(g, biased)
+
+
+class BiasGeGLUFunction(torch.autograd.Function):
+    """Autograd function pairing ``bias_geglu`` with ``bias_geglu_back``."""
+
+    @staticmethod
+    def forward(ctx, input, bias):
+        """Save ``input`` and ``bias`` for backward and return ``bias_geglu(bias, input)``.
+
+        ``bias`` is saved and added unconditionally here; the bias-free case
+        is handled by ``GeGLUFunction``.
+        """
+        ctx.save_for_backward(input, bias)
+        # Pass arguments in the order of the signature `bias_geglu(bias, y)`.
+        return bias_geglu(bias, input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the same gradient tensor for both ``input`` and ``bias``."""
+        input, bias = ctx.saved_tensors
+        tmp = bias_geglu_back(grad_output, input, bias)
+        # NOTE(review): the bias gradient is returned with the input's shape;
+        # presumably autograd reduces it to the bias shape - confirm.
+        return tmp, tmp
+
+
+class GeGLUFunction(torch.autograd.Function):
+    """Autograd function pairing ``geglu`` with ``geglu_back`` (no bias)."""
+
+    @staticmethod
+    def forward(ctx, input):
+        """Save ``input`` for backward and return ``geglu(input)``."""
+        ctx.save_for_backward(input)
+        return geglu(input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradient w.r.t. ``input``."""
+        (saved_input,) = ctx.saved_tensors
+        return geglu_back(grad_output, saved_input)
+
+
+def bias_geglu_impl(input, bias):
+    """Apply fused (bias +) GeGLU to a 2D or 3D ``input``.
+
+    The input is flattened to 2D on its last dimension, passed through
+    ``BiasGeGLUFunction`` (or ``GeGLUFunction`` when ``bias`` is ``None``),
+    and a 3D input gets its two leading dimensions restored on the output.
+    """
+    ori_shape = input.shape
+    assert len(ori_shape) in [2, 3]
+    flat_input = input.view(-1, ori_shape[-1])
+
+    if bias is None:
+        output = GeGLUFunction.apply(flat_input)
+    else:
+        output = BiasGeGLUFunction.apply(flat_input, bias)
+
+    if len(ori_shape) == 2:
+        return output
+    return output.view(ori_shape[0], ori_shape[1], -1)
--- a/megatron/core/fusions/fused_bias_gelu.py
+++ b/megatron/core/fusions/fused_bias_gelu.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from megatron.core.jit import jit_fuser
+
+# BIAS GELU FUSION/ NO AUTOGRAD ################
+# 1/sqrt(2*pi)-> 0.3989423
+# 1/sqrt(2)   -> 0.70710678
+# sqrt(2/pi)  -> 0.79788456
+# this function is tanh approximation of gelu
+# actual gelu is:
+# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
+
+
+@jit_fuser
+def bias_gelu(bias, y):
+    """Return the tanh-approximated GELU of ``bias + y``."""
+    x = bias + y
+    tanh_arg = 0.79788456 * x * (1 + 0.044715 * x * x)
+    return x * 0.5 * (1.0 + torch.tanh(tanh_arg))
+
+
+# gradient of tanh approximation of gelu
+# gradient of actual gelu is:
+# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
+@jit_fuser
+def bias_gelu_back(g, bias, y):
+    """Backward of ``bias_gelu``: scale upstream gradient ``g`` by d(gelu)/dx at ``bias + y``."""
+    x = bias + y
+    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+    one_minus_tanh_sq = 1 - tanh_out * tanh_out
+    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
+    inner_derivative = 0.79788456 + 0.1070322243 * x * x
+    ff = 0.5 * x * (one_minus_tanh_sq * inner_derivative) + 0.5 * (1 + tanh_out)
+    return ff * g
+
+
+class GeLUFunction(torch.autograd.Function):
+    """Autograd function pairing ``bias_gelu`` with ``bias_gelu_back``."""
+
+    @staticmethod
+    def forward(ctx, input, bias):
+        """Save ``input`` and ``bias`` for backward and return ``bias_gelu(bias, input)``."""
+        ctx.save_for_backward(input, bias)
+        activated = bias_gelu(bias, input)
+        return activated
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the same gradient tensor for both ``input`` and ``bias``."""
+        saved_input, saved_bias = ctx.saved_tensors
+        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
+        return grad, grad
+
+    # This is required to make Sphinx happy :-(
+    @classmethod
+    def apply(cls, *args, **kwargs):
+        return super().apply(*args, **kwargs)
+
+
+# Functional entry point: `bias_gelu_impl(input, bias)` is `GeLUFunction.apply`.
+bias_gelu_impl = GeLUFunction.apply
--- a/megatron/core/fusions/fused_bias_swiglu.py
+++ b/megatron/core/fusions/fused_bias_swiglu.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+import torch.nn.functional as F
+
+from megatron.core.jit import jit_fuser
+
+###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################
+
+
+@jit_fuser
+def swiglu(y):
+    """Split ``y`` in two along the last dim and return ``silu(y_1) * y_2``."""
+    y_1, y_2 = torch.chunk(y, 2, -1)
+    gate = F.silu(y_1)
+    return gate * y_2
+
+
+@jit_fuser
+def bias_swiglu(y, bias):
+    """Add ``bias`` to ``y`` and apply ``swiglu`` to the sum."""
+    biased = y + bias
+    return swiglu(biased)
+
+
+# gradient of swiglu(y) = silu(y_1) * y_2, where y_1, y_2 = chunk(y, 2, -1):
+#   d/dy_1 -> sigmoid(y_1) * (1 + y_1 * (1 - sigmoid(y_1))) * y_2
+#   d/dy_2 -> silu(y_1)
+@jit_fuser
+def swiglu_back(g, y):
+    """Backward of ``swiglu``: gradient w.r.t. ``y`` given upstream gradient ``g``.
+
+    Returns the gradients for the two halves of ``y`` concatenated along the
+    last dimension.
+    """
+    y_1, y_2 = torch.chunk(y, 2, -1)
+    sig = torch.sigmoid(y_1)
+    grad_y_1 = g * sig * (1 + y_1 * (1 - sig)) * y_2
+    grad_y_2 = g * F.silu(y_1)
+    return torch.cat((grad_y_1, grad_y_2), -1)
+
+
+@jit_fuser
+def bias_swiglu_back(g, y, bias):
+    """Backward of ``bias_swiglu``: re-add ``bias`` to ``y`` and call ``swiglu_back``."""
+    biased = y + bias
+    return swiglu_back(g, biased)
+
+
+class BiasSwiGLUFunction(torch.autograd.Function):
+    """Autograd function pairing ``bias_swiglu`` with ``bias_swiglu_back``."""
+
+    @staticmethod
+    def forward(ctx, input, bias, fp8_input_store):
+        """Compute ``bias_swiglu(input, bias)`` and stash tensors for backward.
+
+        When ``fp8_input_store`` is set, the copy of ``input`` kept for
+        backward is converted to ``torch.float8_e4m3fn``; the forward
+        computation itself still uses the original ``input``.
+        """
+        ctx.ori_input_dtype = input.dtype
+        ctx.fp8_input_store = fp8_input_store
+        if fp8_input_store:
+            saved_input = input.to(torch.float8_e4m3fn)
+        else:
+            saved_input = input
+        ctx.save_for_backward(saved_input, bias)
+        return bias_swiglu(input, bias)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return gradients for ``(input, bias, fp8_input_store)``; the flag gets ``None``."""
+        saved_input, bias = ctx.saved_tensors
+        if ctx.fp8_input_store:
+            # Restore the dtype the input had in forward.
+            saved_input = saved_input.to(ctx.ori_input_dtype)
+        grad = bias_swiglu_back(grad_output, saved_input, bias)
+        return grad, grad, None
+
+
+class SwiGLUFunction(torch.autograd.Function):
+    """Autograd function pairing ``swiglu`` with ``swiglu_back`` (no bias)."""
+
+    @staticmethod
+    def forward(ctx, input, fp8_input_store):
+        """Compute ``swiglu(input)`` and stash the input for backward.
+
+        When ``fp8_input_store`` is set, the copy of ``input`` kept for
+        backward is converted to ``torch.float8_e4m3fn``.
+        """
+        ctx.ori_input_dtype = input.dtype
+        ctx.fp8_input_store = fp8_input_store
+        if fp8_input_store:
+            saved_input = input.to(torch.float8_e4m3fn)
+        else:
+            saved_input = input
+        ctx.save_for_backward(saved_input)
+        return swiglu(input)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return gradients for ``(input, fp8_input_store)``; the flag gets ``None``."""
+        (saved_input,) = ctx.saved_tensors
+        if ctx.fp8_input_store:
+            # Restore the dtype the input had in forward.
+            saved_input = saved_input.to(ctx.ori_input_dtype)
+        grad = swiglu_back(grad_output, saved_input)
+        return grad, None
+
+
+def bias_swiglu_impl(input, bias, fp8_input_store=False):
+    """Apply fused (bias +) SwiGLU to a 2D or 3D ``input``.
+
+    The input is flattened to 2D on its last dimension, passed through
+    ``BiasSwiGLUFunction`` (or ``SwiGLUFunction`` when ``bias`` is ``None``),
+    and a 3D input gets its two leading dimensions restored on the output.
+    ``fp8_input_store`` is forwarded unchanged to the autograd function.
+    """
+    ori_shape = input.shape
+    assert len(ori_shape) in [2, 3]
+    flat_input = input.view(-1, ori_shape[-1])
+
+    if bias is None:
+        output = SwiGLUFunction.apply(flat_input, fp8_input_store)
+    else:
+        output = BiasSwiGLUFunction.apply(flat_input, bias, fp8_input_store)
+
+    if len(ori_shape) == 2:
+        return output
+    return output.view(ori_shape[0], ori_shape[1], -1)
+
+
+# bias_swiglu_impl = BiasSwiGLUFunction.apply
+# swiglu_impl = SwiGLUFunction.apply
--- a/megatron/core/fusions/fused_cross_entropy.py
+++ b/megatron/core/fusions/fused_cross_entropy.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Tuple
+
+import torch
+
+from megatron.core.jit import jit_fuser
+from megatron.core.parallel_state import (
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from megatron.core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
+from megatron.core.tensor_parallel.utils import VocabUtility
+
+
+@jit_fuser
+def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Fusable wrapper around ``VocabParallelCrossEntropy.calculate_logits_max``.
+
+    Returns the ``(vocab_parallel_logits, logits_max)`` pair produced by the helper.
+    """
+    result = VocabParallelCrossEntropy.calculate_logits_max(vocab_parallel_logits)
+    vocab_parallel_logits, logits_max = result
+
+    return vocab_parallel_logits, logits_max
+
+
+@jit_fuser
+def calculate_predicted_logits(
+    vocab_parallel_logits: torch.Tensor,
+    target: torch.Tensor,
+    logits_max: torch.Tensor,
+    vocab_start_index: int,
+    vocab_end_index: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fusable wrapper around ``VocabParallelCrossEntropy.calculate_predicted_logits``.
+
+    ``predicted_logits`` and ``sum_exp_logits`` are concatenated into a single
+    tensor so that the caller can reduce both with one all-reduce.
+
+    Returns:
+        ``(target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits)``.
+    """
+    helper_outputs = VocabParallelCrossEntropy.calculate_predicted_logits(
+        vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index
+    )
+    (target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits) = helper_outputs
+
+    packed = torch.cat((predicted_logits, sum_exp_logits))
+
+    return target_mask, masked_target_1d, packed, exp_logits
+
+
+@jit_fuser
+def calculate_cross_entropy_loss(
+    exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Split the packed tensor in half and compute the cross-entropy loss.
+
+    The first half of ``predicted_logits_sum_exp_logits`` (along dim 0) is
+    treated as ``predicted_logits`` and the second half as ``sum_exp_logits``;
+    both are handed to ``VocabParallelCrossEntropy.calculate_cross_entropy_loss``.
+
+    Returns:
+        ``(exp_logits, loss)`` as produced by the helper.
+    """
+    half = predicted_logits_sum_exp_logits.size()[0] // 2
+    predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, half)
+
+    result = VocabParallelCrossEntropy.calculate_cross_entropy_loss(
+        exp_logits, predicted_logits, sum_exp_logits
+    )
+    exp_logits, loss = result
+
+    return exp_logits, loss
+
+
+@jit_fuser
+def calculate_gradients(
+    softmax: torch.Tensor,
+    grad_output: torch.Tensor,
+    target_mask: torch.Tensor,
+    masked_target_1d: torch.Tensor,
+) -> torch.Tensor:
+    """Compute the gradient w.r.t. the logits from the saved softmax.
+
+    Delegates to ``VocabParallelCrossEntropy.prepare_gradient_calculation_operands``
+    and ``VocabParallelCrossEntropy.calculate_gradients``, then casts the
+    result to ``torch.bfloat16``.
+    """
+    operands = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(
+        softmax, target_mask
+    )
+    (grad_2d, arange_1d, softmax_update, grad_input) = operands
+
+    grad_input = VocabParallelCrossEntropy.calculate_gradients(
+        grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output
+    )
+
+    # NOTE(review): the cast to bf16 is unconditional; confirm this is intended
+    # when the logits are not bf16.
+    return grad_input.to(torch.bfloat16)
+
+
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+    """Autograd function for cross entropy over vocab-parallel logits (fused path)."""
+
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+        """Compute the loss, all-reducing across the tensor-model-parallel group."""
+
+        vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits)
+        torch.distributed.all_reduce(
+            logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()
+        )
+
+        # Get the partition's vocab indices
+        vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_per_partition_vocab_size(
+            vocab_parallel_logits.size()[-1],
+            get_tensor_model_parallel_rank(),
+            get_tensor_model_parallel_world_size(),
+        )
+
+        partials = calculate_predicted_logits(
+            vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index
+        )
+        (target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits) = partials
+
+        # All reduce is needed to get the chunks from other GPUs.
+        # In the fused case, tensors are batched to invoke a single
+        # AllReduce call
+        torch.distributed.all_reduce(
+            predicted_logits_sum_exp_logits,
+            op=torch.distributed.ReduceOp.SUM,
+            group=get_tensor_model_parallel_group(),
+        )
+
+        exp_logits, loss = calculate_cross_entropy_loss(exp_logits, predicted_logits_sum_exp_logits)
+
+        # Store softmax, target-mask and masked-target for backward pass.
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradient for the logits; ``target`` gets ``None``."""
+
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+
+        logits_grad = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d)
+
+        return logits_grad, None
+
+
+def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target):
+    """
+    Performs cross entropy loss when logits are split across tensor parallel ranks
+
+    Args:
+        vocab_parallel_logits: logits split across tensor parallel ranks;
+                               the last dimension is this rank's vocab partition
+                               (presumably [sequence_length, batch_size,
+                               vocab_size // tp_size] - confirm with caller)
+
+        target: correct vocab ids of dimension [sequence_length, micro_batch_size]
+
+    """
+    loss = _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
+    return loss
--- a/megatron/core/fusions/fused_layer_norm.py
+++ b/megatron/core/fusions/fused_layer_norm.py
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+
+import importlib
+import inspect
+import numbers
+
+import torch
+from torch import Tensor
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+from megatron.core.transformer import TransformerConfig
+from megatron.core.utils import make_viewless_tensor
+
+# Optional Apex kernels: each flag records whether the corresponding import
+# succeeded so that FusedLayerNorm can pick a kernel (or fail) at construction.
+try:
+    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
+
+    HAVE_PERSIST_LAYER_NORM = True
+except ImportError:
+    # Persistent layer-norm kernel is unavailable.
+    HAVE_PERSIST_LAYER_NORM = False
+
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
+
+    HAVE_FUSED_LAYER_NORM = True
+except ImportError:
+    # Non-persistent fused layer-norm kernel is unavailable.
+    HAVE_FUSED_LAYER_NORM = False
+
+
+class FusedLayerNorm(torch.nn.Module):
+    """Layer Norm, fused into a single CUDA kernel.
+
+    Args:
+      config (TransformerConfig): Transformer config. Supplies
+      ``layernorm_zero_centered_gamma``, ``normalization``,
+      ``persist_layer_norm``, ``sequence_parallel`` and
+      ``memory_efficient_layer_norm``.
+
+      hidden_size (int): Transformer hidden dimension.
+
+      eps (float): Epsilon added to denominator, for numerical stability.
+
+      persist_layer_norm (bool): Not read; the value comes from
+      ``config.persist_layer_norm``. The persistent kernel supports only a set
+      of hidden sizes (see ``persist_ln_hidden_sizes``); other sizes fall back
+      to the non-persistent kernel.
+
+      zero_centered_gamma (bool): Not read; the value comes from
+      ``config.layernorm_zero_centered_gamma``. When set, the stored weight is
+      centered around zero and ``+ 1`` is applied in ``forward``.
+
+      normalization (str): Not read; included to match the Transformer Engine
+      interface. ``config.normalization`` must equal 'LayerNorm'.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        hidden_size: int,
+        eps: float = 1e-5,
+        persist_layer_norm: bool = True,
+        zero_centered_gamma: bool = False,
+        normalization: str = "LayerNorm",  # included to match TE interface
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.zero_centered_gamma = self.config.layernorm_zero_centered_gamma
+        assert (
+            self.config.normalization == "LayerNorm"
+        ), f'({self.config.normalization}) is not supported in FusedLayerNorm'
+
+        # Hidden sizes supported by the persistent layer norm kernel. Any
+        # other size falls back to the non-persistent kernel.
+        persist_ln_hidden_sizes = [
+            1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120,
+            6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432,
+            20480, 24576, 25600, 30720, 32768, 40960, 49152, 65536,
+        ]
+        use_persist = self.config.persist_layer_norm
+        if not HAVE_PERSIST_LAYER_NORM or hidden_size not in persist_ln_hidden_sizes:
+            use_persist = False
+
+        if not use_persist and not HAVE_FUSED_LAYER_NORM:
+            # TODO: Add pytorch only layer norm
+            raise ValueError('Apex must be installed to use FusedLayerNorm.')
+
+        if isinstance(hidden_size, numbers.Integral):
+            hidden_size = (hidden_size,)
+        self.hidden_size = torch.Size(hidden_size)
+        self.eps = eps
+        # Parameters need to be initialized with torch.empty rather than
+        # torch.Tensor for correct device placement with nemo2.
+        self.weight = Parameter(torch.empty(*hidden_size))
+        self.bias = Parameter(torch.empty(*hidden_size))
+        self.reset_parameters()
+        self.persist_layer_norm = use_persist
+        self.sequence_parallel = self.config.sequence_parallel
+
+        # set sequence parallelism flag on weight and bias parameters
+        for param in (self.weight, self.bias):
+            setattr(param, 'sequence_parallel', self.sequence_parallel)
+
+    def reset_parameters(self):
+        """Initialize weight to zeros (zero-centered gamma) or ones, and bias to zeros."""
+
+        if self.zero_centered_gamma:
+            init.zeros_(self.weight)
+        else:
+            init.ones_(self.weight)
+        init.zeros_(self.bias)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply layer norm using the persistent or non-persistent Apex kernel."""
+
+        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
+
+        if not self.persist_layer_norm:
+            kernel_args = [input, weight, self.bias, self.hidden_size, self.eps]
+            # Only pass the flag to Apex versions whose forward() accepts it.
+            if (
+                'memory_efficient'
+                in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args
+            ):
+                kernel_args.append(self.config.memory_efficient_layer_norm)
+            return FusedLayerNormAffineFunction.apply(*kernel_args)
+
+        kernel_args = [input, weight, self.bias, self.eps]
+        # Only pass the flag to Apex versions whose forward() accepts it.
+        if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args:
+            kernel_args.append(self.config.memory_efficient_layer_norm)
+        output = FastLayerNormFN.apply(*kernel_args)
+
+        # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
+        # a populated '_base' field). This will result in schedule.py's
+        # deallocate_output_tensor() throwing an error, so a viewless tensor is
+        # created to prevent this.
+        output = make_viewless_tensor(
+            inp=output, requires_grad=input.requires_grad, keep_graph=True
+        )
+
+        return output
--- a/megatron/core/fusions/fused_softmax.py
+++ b/megatron/core/fusions/fused_softmax.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.utils import get_default_causal_mask
+
+
+class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs following three operations in sequence
+    1. Scale the tensor.
+    2. Apply upper triangular mask (typically used in gpt models).
+    3. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        """Run the fused CUDA forward kernel and save its output and the scale."""
+        import scaled_upper_triang_masked_softmax_cuda as kernel
+
+        scale_t = torch.tensor([scale])
+        probs = kernel.forward(inputs, scale_t[0])
+
+        ctx.save_for_backward(probs, scale_t)
+        return probs
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        """Return the gradient for ``inputs``; ``scale`` gets ``None``."""
+        import scaled_upper_triang_masked_softmax_cuda as kernel
+
+        probs, scale_t = ctx.saved_tensors
+        input_grads = kernel.backward(output_grads, probs, scale_t[0])
+
+        return input_grads, None
+
+
+class ScaledMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs following three operations in sequence
+    1. Scale the tensor.
+    2. Apply the mask.
+    3. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, mask, scale):
+        """Run the fused CUDA forward kernel and save its output and the scale."""
+        import scaled_masked_softmax_cuda as kernel
+
+        scale_t = torch.tensor([scale])
+        probs = kernel.forward(inputs, mask, scale_t[0])
+
+        ctx.save_for_backward(probs, scale_t)
+        return probs
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        """Return the gradient for ``inputs``; ``mask`` and ``scale`` get ``None``."""
+        import scaled_masked_softmax_cuda as kernel
+
+        probs, scale_t = ctx.saved_tensors
+        input_grads = kernel.backward(output_grads, probs, scale_t[0])
+
+        return input_grads, None, None
+
+
+class ScaledSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs following two operations in sequence
+    1. Scale the tensor.
+    2. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        """Run the fused CUDA forward kernel and save its output and the scale."""
+        import scaled_softmax_cuda
+
+        scale_t = torch.tensor([scale])
+
+        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        """Return the gradient for ``inputs``; ``scale`` gets ``None``."""
+        import scaled_softmax_cuda
+
+        softmax_results, scale_t = ctx.saved_tensors
+
+        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+        # One gradient per forward() input (inputs, scale); the previous
+        # third `None` had no matching forward argument.
+        return input_grads, None
+
+
+class FusedScaleMaskSoftmax(nn.Module):
+    """
+    fused operation: scaling + mask + softmax
+
+    Args:
+        input_in_fp16: flag to indicate if input in fp16 data format.
+        input_in_bf16: flag to indicate if input in bf16 data format.
+        attn_mask_type: attention mask type (pad or causal)
+        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
+        mask_func: mask function to be applied.
+        softmax_in_fp32: if true, softmax in performed at fp32 precision.
+        scale: scaling factor used in input tensor scaling.
+    """
+
+    def __init__(
+        self,
+        input_in_fp16,
+        input_in_bf16,
+        attn_mask_type,
+        scaled_masked_softmax_fusion,
+        mask_func,
+        softmax_in_fp32,
+        scale,
+    ):
+        super(FusedScaleMaskSoftmax, self).__init__()
+        self.input_in_fp16 = input_in_fp16
+        self.input_in_bf16 = input_in_bf16
+        assert not (
+            self.input_in_fp16 and self.input_in_bf16
+        ), "both fp16 and bf16 flags cannot be active at the same time."
+        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
+        self.attn_mask_type = attn_mask_type
+        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
+        self.mask_func = mask_func
+        self.softmax_in_fp32 = softmax_in_fp32
+        self.scale = scale
+
+        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
+
+    def forward(self, input: torch.Tensor, mask: Optional[torch.Tensor]):
+        """Forward pass of softmax with masked input.
+
+        In case attn_mask_type is causal the mask is generated and None can be passed.
+        A user-defined mask is only needed when attn_mask_type is not causal.
+        """
+        # [b, np, sq, sk]
+        assert input.dim() == 4
+
+        if self.is_kernel_available(mask, *input.size()):
+            return self.forward_fused_softmax(input, mask)
+        else:
+            return self.forward_torch_softmax(input, mask)
+
+    def is_kernel_available(self, mask, b, np, sq, sk):
+        """Return True when the fused CUDA softmax kernel can handle this input.
+
+        Args:
+            mask: Unused here; accepted for interface symmetry with ``forward``.
+            b, np, sq, sk: The four sizes of the ``[b, np, sq, sk]`` input.
+        """
+        attn_batches = b * np
+
+        preconditions_met = (
+            self.scaled_masked_softmax_fusion  # user wants to fuse
+            and self.input_in_float16  # input must be fp16 or bf16
+            and 16 < sk <= 4096  # sk must lie in (16, 4096]
+            and sq % 4 == 0  # sq must be a multiple of 4
+            and sk % 4 == 0  # sk must be a multiple of 4
+            and attn_batches % 4 == 0  # np * b must be a multiple of 4
+        )
+        if not preconditions_met:
+            return False
+
+        # The former inner `0 <= sk <= 4096` test was implied by the range
+        # check above, so it has been dropped.
+        batch_per_block = self.get_batch_per_block(sq, sk, b, np)
+
+        if self.attn_mask_type == AttnMaskType.causal:
+            return attn_batches % batch_per_block == 0
+        return sq % batch_per_block == 0
+
+    def forward_fused_softmax(self, input, mask):
+        """Dispatch to the fused CUDA softmax variant matching the mask type."""
+        b, np, sq, sk = input.size()
+        scale = self.scale if self.scale is not None else 1.0
+
+        if self.attn_mask_type == AttnMaskType.causal:
+            assert sq == sk, "causal mask is only for self attention"
+
+            # input is 3D tensor (attn_batches, sq, sk)
+            input = input.view(-1, sq, sk)
+            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
+            return probs.view(b, np, sq, sk)
+        else:
+            # input is 4D tensor (b, np, sq, sk)
+            if mask is not None:
+                return ScaledMaskedSoftmax.apply(input, mask, scale)
+            else:
+                return ScaledSoftmax.apply(input, scale)
+
+    def forward_torch_softmax(self, input, mask):
+        """Unfused fallback: scale, mask and softmax with plain PyTorch ops."""
+        if self.input_in_float16 and self.softmax_in_fp32:
+            input = input.float()
+
+        if self.scale is not None:
+            input = input * self.scale
+
+        # Generate causal mask if not given
+        sq, sk = input.size(2), input.size(3)
+        if self.attn_mask_type == AttnMaskType.causal and mask is None and sq > 1:
+            # If sq == 1 then either KV cache is used or one-element context is passed
+            # so keeping mask=None in this case; subsequent code should handle it
+            assert sq == sk, "causal mask is only for self attention"
+            mask = get_default_causal_mask(sq)
+
+        mask_output = self.mask_func(input, mask) if mask is not None else input
+        probs = torch.nn.Softmax(dim=-1)(mask_output)
+
+        # Cast back to the half-precision dtype the input arrived in.
+        if self.input_in_float16 and self.softmax_in_fp32:
+            if self.input_in_fp16:
+                probs = probs.half()
+            else:
+                probs = probs.bfloat16()
+
+        return probs
+
+    @staticmethod
+    def get_batch_per_block(sq, sk, b, np):
+        """Query the CUDA extension for its batch-per-block value for these sizes."""
+        import scaled_masked_softmax_cuda
+
+        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
--- a/megatron/core/inference/__init__.py
+++ b/megatron/core/inference/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
--- a/megatron/core/inference/ammo_support/__init__.py
+++ b/megatron/core/inference/ammo_support/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import warnings
+
+# Emitted at import time: this package only remains as a deprecated alias for
+# `megatron.core.inference.modelopt_support`.
+warnings.warn(
+    "The 'megatron.core.inference.ammo_support' module is deprecated and will be removed in a future release. "
+    "Please use megatron.core.inference.modelopt_support instead",
+    DeprecationWarning,
+)
--- a/megatron/core/inference/ammo_support/gpt/model_specs.py
+++ b/megatron/core/inference/ammo_support/gpt/model_specs.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec
--- a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py
+++ b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import (
+    mcore_gpt_load_legacy_state_dict_pre_hook,
+    mcore_gpt_load_te_state_dict_pre_hook,
+)
--- a/megatron/core/inference/common_inference_params.py
+++ b/megatron/core/inference/common_inference_params.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+from megatron.core.inference.sampling_params import (  # noqa: F401 # pylint: disable=unused-import
+    SamplingParams as CommonInferenceParams,
+)
--- a/megatron/core/inference/communication_utils.py
+++ b/megatron/core/inference/communication_utils.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import torch
+
+from megatron.core import parallel_state
+
+
+def _is_cuda(tensor):
+    """Assert that ``tensor`` is not None and lives on a CUDA device.
+
+    Despite the predicate-style name this returns nothing; it raises
+    ``AssertionError`` when either condition fails.
+    """
+    assert tensor is not None
+    on_cuda = tensor.is_cuda
+    assert on_cuda
+
+
+def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
+    """Broadcast a tensor from last pipeline stage to all ranks.
+
+    On the last pipeline stage ``tensor`` must be a contiguous CUDA tensor and
+    is used as the broadcast source. On every other stage ``tensor`` is
+    ignored and an empty buffer of ``size``/``dtype`` is allocated on the
+    current CUDA device to receive the data.
+
+    Returns:
+        The broadcast tensor (the source on the last stage, the filled buffer elsewhere).
+    """
+
+    if not parallel_state.is_pipeline_last_stage():
+        tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device())
+    else:
+        _is_cuda(tensor)
+        assert tensor.is_contiguous()
+
+    # Get the source rank and the corresponding group.
+    last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
+    pp_group = parallel_state.get_pipeline_model_parallel_group()
+    torch.distributed.broadcast(tensor, last_rank, pp_group)
+    return tensor
+
+
+def recv_from_prev_pipeline_rank_(recv_buffer=None):
+    """Receive from previous pipeline stage and update the
+    input buffer inplace."""
+    prev_rank = parallel_state.get_pipeline_model_parallel_prev_rank()
+    recv_prev_op = torch.distributed.P2POp(torch.distributed.irecv, recv_buffer, prev_rank)
+
+    # Block until every posted request has completed.
+    for request in torch.distributed.batch_isend_irecv([recv_prev_op]):
+        request.wait()
+
+    # To protect against race condition when using batch_isend_irecv().
+    torch.cuda.synchronize()
+
+
+def send_to_next_pipeline_rank(tensor=None):
+    """Send output to the next pipeline stage."""
+    next_rank = parallel_state.get_pipeline_model_parallel_next_rank()
+    send_next_op = torch.distributed.P2POp(torch.distributed.isend, tensor, next_rank)
+
+    # Block until every posted request has completed.
+    for request in torch.distributed.batch_isend_irecv([send_next_op]):
+        request.wait()
+
+    # To protect against race condition when using batch_isend_irecv().
+    torch.cuda.synchronize()
--- a/megatron/core/inference/engines/__init__.py
+++ b/megatron/core/inference/engines/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
--- a/megatron/core/inference/engines/abstract_engine.py
+++ b/megatron/core/inference/engines/abstract_engine.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class AbstractEngine(ABC):
+    """Abstract base class for inference engine backends."""
+
+    # `generate` declares `self`, so it is an ordinary abstract instance
+    # method; the earlier `@staticmethod` decorator contradicted the signature.
+    @abstractmethod
+    def generate(self) -> dict:
+        """The abstract backend's generate function.
+
+        To define a new backend, implement this and return the outputs as a dictionary.
+
+        Returns:
+            dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`.
+        """