"""Allows the model to jointly attend to information from different
r"""Allows the model to jointly attend to information from different
representation subspaces as described in the paper:
`Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
.. note::
Argument :attr:`attention_mask` in the ``forward`` call is only used when
:attr:`attn_mask_type` includes ``"padding"`` or ``"arbitrary"``.
.. warning::
FlashAttention uses a non-deterministic algorithm for optimal performance. To observe
deterministic behavior at the cost of performance, use FlashAttention version >= ``2.4.1``
and set the environment variable :attr:`NVTE_ALLOW_NONDETERMINISTIC_ALGO=0`. In order
to disable ``flash-attn`` entirely, set :attr:`NVTE_FLASH_ATTN=0`.
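A minimal sketch of setting these variables (an illustrative assumption: they are set in the
process environment before the attention module is first used):

.. code-block:: python

    import os

    # Request deterministic FlashAttention (requires flash-attn >= 2.4.1).
    os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0"

    # Alternatively, disable flash-attn entirely.
    # os.environ["NVTE_FLASH_ATTN"] = "0"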
.. note::
Transformer Engine stores the FP8 metadata under a ``._extra_state`` key when checkpointing.
As the FP8 attention support expands from one backend to multiple backends, the location
of that key has also shifted (see `FP8 checkpoint compatibility <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/faq.html#fp8-checkpoint-compatibility>`_).
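One way to see where that key ends up in a given checkpoint (a hypothetical inspection
snippet, assuming the file holds a flat ``state_dict``):

.. code-block:: python

    import torch

    state_dict = torch.load("checkpoint.pt", map_location="cpu")  # hypothetical path
    fp8_meta_keys = [k for k in state_dict if k.endswith("._extra_state")]
    print(fp8_meta_keys)  # shows which modules the FP8 metadata is attached to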
...
...
kv_channels : Union[int, Tuple[int, int]]
the head size in key and value tensors. If the same, :attr:`kv_channels` can be
an integer; if not, :attr:`kv_channels` should be a tuple of two integers.
num_gqa_groups : Optional[int], default = None
number of GQA groups in the transformer layer.
Grouped Query Attention is described in
`this paper <https://arxiv.org/pdf/2305.13245.pdf>`_.
This only affects the keys and values, not the queries.
GQA-1 is equivalent to Multi-Query Attention
(`MQA <https://arxiv.org/pdf/1911.02150.pdf>`_), while GQA-H
is equivalent to MHA, i.e. ``num_gqa_groups = num_attention_heads``.
attention_dropout: float, default = 0.0
dropout probability for the dropout op during multi-head attention.
attn_mask_type: str, default = ``"causal"``
type of attention mask passed into softmax operation; options include ``"no_mask"``,
``"padding"``, ``"causal"``, ``"causal_bottom_right"``, and ``"arbitrary"``.
window_size: Optional[Tuple[int, int]], default = None
sliding window size for local attention, where query at position i attends to keys
in ``[i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]]``
inclusive. Special cases ``(-1, -1)`` and ``(-1, 0)`` mean no sliding
window and causal mask specifically. Both ``"causal"`` and ``"causal_bottom_right"`` masks
map to ``window_size = (-1, 0)`` and Transformer Engine distinguishes them based on
``attn_mask_type``. Similar to :attr:`attn_mask_type`, ``window_size`` can
be overridden by :attr:`window_size` in :meth:`forward` as well (see the illustrative
sketch below).
num_gqa_groups : int, default = None
number of GQA groups in the transformer layer.
Grouped Query Attention is described in
`this paper <https://arxiv.org/pdf/2305.13245.pdf>`_.
This only affects the keys and values, not the queries.
GQA-1 is equivalent to Multi-Query Attention
(`MQA <https://arxiv.org/pdf/1911.02150.pdf>`_), while GQA-H
is equivalent to MHA, i.e. ``num_gqa_groups = num_attention_heads``.
return_layernorm_output : bool, default = False
if set to ``True``, output of layernorm is returned from the :meth:`forward` method
together with the output of the linear transformation.
Example use case: residual connection for transformer module is
taken post layernorm.
input_layernorm: bool, default = False
if set to ``True``, layer normalization is applied to the input.
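The sliding-window interval in the :attr:`window_size` description above can be made concrete
with a small, self-contained sketch (the helper below is hypothetical and not part of the
Transformer Engine API; it only evaluates that interval):

.. code-block:: python

    def attended_key_range(i, seqlen_q, seqlen_k, window_size):
        """Hypothetical helper: inclusive range of key positions attended by query i."""
        left, right = window_size
        # -1 means "unbounded" on that side, as in the (-1, -1) and (-1, 0) special cases.
        lo = 0 if left == -1 else i + seqlen_k - seqlen_q - left
        hi = seqlen_k - 1 if right == -1 else i + seqlen_k - seqlen_q + right
        return max(lo, 0), min(hi, seqlen_k - 1)

    # window_size = (-1, 0), i.e. a causal mask: query 3 of 4 attends to keys 0..7 of 8.
    print(attended_key_range(i=3, seqlen_q=4, seqlen_k=8, window_size=(-1, 0)))  # (0, 7)

    # A genuine sliding window: 2 keys to the left, 0 to the right.
    print(attended_key_range(i=3, seqlen_q=4, seqlen_k=8, window_size=(2, 0)))   # (5, 7)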