Commit 0d99ae1f authored by silencealiang's avatar silencealiang
Browse files

add

parent c271aaae
Pipeline #2498 canceled with stages
......@@ -12,6 +12,9 @@ from typing import Any, Callable, Iterable, NamedTuple, Optional, Set, Tuple, Un
import os

import numpy as np
import torch

import megatron.core.parallel_state as mpu
from megatron.core.dist_checkpointing.mapping import ShardedObject
"""DISCLAIMER: THIS IS AN EXPERIMENTAL FEATURE.
The rerun state machine implementation in this file is alpha-level code to help
......@@ -34,6 +37,7 @@ EXIT_CODE_RESUME_TO_DISAMBIGUATE: int = 16
EXIT_CODE_FAILED_ON_RESULT_VALIDATION: int = 17
SerializableStateType = Union[list, dict]
DataIteratorArgType = Optional[Union["RerunDataIterator", list["RerunDataIterator"]]]
class Caller(NamedTuple):
......@@ -203,12 +207,14 @@ class RerunStateMachine:
self.saved_results: dict[Call, Any] = {}
self.stats: dict[Caller, QuickStats] = defaultdict(lambda: QuickStats())
logger.warning(f"RerunStateMachine initialized in mode {mode}")
if _safe_get_rank() == 0:
logger.warning(f"RerunStateMachine initialized in mode {mode}")
def set_mode(self, mode: RerunMode) -> None:
"""Method to set the operating mode"""
logger.warning(f"Setting RerunStateMachine mode {mode}")
if _safe_get_rank() == 0:
logger.warning(f"Setting RerunStateMachine mode {mode}")
self.mode = mode
def get_mode(self) -> RerunMode:
......@@ -216,9 +222,7 @@ class RerunStateMachine:
return self.mode
def should_run_forward_backward(
self, data_iterator: Optional[Union["RerunDataIterator", list]]
) -> bool:
def should_run_forward_backward(self, data_iterator: DataIteratorArgType) -> bool:
"""Method instructing whether to (re)run the forward-backward pass.
Args:
......@@ -243,17 +247,7 @@ class RerunStateMachine:
self.validation_counts = defaultdict(int)
data_iterators: list[RerunDataIterator] = []
if self.mode != RerunMode.DISABLED and data_iterator is not None:
if not isinstance(data_iterator, list):
data_iterators = [data_iterator]
else:
data_iterators = data_iterator
for d in data_iterators:
assert (
isinstance(d, RerunDataIterator),
"data iterator is not wrapped with RerunDataIterator",
)
data_iterators: list[RerunDataIterator] = self._sanitize_data_iterators(data_iterator)
# Are we about to start the initial run?
if self.state == RerunState.NOT_RUNNING_YET:
......@@ -263,10 +257,9 @@ class RerunStateMachine:
if self.data_iterator_checkpoints is not None:
assert (
len(self.data_iterator_checkpoints) == len(data_iterators),
"data_iterator has different length than checkpointed data iterator",
)
), "data iterator has different length than checkpointed data iterator"
for i, d in enumerate(data_iterators):
d.set_checkpoint_state(self.data_iterator_checkpoints[i])
d.load_state_dict(self.data_iterator_checkpoints[i])
self.data_iterator_checkpoints = None
self._save_state()
if data_iterators:
......@@ -632,17 +625,15 @@ class RerunStateMachine:
self.last_loss = loss
return result
def get_checkpoint_state(
self, data_iterator: Optional[Union["RerunDataIterator", list]]
) -> list[dict[str, Any]]:
def state_dict(self, data_iterator: DataIteratorArgType, use_dist_ckpt: bool) -> dict[str, Any]:
"""Method that returns a state dict to be checkpointed.
Args:
data_iterator: the data iterator that needs to be checkpointed (or None
if this checkpoint is not requested by the rerun state machine).
use_dist_ckpt: generate a distributed checkpoint.
Returns:
A list of state dicts, each state dict representing the rerun state machine
for one rank.
A state dict representing the rerun state machine.
Example usage:
......@@ -651,26 +642,15 @@ class RerunStateMachine:
...
rerun_state_machine = get_rerun_state_machine()
checkpoint['rerun_state_machine'] = (
rerun_state_machine.get_checkpoint_state(data_iterator)
rerun_state_machine.state_dict(data_iterator, False)
)
...
return checkpoint
"""
data_iterators: list[RerunDataIterator]
if self.mode == RerunMode.DISABLED:
data_iterators = []
elif isinstance(data_iterator, (list, tuple)):
data_iterators = data_iterator
else:
data_iterators = [data_iterator] if data_iterator is not None else []
for d in data_iterators:
assert (
isinstance(d, RerunDataIterator),
"data iterator is not wrapped with RerunDataIterator",
)
data_iterators: list[RerunDataIterator] = self._sanitize_data_iterators(data_iterator)
state: dict[str, Any] = {
state_dict: dict[str, Any] = {
'mode': self.mode,
'state': self.state,
'current_iteration': self.current_iteration,
......@@ -679,7 +659,7 @@ class RerunStateMachine:
'restart_again_requested': self.restart_again_requested,
'continue_requested': self.continue_requested,
# logged_sdc_enabled should not be saved (set at the job startup time).
'error_injector_checkpoint': self.error_injector.get_checkpoint_state(),
'error_injector_checkpoint': self.error_injector.state_dict(),
# validation_counts should not be saved (reset at the beginning of the training loop).
'failed_validation_call': self.failed_validation_call,
'initial_result': self.initial_result,
......@@ -687,29 +667,31 @@ class RerunStateMachine:
'suspicious_device': self.suspicious_device,
# No need to save saved_state (RNG state already captured in checkpoint).
'data_iterator_checkpoints': (
[d.get_checkpoint_state() for d in data_iterators] if data_iterators else None
[d.state_dict() for d in data_iterators] if data_iterators else None
),
'last_loss': self.last_loss,
# No need to save saved_results and stats (resets when job resumes).
}
state_list: list[dict[str, Any]]
if (
torch.distributed.is_initialized()
and torch.distributed.get_world_size() > 1
and self.mode != RerunMode.DISABLED
):
state_list = [None for i in range(torch.distributed.get_world_size())]
torch.distributed.all_gather_object(state_list, state)
else:
state_list = [state]
return state_list
if use_dist_ckpt:
pp_rank = mpu.get_pipeline_model_parallel_rank()
pp_size = mpu.get_pipeline_model_parallel_world_size()
tp_rank = mpu.get_tensor_model_parallel_rank()
tp_size = mpu.get_tensor_model_parallel_world_size()
state_dict = ShardedObject(
'rerun_state_machine_state',
state_dict,
(pp_size, tp_size),
(pp_rank, tp_rank),
replica_id=mpu.get_data_parallel_rank(with_context_parallel=True),
)
return state_dict
def set_checkpoint_state(self, state_list: list[dict[str, Any]]) -> None:
def load_state_dict(self, state_dict: dict[str, Any]) -> None:
"""Method that restores the state from a checkpoint.
Args:
state_list: the list of state dicts saved in the checkpoint and originally
obtained from get_checkpoint_state().
state_dict: the state dict saved in the checkpoint and originally
obtained from state_dict().
Returns:
None
......@@ -719,31 +701,43 @@ class RerunStateMachine:
...
if 'rerun_state_machine' in checkpoint:
rerun_state_machine = get_rerun_state_machine()
rerun_state_machine.set_checkpoint_state(checkpoint['rerun_state_machine'])
rerun_state_machine.load_state_dict(checkpoint['rerun_state_machine'])
"""
if self.mode == RerunMode.DISABLED:
return
rank: int = _safe_get_rank()
if rank == 0:
logger.warning(
"Getting RerunStaeMachine state from checkpoint, args rerun options ignored"
)
state = state_list[rank]
self.mode = state['mode']
self.state = state['state']
self.current_iteration = state['current_iteration']
self.rerun_requested = state['rerun_requested']
self.checkpoint_requested = state['checkpoint_requested']
self.restart_again_requested = state['restart_again_requested']
self.continue_requested = state['continue_requested']
self.error_injector.set_checkpoint_state(state['error_injector_checkpoint'])
self.failed_validation_call = state['failed_validation_call']
self.initial_result = state['initial_result']
self.suspicious_node = state['suspicious_node']
self.suspicious_device = state['suspicious_device']
self.data_iterator_checkpoints = state['data_iterator_checkpoints']
self.last_loss = state['last_loss']
logger.warning("Getting RerunStaeMachine state from checkpoint, args rerun options ignored")
self.mode = state_dict['mode']
self.state = state_dict['state']
self.current_iteration = state_dict['current_iteration']
self.rerun_requested = state_dict['rerun_requested']
self.checkpoint_requested = state_dict['checkpoint_requested']
self.restart_again_requested = state_dict['restart_again_requested']
self.continue_requested = state_dict['continue_requested']
self.error_injector.load_state_dict(state_dict['error_injector_checkpoint'])
self.failed_validation_call = state_dict['failed_validation_call']
self.initial_result = state_dict['initial_result']
self.suspicious_node = state_dict['suspicious_node']
self.suspicious_device = state_dict['suspicious_device']
self.data_iterator_checkpoints = state_dict['data_iterator_checkpoints']
self.last_loss = state_dict['last_loss']
def _sanitize_data_iterators(
    self, data_iterator: DataIteratorArgType
) -> list["RerunDataIterator"]:
    """Normalize the data_iterator argument into a flat list of RerunDataIterators.

    Args:
        data_iterator: a single RerunDataIterator, a list of them, or None.

    Returns:
        A (possibly empty) list of RerunDataIterator objects. Always empty
        when the rerun state machine is disabled.
    """
    data_iterators: list[RerunDataIterator]
    if self.mode == RerunMode.DISABLED:
        data_iterators = []
    elif not isinstance(data_iterator, list):
        data_iterators = [data_iterator]
    else:
        data_iterators = data_iterator
    # Drop absent iterators (some callers pass None when a rank has no data).
    data_iterators = [d for d in data_iterators if d is not None]
    for d in data_iterators:
        # Bug fix: the previous form `assert (expr,), msg` asserted a
        # one-element tuple, which is always truthy, so the check never fired.
        assert isinstance(
            d, RerunDataIterator
        ), "data iterator is not wrapped with RerunDataIterator"
    return data_iterators
def _get_validation_call_info(self) -> Call:
"""Internal method to get the context about the caller to validate_result()."""
......@@ -837,8 +831,8 @@ class RerunDataIterator:
replay_data_iterator = RerunDataIterator(data_iterator)
"""
def __init__(self, iterable: Any, make_iterable: bool = True) -> None:
self.iterable: Iterable[Any] = iter(iterable) if make_iterable else iterable
def __init__(self, iterable: Iterable[Any]) -> None:
self.iterable: Iterable[Any] = iterable
self.saved_microbatches: list[Any] = []
self.replaying: bool = False
self.replay_pos: int = 0
......@@ -870,7 +864,7 @@ class RerunDataIterator:
self.replaying = False
self.saved_microbatches = []
def get_checkpoint_state(self) -> SerializableStateType:
def state_dict(self) -> SerializableStateType:
"""Method to capture the state of the iterator as a serializable dict."""
return {
......@@ -879,7 +873,7 @@ class RerunDataIterator:
'replay_pos': self.replay_pos,
}
def set_checkpoint_state(self, state_dict: SerializableStateType) -> None:
def load_state_dict(self, state_dict: SerializableStateType) -> None:
"""Method to restore the state saved as a serializable dict."""
self.saved_microbatches = state_dict['saved_microbatches']
......@@ -1051,7 +1045,7 @@ class RerunErrorInjector:
else:
raise RuntimeError("Should not be here")
def get_checkpoint_state(self) -> SerializableStateType:
def state_dict(self) -> SerializableStateType:
"""Method to capture the state of the error injector as a serializable dict."""
return {
......@@ -1061,7 +1055,7 @@ class RerunErrorInjector:
'injected_error_type': self.injected_error_type,
}
def set_checkpoint_state(self, state_dict: SerializableStateType) -> None:
def load_state_dict(self, state_dict: SerializableStateType) -> None:
"""Method to restore the state saved as a serializable dict."""
self.error_injection_rate = state_dict['error_injection_rate']
......@@ -1107,7 +1101,14 @@ def _set_rerun_state_machine(rerun_state_machine) -> None:
def _safe_get_rank() -> int:
"""Internal function that safely checks and returns the rank of the caller."""
return torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
if torch.distributed.is_initialized():
return torch.distributed.get_rank()
# If torch.distributed is not initialized, try to read environment variables.
try:
return int(os.environ.get("RANK", 0))
except (ValueError, TypeError):
return 0
def _compare_floats(a: torch.Tensor, b: torch.Tensor) -> float:
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -120,6 +120,7 @@ class VocabParallelCrossEntropy:
class _VocabParallelCrossEntropy(torch.autograd.Function):
@torch.compile(mode='max-autotune-no-cudagraphs')
@staticmethod
def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
"""Vocab parallel cross entropy forward function."""
......
File mode changed from 100755 to 100644
......@@ -237,7 +237,7 @@ class VocabParallelEmbedding(torch.nn.Module):
)
if config.perform_initialization:
_initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1)
@torch.compile(mode='max-autotune-no-cudagraphs')
def forward(self, input_):
"""Forward.
......
......@@ -462,13 +462,13 @@ class _AllToAll(torch.autograd.Function):
# -----------------
# Helper functions.
# -----------------
from megatron.core.jit import no_torch_dynamo
def copy_to_tensor_model_parallel_region(input_):
"""Wrapper for autograd function: forward: copy, backward allreduce"""
return _CopyToModelParallelRegion.apply(input_)
@no_torch_dynamo()
def reduce_from_tensor_model_parallel_region(input_):
"""Wrapper for autograd function: forward: all reduce, backward copy"""
return _ReduceFromModelParallelRegion.apply(input_)
......@@ -501,7 +501,7 @@ def gather_from_sequence_parallel_region(
input_, tensor_parallel_output_grad, group, output_split_sizes, use_global_buffer
)
@no_torch_dynamo()
def reduce_scatter_to_sequence_parallel_region(
input_, group=None, input_split_sizes=None, use_global_buffer=False
):
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import gc
import inspect
import logging
import time
from collections import defaultdict
from contextlib import nullcontext
from dataclasses import fields, is_dataclass
from enum import Enum
import torch
from torch.utils._pytree import tree_flatten
from megatron.core import parallel_state
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_te_min_version
try:
from transformer_engine.pytorch import make_graphed_callables
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from transformer_engine.pytorch.distributed import get_all_rng_states, graph_safe_rng_available
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager, fp8_autocast
from transformer_engine.pytorch.graph import restore_fp8_tensors, save_fp8_tensors
from transformer_engine.pytorch.graph import set_capture_end as te_set_capture_end
from transformer_engine.pytorch.graph import set_capture_start as te_set_capture_start
from transformer_engine.pytorch.module.base import TransformerEngineBaseModule
HAVE_TE_GRAPHS = True
except:
HAVE_TE_GRAPHS = False
_IS_GRAPH_CAPTURING = False
class GraphStatus(Enum):
def is_graph_capturing():
    """Query if currently capturing.

    Returns the module-level _IS_GRAPH_CAPTURING flag, which is toggled by
    _set_capture_start() / _set_capture_end().
    """
    return _IS_GRAPH_CAPTURING
def _set_capture_start():
    """Set graph capture has started.

    Bug fix: without the `global` declaration the assignment created a
    function-local variable and the module-level flag was never updated,
    so is_graph_capturing() would always report False.
    """
    global _IS_GRAPH_CAPTURING
    _IS_GRAPH_CAPTURING = True
def _set_capture_end():
    """Set graph capture has ended.

    Bug fix: without the `global` declaration the assignment created a
    function-local variable and the module-level flag was never cleared.
    """
    global _IS_GRAPH_CAPTURING
    _IS_GRAPH_CAPTURING = False
def _check_supported_type(arg):
    """Check if arg is a supported type for cudagraph input/outputs.

    Raises:
        AssertionError: if arg is neither one of the supported scalar/tensor
            types nor a dataclass instance. Kept as an assert (not a raise)
            to preserve the original failure mode for callers.
    """
    _SUPPORTED_TYPES = {torch.Tensor, type(None), bool, int, str, float}
    # Fix typo in the error message: "recieved" -> "received".
    assert type(arg) in _SUPPORTED_TYPES or is_dataclass(
        arg
    ), f"Cudagraphs received an arg of type {type(arg)} which is not supported."
class _CudagraphGlobalRecord:
"""A global datastructure that records of the ordering of all _CudaGraphRunner's
first fwd or bwd passes. 'create_cudagraphs' will use this to create
cudagraphs in execution order, which is required for cudagraphs sharing a mempool."""
"""A global flag that if true, all cudagraph runners
fwd and bwd passes will be performed using their cudagraphed versions."""
cudagraph_created = False
"""A record of fwd and bwd graph creation, populated with 'record_fwd_graph' and
'record_bwd_graph."""
cudagraph_record = []
@classmethod
def record_fwd_graph(cls, runner, args, kwargs):
    """Record a fwd graph to 'cudagraph_record'.

    Stores the runner together with its virtual-pipeline rank and the call
    args/kwargs so the graph can later be created in execution order.
    """
    # vpp rank is None when virtual pipelining is unused; normalize to 0.
    vpp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank()
    vpp_rank = 0 if vpp_rank is None else vpp_rank
    cls.cudagraph_record.append((runner, "fwd", vpp_rank, args, kwargs))
@classmethod
def record_bwd_graph(cls, runner):
    """Record a bwd graph to 'cudagraph_record'.

    Unlike the fwd record, no args/kwargs are stored: the bwd graph is built
    from the runner's captured fwd state.
    """
    # vpp rank is None when virtual pipelining is unused; normalize to 0.
    vpp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank()
    vpp_rank = 0 if vpp_rank is None else vpp_rank
    cls.cudagraph_record.append((runner, "bwd", vpp_rank))
@classmethod
def create_cudagraphs(cls):
"""Iterate through 'cudagraph_record' creating graphs in the order in which
they were recorded."""
# Cudagraphs have already been created, check that no cudagraphed modules ran in eager mode
if cls.cudagraph_created:
assert len(cls.cudagraph_record) == 0, (
"One or more _CudaGraphRunners requested to create a graph after cudagraphs",
"were already created!",
)
return
# No cudagraphs have been created or recorded, so do nothing
if len(cls.cudagraph_record) == 0:
return
# Otherwise, create all the recorded cudagraphs.
logging.getLogger(__name__).info(f"Creating {len(cls.cudagraph_record)} cudagraphs")
has_te_modules = False
for g in cls.cudagraph_record:
base_module = g[0].base_module
has_te_modules = has_te_modules or any(
[isinstance(m, TransformerEngineBaseModule) for m in base_module.modules()]
)
# If graphing only transformer layers with self attention, then apply the following
# transformer layer specific optimizations that reduce memory usage and tensor copies:
# These eventually will become unneccessary with:
# https://github.com/pytorch/pytorch/pull/137318
# 1. Some inputs to TransformerLayer (e.g. rotary_emb) are the same over all layers
# and only need to be set once.
# 2. Because the next layer consumes the previous layer's hidden states, all fwd
# cudagraphs can alternate reusing the same hidden_state input, output buffer.
# Similarly, bwd graphs can alternate the same output, input grad buffers.
optimize_transformer_layer_graph_buffers = all(
[g[0].is_transformer_decoder_layer for g in cls.cudagraph_record]
)
if optimize_transformer_layer_graph_buffers:
prev_fwd_hidden_state_output = None
prev_bwd_hidden_state_inputgrad = None
fwd_mempools = defaultdict(lambda: defaultdict(torch.cuda.graph_pool_handle))
bwd_mempool = torch.cuda.graph_pool_handle()
gc.collect()
torch.cuda.empty_cache()
_set_capture_start()
if has_te_modules:
te_set_capture_start()
for idx, g in enumerate(cls.cudagraph_record):
runner, graph_type, vp_rank = g[0:3]
# All model chunks in the same microbatch use the same mempool. For deep pipelines,
# i.e. when virtual pipelining is used, additonally all bwd passes share the same
# mempool. This reduces memory usage since when there are few graphs per mempool,
# the memory usage increases due to fragmentation. Otherwise when VP=1, it is more
# effective to have fwd and bwd passes share the same mempool.
fwd_mempool = fwd_mempools[vp_rank][runner.position]
vpp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
if vpp_size is None or vpp_size == 1:
bwd_mempool = fwd_mempool
if optimize_transformer_layer_graph_buffers:
if graph_type == 'fwd':
args, kwargs = g[3:]
if not runner.is_first_layer:
kwargs['hidden_states'] = prev_fwd_hidden_state_output
runner.create_fwd_graph(fwd_mempool, args, kwargs, clone_inputs=False)
# The output of TransformerLayer is: (hidden_states, None)
prev_fwd_hidden_state_output, _ = runner.fwd_graph_outputs
else:
runner.create_bwd_graph(
bwd_mempool, static_grad_outputs=prev_bwd_hidden_state_inputgrad
)
# The first input grad TransformerLayer is for 'hidden_states'
if not runner.is_last_layer:
prev_bwd_hidden_state_inputgrad = runner.static_grad_inputs[0]
else:
runner, graph_type = g[0:2]
if graph_type == 'fwd':
args, kwargs = g[3:]
runner.create_fwd_graph(fwd_mempool, args, kwargs)
else:
runner.create_bwd_graph(bwd_mempool)
for g in cls.cudagraph_record:
runner = g[0]
runner.cudagraph_created = True
cls.cudagraph_created = True
cls.cudagraph_record = []
_set_capture_end()
if has_te_modules:
te_set_capture_end()
def create_cudagraphs():
    """Should be called at the end of each schedule function,
    (e.g. forward_backward_pipelining_with_interleaving) in
    `megatron.core.pipeline_parallel.schedules.py`. During the first step, _CudaGraphRunners
    populate _CudagraphGlobalRecord with the global order in which cudagraphs should be created.
    At the end of the first step, this function calls each runner's `create_fwd_graph` and
    `create_bwd_graph` in the order recorded in _CudagraphGlobalRecord, which allows cudagraphs
    to be created in execution order, which allows multiple cudagraphs to share a single
    memory pool, minimizing cudagraph memory usage."""
    # Thin module-level wrapper; all logic lives on the classmethod.
    _CudagraphGlobalRecord.create_cudagraphs()
class _GraphStatus(Enum):
"""An Enum to track if a cudagraph is ready to perform a forward or backward pass."""
FWD_READY = 0
BWD_READY = 1
FWD_READY = 0 # Set immediately after a bwd pass
BWD_READY = 1 # Set immediately after a fwd pass
class GraphStatusFunc(torch.autograd.Function):
"""Inserts a node into the autograd graph that tracks whether an object has an outstanding
backward pass by toggling the value of GraphStatus. This is mainly used to detect when to create
multiple graphs per transformer layer for pipeline parallelism.
We don't use backward module hooks as they change forward output tensors to views, see:
https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_full_backward_hook
"""
class _CudagraphFuncNoop(torch.autograd.Function):
"""Inserts a noop node into the autograd graph, used to record when a bwd graph needs
to be created."""
@staticmethod
def forward(ctx, runner, obj):
"""Occurs immediately before the graph's forward pass.
Marks the graph's backward pass as ready."""
def forward(ctx, runner, inputs):
"""Forward pass, does nothing but registers an autograd node."""
assert (
runner.status == _GraphStatus.FWD_READY
), "Tried calling the fwd cudagraph when the bwd cudagraph was expected to be called next!"
ctx.runner = runner
runner.status = GraphStatus.BWD_READY
return obj
return inputs
@staticmethod
def backward(ctx, grad):
"""Occurs immediately after the graph's backward pass.
Marks the graph's forward pass as ready."""
assert ctx.runner.status == GraphStatus.BWD_READY
ctx.runner.status = GraphStatus.FWD_READY
return None, grad
class TensorDescription:
    """Immutable summary of a tensor's identifying attributes.

    Captured at graph-trace time so that later calls can be checked against
    the tensors the module was graph captured with.
    """

    def __init__(self, tensor):
        # Record shape as a plain tuple so comparisons don't depend on
        # torch.Size semantics.
        self.shape = tuple(tensor.shape)
        self.dtype = tensor.dtype
        self.device = tensor.device

    def matches_tensor(self, tensor):
        """Check if 'tensor' matches the attributes of this TensorDescription."""
        assert torch.is_tensor(tensor)
        # Guard-clause style: fail fast on the first mismatching attribute.
        if tensor.shape != self.shape:
            return False
        if tensor.dtype != self.dtype:
            return False
        return tensor.device == self.device
def backward(ctx, grads):
"""If this is the first bwd pass of this runner, record that a
bwd graph needs to be created."""
runner = ctx.runner
assert (
runner.status == _GraphStatus.BWD_READY
), "Tried calling the bwd cudagraph when the fwd cudagraph was expected to be called next!"
class CudaGraphCallable(torch.nn.Module):
"""Wraps a module to be cudagraphable, records the output of the cudagraph.
Reinserts non-tensor args, kwargs that were previously filtered out by 'get_tensor_args'.
"""
runner.status = _GraphStatus.FWD_READY
if not runner.bwd_graph_recorded:
_CudagraphGlobalRecord.record_bwd_graph(runner)
runner.bwd_graph_recorded = True
return None, grads
class _CudagraphFunc(torch.autograd.Function):
"""Replays the runner's cudagraphs with autograd. Handles copying data into/out of the
cudagraph io and fp8 if used."""
@staticmethod
def forward(ctx, runner, is_first_microbatch, *inputs):
"""Replay the forward graph of the passed runner."""
def __init__(self, module, groundtruth_args, groundtruth_kwargs):
super().__init__()
self.add_module('base_module', module)
# The Pytorch cudagraph API requires only tensor inputs, so we strip
# non-tensor arguments and reinsert them in forward() using these groundtruth attributes.
# We will also check future calls to the cudagraph against these to ensure the cudagraph
# is called with the same inputs as it was captured with.
self.groundtruth_outputs = []
self.groundtruth_args = tuple(
TensorDescription(a) if torch.is_tensor(a) else a for a in groundtruth_args
)
self.groundtruth_kwargs = {
k: TensorDescription(v) if torch.is_tensor(v) else v
for k, v in groundtruth_kwargs.items()
}
def forward(self, *arg_tensors, **kwarg_tensors):
"""Call the forward pass of the cudagraph. Also checks the outputs
of the cudagraph matches what the graph was traced with."""
args = list(self.groundtruth_args)
arg_tensors = list(arg_tensors)
for idx, groundtruth_arg in enumerate(self.groundtruth_args):
if isinstance(groundtruth_arg, TensorDescription):
args[idx] = arg_tensors.pop(0)
kwargs = dict(self.groundtruth_kwargs)
for k, v in self.groundtruth_kwargs.items():
if isinstance(v, TensorDescription):
kwargs[k] = kwarg_tensors[k]
# Use forward() instead of __call__ to avoid triggering hooks
out = self.base_module.forward(*args, **kwargs)
if torch.is_tensor(out):
out = tuple(out)
self.groundtruth_outputs = [TensorDescription(o) if torch.is_tensor(o) else o for o in out]
out = tuple(o for o in out if torch.is_tensor(o))
assert (
len(out) > 0
), """A graphed module returned no tensors in training mode, however the graphed module
must output at least one tensor, so that a corresponding backward node
may be registered in the autograd graph."""
runner.fwd_graph is not None
), "Tried replaying fwd cudagraph before calling 'create_fwd_cudagraph!"
assert (
runner.status == _GraphStatus.FWD_READY
), "Tried calling the fwd cudagraph when the bwd cudagraph was expected to be called next!"
assert len(inputs) == len(
runner.fwd_graph_input_surface
), "Fwd cudagraph received a different number of tensors than what it was graphed with!"
# Copy new data into fwd graph input buffer
for user_input, cudagraph_input in zip(inputs, runner.fwd_graph_input_surface):
if user_input.data_ptr() != cudagraph_input.data_ptr():
cudagraph_input.copy_(user_input)
if len(out) == 1:
return out[0]
ctx.runner = runner
if runner.fp8_enabled:
for m in runner.base_module.modules():
if isinstance(m, TransformerEngineBaseModule):
m.fp8_meta["fp8_group"] = FP8GlobalStateManager.get_fp8_group()
m.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
if is_te_min_version("1.13.0"):
FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(m.fp8_meta)
else:
FP8GlobalStateManager.add_fp8_tensors_to_global_buffer(
m.fp8_meta, fp8_weights=m._get_fp8_params()
)
is_first_fp8_module = FP8GlobalStateManager.is_first_fp8_module()
if is_first_fp8_module:
FP8GlobalStateManager.set_skip_fp8_weight_update_tensor(not is_first_microbatch)
ctx.is_first_fp8_module = is_first_fp8_module
runner.fwd_graph.replay()
# if last transformer layer, return a clone of the cudagraph output buffer, as releasing
# the cudagraph output buffer into the rest of the system may allow it to be corrupted
if runner.is_last_layer:
out = tuple(o.clone().detach() for o in runner.fwd_graph_output_surface)
else:
out = tuple(o.detach() for o in runner.fwd_graph_output_surface)
return out
@staticmethod
def backward(ctx, *grads):
"""Replay the backward graph of the passed runner."""
class CudaGraphRunner(torch.nn.Module):
"""Wraps a single cudagraph and its expected arguments. Checks that
the provided args are the same as what the graph was traced with.
"""
runner = ctx.runner
assert (
runner.bwd_graph is not None
), "Tried replaying bwd cudagraph before calling 'create_bwd_cudagraph'!"
assert (
runner.status == _GraphStatus.BWD_READY
), "Tried calling the bwd cudagraph when the fwd cudagraph was expected to be called next!"
assert len(grads) == len(
runner.static_grad_outputs
), "Bwd cudagraph received a different number of tensors than what it was graphed with!"
# Copy new data into bwd graph input buffer
for user_output_grad, cudagraph_output_grad in zip(grads, runner.static_grad_outputs):
if user_output_grad.data_ptr() != cudagraph_output_grad.data_ptr():
cudagraph_output_grad.copy_(user_output_grad)
runner.bwd_graph.replay()
runner.status = _GraphStatus.FWD_READY
# Update FP8 scale factors if needed
if runner.fp8_enabled and ctx.is_first_fp8_module:
FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False)
# If using gradient_accumulation_fusion, whenever `main_grad` is calculated
# the `grad_added_to_main_grad` attribute is expected to set. However when using
# cudagraphs this doesn't occur so we emulate this behavior here.
for param, grad_added in runner.groundtruth_grad_added_to_main_grad.items():
param.grad_added_to_main_grad = grad_added
if runner.is_first_layer:
output_grads = tuple(
b.clone().detach() if b is not None else b for b in runner.static_grad_inputs
)
else:
output_grads = tuple(
b.detach() if b is not None else b for b in runner.static_grad_inputs
)
return None, None, *output_grads
class _CudaGraphRunner(torch.nn.Module):
"""Represents the execution of a cudagraphed module for a single microbatch.
If there are multiple outstanding microbatches per module, such as for pipeline parallelism,
CudaGraphManager automatically creates multiple _CudaGraphRunners per module."""
def __init__(self, base_module, position):
"""Creates a _CudaGraphRunner, which holds a single pair of fwd and bwd cudagraphs, which
are not created until this runner records its graph creation into
'_CudagraphGlobalRecord', and 'create_cudagraphs()' is called."""
def __init__(self, graphed_module, wrapped_module):
super().__init__()
self.graphed_module = graphed_module
self.groundtruth_args = wrapped_module.groundtruth_args
self.groundtruth_kwargs = wrapped_module.groundtruth_kwargs
self.groundtruth_outputs = wrapped_module.groundtruth_outputs
self.status = GraphStatus.FWD_READY
self.base_module = base_module
self.position = position
self.fwd_graph = None
self.bwd_graph = None
self.fwd_graph_recorded = False
self.bwd_graph_recorded = False
self.cudagraph_created = False
self.status = _GraphStatus.FWD_READY
self.fuse_wgrad_accumulation = False
self.backward_retain_grad = False
self.fp8_enabled = False
self.deallocate_pipeline_outputs = False
if isinstance(self.base_module.config, TransformerConfig):
self.fuse_wgrad_accumulation = self.base_module.config.gradient_accumulation_fusion
self.backward_retain_grad = self.base_module.config.cuda_graph_retain_backward_graph
self.fp8_enabled = self.base_module.config.fp8 is not None
self.deallocate_pipeline_outputs = self.base_module.config.deallocate_pipeline_outputs
if self.fp8_enabled:
self.fp8_recipe = FP8GlobalStateManager.get_fp8_recipe()
FP8GlobalStateManager.set_skip_fp8_weight_update_tensor(False)
from megatron.core.transformer.transformer_layer import TransformerLayer
self.is_first_layer = None
self.is_last_layer = None
self.is_transformer_decoder_layer = False
if isinstance(base_module, TransformerLayer) and isinstance(
base_module.cross_attention, IdentityOp
):
self.is_transformer_decoder_layer = True
total_num_layers = base_module.config.num_layers
pp_size = parallel_state.get_pipeline_model_parallel_world_size()
vpp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
if vpp_size is None:
vpp_size = 1
layers_per_chunk = total_num_layers // vpp_size // pp_size
self.is_first_layer = ((base_module.layer_number - 1) % layers_per_chunk) == 0
self.is_last_layer = (base_module.layer_number % layers_per_chunk) == 0
def get_fp8_context(self):
    """Return a new fp8 context in cudagraph mode.

    Returns a TransformerEngine fp8_autocast context (with _graph=True for
    cudagraph capture) when fp8 is enabled on this runner, otherwise a no-op
    nullcontext so callers can use it unconditionally in a `with` statement.
    """
    if self.fp8_enabled:
        # self.fp8_recipe is captured at runner construction time.
        return fp8_autocast(
            enabled=True, calibrating=False, fp8_recipe=self.fp8_recipe, _graph=True
        )
    return nullcontext()
def create_fwd_graph(self, mempool, args, kwargs, clone_inputs=True):
    """Create a fwd cudagraph for this runner. Should be called inside
    'create_cudagraphs()'.

    Args:
        mempool: CUDA graph memory pool shared across all captured graphs.
        args: positional inputs the graph is captured (and later replayed) with.
        kwargs: keyword inputs the graph is captured with.
        clone_inputs: if True, replace input tensors with zeroed clones so
            warmup iterations cannot pollute real activations or grads.
    """
    # Save grads and other variables that may be affected by graph warmup.
    if self.training and torch.is_grad_enabled():
        save_main_grads = [
            param.main_grad.clone()
            for param in self.base_module.parameters()
            if hasattr(param, 'main_grad')
        ]

    if self.fp8_enabled:
        # TE >= 1.13 takes the recipe object directly; older versions take the
        # amax history length.
        if is_te_min_version("1.13.0"):
            saved_fp8_tensors = save_fp8_tensors([self.base_module], self.fp8_recipe)
        else:
            saved_fp8_tensors = save_fp8_tensors(
                [self.base_module], self.fp8_recipe.amax_history_len
            )

    if clone_inputs:
        args, kwargs = self.replace_tensors(args, kwargs)

    self.fwd_graph_input_args = args
    self.fwd_graph_input_kwargs = kwargs

    # The gradient "surface" is all input tensors plus the module parameters;
    # the bwd graph later differentiates w.r.t. exactly this tuple.
    input_tensors = self.get_tensors(args, kwargs)
    self.fwd_graph_input_surface = input_tensors + tuple(self.base_module.parameters())

    self.fwd_graph = torch.cuda.CUDAGraph()

    # For cases with multiple active RNG states, e.g. TP.
    if graph_safe_rng_available():
        for _, state in get_all_rng_states().items():
            self.fwd_graph.register_generator_state(state)

    # Warmup again as graph capture mode may execute a different codepath.
    for _ in range(2):
        with self.get_fp8_context():
            outputs = self.base_module.forward(
                *self.fwd_graph_input_args, **self.fwd_graph_input_kwargs
            )
        if self.training and torch.is_grad_enabled():
            # Run a throwaway backward (zero grad_outputs) to warm up autograd.
            outputs = self.get_tensors(outputs)
            grad_inputs = torch.autograd.grad(
                outputs=tuple(o for o in outputs if o.requires_grad),
                inputs=tuple(i for i in self.fwd_graph_input_surface if i.requires_grad),
                grad_outputs=tuple(
                    torch.zeros_like(o) if o.requires_grad else None for o in outputs
                ),
                only_inputs=True,
                allow_unused=True,
            )

    with self.get_fp8_context():
        # Capture the forward pass into the graph.
        torch.cuda.synchronize()
        with torch.cuda.graph(self.fwd_graph, pool=mempool):
            outputs = self.base_module.forward(
                *self.fwd_graph_input_args, **self.fwd_graph_input_kwargs
            )

    # Save cudagraph output buffer.
    self.fwd_graph_outputs = outputs
    self.fwd_graph_output_surface = self.get_tensors(outputs)

    if self.training and torch.is_grad_enabled():
        assert (
            len(self.fwd_graph_output_surface) > 0
        ), """Tried graphing a module that returned no tensors in training mode,
            however the graphed module must output at least one tensor,
            so that a corresponding backward node may be registered in the autograd graph."""

    # Restore cached grads saved before warmup.
    for param in self.base_module.parameters():
        if hasattr(param, 'main_grad'):
            saved_grad = save_main_grads.pop(0)
            assert (
                param.main_grad.shape == saved_grad.shape
            ), "Error restoring grads while cudagraphing!"
            param.main_grad.copy_(saved_grad)

    if self.fp8_enabled:
        restore_fp8_tensors([self.base_module], saved_fp8_tensors)
def create_bwd_graph(self, mempool, static_grad_outputs=None):
    """Create a bwd cudagraph for this runner. Should be called inside
    'create_cudagraphs()'.

    Args:
        mempool: CUDA graph memory pool shared across all captured graphs.
        static_grad_outputs: optional pre-allocated grad-output buffer(s). If
            None, zeroed buffers aligned with 'fwd_graph_output_surface' are
            allocated here. A bare tensor is normalized to a 1-tuple.
    """
    self.bwd_graph = torch.cuda.CUDAGraph()
    # For cases with multiple active RNG states, e.g. TP.
    if graph_safe_rng_available():
        for _, state in get_all_rng_states().items():
            self.bwd_graph.register_generator_state(state)

    if static_grad_outputs is None:
        # One buffer per fwd output that requires grad; None keeps positions
        # aligned with 'fwd_graph_output_surface'.
        static_grad_outputs = tuple(
            torch.zeros_like(o) if o.requires_grad else None
            for o in self.fwd_graph_output_surface
        )
    else:
        if torch.is_tensor(static_grad_outputs):
            static_grad_outputs = (static_grad_outputs,)

    torch.cuda.synchronize()
    # Capture the backward pass over the same input surface the fwd graph
    # was captured with.
    with torch.cuda.graph(self.bwd_graph, pool=mempool):
        grad_inputs = torch.autograd.grad(
            outputs=tuple(o for o in self.fwd_graph_output_surface if o.requires_grad),
            inputs=tuple(i for i in self.fwd_graph_input_surface if i.requires_grad),
            grad_outputs=tuple(o for o in static_grad_outputs if o is not None),
            retain_graph=self.backward_retain_grad,
            only_inputs=True,
            allow_unused=True,
        )

    # Constructs a tuple suitable for returning from Graphed.backward:
    # Pads out the actually-needed grads with Nones in gradient slots for inputs
    # that don't require grad. I couldn't think of a one-liner for this pattern.
    static_grad_inputs = []
    grad_idx = 0
    for arg in self.fwd_graph_input_surface:
        if arg.requires_grad:
            static_grad_inputs.append(grad_inputs[grad_idx])
            grad_idx += 1
        else:
            static_grad_inputs.append(None)
    static_grad_inputs = tuple(static_grad_inputs)

    # Snapshot 'grad_added_to_main_grad' flags as they were during capture —
    # presumably consumed at replay time; only relevant with fused wgrad
    # accumulation. NOTE(review): confirm against the replay path.
    self.groundtruth_grad_added_to_main_grad = {}
    if self.fuse_wgrad_accumulation:
        for param in self.base_module.parameters():
            if hasattr(param, "grad_added_to_main_grad"):
                self.groundtruth_grad_added_to_main_grad[param] = param.grad_added_to_main_grad

    self.static_grad_outputs = static_grad_outputs
    self.static_grad_inputs = static_grad_inputs
def record_graph_capture(self, args, kwargs):
    """If this is the first time this runner has encountered a fwd pass, a cudagraph needs to
    be created. Record this to _CudagraphGlobalRecord, which will be mapped to a cudagraph
    when 'create_cudagraphs()' is called. Subsequent fwd passes will replay the cudagraph.
    """
    if not self.fwd_graph_recorded:
        _CudagraphGlobalRecord.record_fwd_graph(self, args, kwargs)
        self.fwd_graph_recorded = True

    # Run the forward pass as normal in eager mode.
    out = super(MegatronModule, self.base_module).__call__(*args, **kwargs)

    # Register a noop autograd node that toggles `self.graph_status` in the bwd pass, which
    # tracks when the runner completes its bwd pass.
    # If it's the first bwd encountered by this runner, record it to _CudagraphGlobalRecord.
    out = tuple(_CudagraphFuncNoop.apply(self, o) if torch.is_tensor(o) else o for o in out)

    # Clone outputs when pipeline-output deallocation is on — presumably so the
    # deallocator cannot free buffers still referenced here; TODO confirm intent.
    if self.deallocate_pipeline_outputs:
        out = tuple(o.clone() if torch.is_tensor(o) else o for o in out)
    return out
def replay_graph_capture(self, is_first_microbatch, args, kwargs):
    """Replay the fwd cuda graph with autograd.

    Args:
        is_first_microbatch: forwarded to the graphed function (e.g. for fp8
            weight caching).
        args, kwargs: must match the shapes/types the graph was captured with.
    """
    # Fixed error-message typo ("what if was" -> "what it was").
    assert self.matches_graph_inputs(
        args, kwargs
    ), "Tried replaying a cudagraph with different arguments than what it was created with!"

    inp_tensors = self.get_tensors(args, kwargs)
    # Pass the same parameter surface the graph was captured with; this must
    # line up element-for-element with 'fwd_graph_input_surface' built in
    # 'create_fwd_graph' from self.base_module.parameters().
    func_args = inp_tensors + tuple(self.base_module.parameters())
    out = _CudagraphFunc.apply(self, is_first_microbatch, *func_args)

    # Re-insert non-tensor outputs in their original positions.
    out = list(out)
    return tuple(out.pop(0) if torch.is_tensor(o) else o for o in self.fwd_graph_outputs)
def forward(self, is_first_microbatch, args, kwargs):
    """Forward pass of the runner. If cudagraphs have not been created, record the
    execution of this fwd and bwd pass for graph capture. Else, replay the cudagraphs."""
    if self.cudagraph_created:
        result = self.replay_graph_capture(is_first_microbatch, args, kwargs)
    else:
        result = self.record_graph_capture(args, kwargs)

    # If forward only, next replay should be a forward pass as well.
    needs_backward = self.training and torch.is_grad_enabled()
    self.status = _GraphStatus.BWD_READY if needs_backward else _GraphStatus.FWD_READY
    return result
def matches_graph_inputs(self, args, kwargs):
    """Check that the passed args, kwargs match the args, kwargs
    the graph was created with.

    Returns True only when arg count, kwarg keys, types, and (for tensors)
    shape/dtype/device all agree with the captured inputs.
    """

    def check(val, ref):
        _check_supported_type(val)
        _check_supported_type(ref)

        # check that the args are the same type
        if not ((type(val) == type(ref)) or (is_dataclass(val) and is_dataclass(ref))):
            return False

        # if tensors, check they have the same shape, device and type
        # differing memory layout is allowed as 'copy_' is able to handle different layouts
        if isinstance(ref, torch.Tensor):
            return (
                val.shape == ref.shape and val.dtype == ref.dtype and val.device == ref.device
            )
        # if dataclass, check args in fields are the same
        elif is_dataclass(ref):
            for field in fields(ref):
                if not check(getattr(val, field.name), getattr(ref, field.name)):
                    return False
            return True
        else:
            return ref == val

    if len(args) != len(self.fwd_graph_input_args):
        return False
    # Compare each positional arg to the captured one (was 'check(args, ...)',
    # which compared the whole tuple every iteration).
    for arg, graph_arg in zip(args, self.fwd_graph_input_args):
        if not check(arg, graph_arg):
            return False

    if kwargs.keys() != self.fwd_graph_input_kwargs.keys():
        return False
    for k, v in self.fwd_graph_input_kwargs.items():
        if not check(kwargs[k], v):
            return False
    return True
def forward(self, args, kwargs, is_first_microbatch=None):
    """Call the forward pass of the cuda graph.

    Wraps tensor inputs in GraphStatusFunc to track fwd/bwd status, runs the
    graphed module, then validates outputs against the recorded ground truth.
    (Restores the tail — mismatch assertion else-branch and return — that was
    truncated from this block.)
    """
    if self.training and torch.is_grad_enabled():
        args = list(args)
        for pos in range(len(args)):
            if torch.is_tensor(args[pos]):
                args[pos] = GraphStatusFunc.apply(self, args[pos])
        for k, v in kwargs.items():
            if torch.is_tensor(v):
                kwargs[k] = GraphStatusFunc.apply(self, v)

    ret_tensors = self.graphed_module(is_first_microbatch=is_first_microbatch, *args, **kwargs)
    ret_tensors = [ret_tensors] if torch.is_tensor(ret_tensors) else list(ret_tensors)
    # Re-interleave graphed tensor outputs with recorded non-tensor outputs.
    out = tuple(
        ret_tensors.pop(0) if isinstance(o, TensorDescription) else o
        for o in self.groundtruth_outputs
    )

    # Check that the static graph matches what was recorded during graph capture
    assert len(out) == len(self.groundtruth_outputs)
    for idx, o in enumerate(self.groundtruth_outputs):
        if isinstance(o, TensorDescription):
            assert o.matches_tensor(out[idx])
        else:
            assert o == out[idx]
    if len(out) == 1:
        return out[0]
    return out
def replace_tensors(self, args, kwargs=None):
    """Replace all tensors inside args, kwargs with zeroed copies.

    Returns the replaced args list, or an (args, kwargs) pair when kwargs is
    given. Tensors nested in dataclass fields are replaced in place.
    """

    def clone_tensor(ten):
        # Zeroed clone so graph warmup cannot leak real activations/grads.
        cloned = torch.zeros_like(ten)
        cloned.requires_grad = ten.requires_grad
        return cloned

    def process_arg(arg):
        _check_supported_type(arg)
        if torch.is_tensor(arg):
            return clone_tensor(arg)
        elif is_dataclass(arg):
            for field in fields(arg):
                attr = getattr(arg, field.name)
                if torch.is_tensor(attr):
                    setattr(arg, field.name, clone_tensor(attr))
        # Non-tensor args (and mutated dataclasses) pass through unchanged.
        return arg

    args_replaced = [process_arg(arg) for arg in args]
    if kwargs is None:
        # Fixed: previously returned the loop variable 'arg' (unbound for empty
        # args) instead of the replaced list.
        return args_replaced

    kwargs_replaced = {k: process_arg(v) for k, v in kwargs.items()}
    return args_replaced, kwargs_replaced
def get_tensors(self, args, kwargs=None):
    """Filter and flatten all tensors from args and kwargs.

    Tensors nested in dataclass fields are included; all other values are
    dropped. Returns a flat tuple of tensors. (Removes foreign lines spliced
    into the non-tensor branch that referenced undefined names.)
    """

    def extract_tensors(arg):
        _check_supported_type(arg)
        if torch.is_tensor(arg):
            return [arg]
        elif is_dataclass(arg):
            tens = []
            for field in fields(arg):
                attr = getattr(arg, field.name)
                if torch.is_tensor(attr):
                    tens.append(attr)
            return tens
        else:
            return []

    tens = []
    args, _ = tree_flatten(args)
    for a in args:
        tens.extend(extract_tensors(a))
    if kwargs is not None:
        kwargs, _ = tree_flatten(kwargs)
        for k in kwargs:
            tens.extend(extract_tensors(k))
    return tuple(tens)
class CudaGraphManager(torch.nn.Module):
......@@ -199,14 +704,29 @@ class CudaGraphManager(torch.nn.Module):
def __init__(self):
    """Set up runner bookkeeping and move work off the default CUDA stream."""
    super().__init__()
    self.cudagraph_runners = []
    # Removed conflicting duplicate assignment (True then False); keep False.
    # NOTE(review): nothing visible here ever sets this back to True — confirm
    # external code toggles it per training step.
    self.is_first_microbatch = False
    assert HAVE_TE_GRAPHS, "CudaGraphManager currently requires TransformerEngine"

    # Cudagraph stream capture requires no operations on the default stream prior to the
    # capture, so change to a side stream.
    self.stream = torch.cuda.current_stream()
    torch.cuda.set_stream(torch.cuda.Stream())
def call_ddp_preforward_hook(self, module):
    """Call any DDP pre-forward hooks which are used to launch async data parallel
    param gather. Any other pre-forward hooks are not allowed.

    Args:
        module: the torch module whose registered pre-forward hooks are run.
    """
    # Imported locally — presumably to avoid a circular import; confirm.
    from megatron.core.distributed import distributed_data_parallel

    if module._forward_pre_hooks:
        for _, hook in module._forward_pre_hooks.items():
            # Reject hooks not defined in Mcore's DDP module: arbitrary hooks
            # could run work outside the captured graph.
            assert (
                inspect.getmodule(hook) == distributed_data_parallel
            ), "Tried to cudagraph a module with user registered pre-forward hooks, \
                which is not allowed."
            # Only hooks from Mcore DDP, which take no args, should be called at this point.
            hook(module)
def __call__(self, megatron_module, args, kwargs):
    """Calls the forward pass of the cudagraphed module.

    Finds an idle runner for this microbatch, creating one if needed while
    training; in inference mode with no existing runner, falls back to eager
    execution. (Reconstructed new-revision body: removes the interleaved
    old-revision methods 'get_tensor_args' and 'create_cudagraph_module' and
    the old call tail left behind by the diff.)

    Args:
        megatron_module: the megatron module whose forward pass is graphed.
        args: positional args to pass to the module's forward.
        kwargs: keyword args to pass to the module's forward.
    """
    # Find a runner that has completed its previous fwd/bwd cycle.
    runner = None
    for _runner in self.cudagraph_runners:
        if _runner.status == _GraphStatus.FWD_READY:
            runner = _runner
            break

    if runner is None:
        if self.training and torch.is_grad_enabled():
            runner = _CudaGraphRunner(megatron_module, len(self.cudagraph_runners))
            self.cudagraph_runners.append(runner)
            logging.getLogger(__name__).info(
                f"Creating cudagraph; now have {len(self.cudagraph_runners)}"
            )
        else:
            # No cudagraphs were found in inference mode, so fallback to eager since
            # tensor.requires_grad is needed to correctly trace the backward graph.
            return super(MegatronModule, megatron_module).__call__(*args, **kwargs)

    return runner(self.is_first_microbatch, args, kwargs)
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment