[PyTorch] Add pool argument to make_graphed_callable (#1218)

Add pool argument to make_graphed_callable

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

[PyTorch] Add pool argument to make_graphed_callable (#1218)
Add pool argument to make_graphed_callable

Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
728c558b · Kirthi Shankar Sivamani · GitHub · 7b152a83 · 728c558b
Unverified Commit 728c558b authored Sep 30, 2024 by Kirthi Shankar Sivamani Committed by GitHub Sep 30, 2024
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 1 deletion

transformer_engine/pytorch/graph.py transformer_engine/pytorch/graph.py +7 -1

No files found.
--- a/transformer_engine/pytorch/graph.py
+++ b/transformer_engine/pytorch/graph.py
@@ -61,6 +61,7 @@ def _make_graphed_callables(
    fp8_weight_caching: bool = False,
    sample_kwargs: Optional[SingleOrTuple[Dict[str, Any]]] = None,
    _order: Optional[List[int]] = None,
+    pool: Optional[Tuple[int, ...]] = None,
 ) -> SingleOrTuple[Callable]:
    """
    Helper method for `make_graphed_callables`
@@ -193,7 +194,7 @@ def _make_graphed_callables(
                fwd_graph.register_generator_state(state)
                bwd_graph.register_generator_state(state)

-    mempool = graph_pool_handle()
+    mempool = graph_pool_handle() if pool is None else pool

    # Warmup
    # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work
@@ -518,6 +519,7 @@ def make_graphed_callables(
    fp8_recipe: Optional[DelayedScaling] = None,
    fp8_weight_caching: bool = False,
    _order: Optional[List[int]] = None,
+    pool: Optional[Tuple[int, ...]] = None,
 ) -> Union[Callable, Tuple[Callable, ...]]:
    """
    Make CUDA graph version of Transformer Engine modules
@@ -541,6 +543,9 @@ def make_graphed_callables(
                        and outputs are disconnected in compute graph.
    sample_kwargs: (tuple of) dict, optional
                   Keyword arguments to callable(s)
+    pool: tuple of int, default = `None`, optional
+          A handle returned by `torch.cuda.graph_pool_handle`, hinting that the captured
+          graphs may share memory with the indicated pool. A new pool is used if `None`.

    FP8-related parameters
    ----------------------
@@ -617,6 +622,7 @@ def make_graphed_callables(
        fp8_weight_caching=fp8_weight_caching,
        sample_kwargs=sample_kwargs,
        _order=_order,
+        pool=pool,
    )

    # Ensures warmup does not affect numerics for ops such as dropout.