[DCU] Fix workspace (WS) leak when UB is initialized and destroyed more than once

8fda607c · yuguo · 9da3621b · 8fda607c
Commit 8fda607c authored Apr 29, 2025 by yuguo
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

transformer_engine/pytorch/module/base.py transformer_engine/pytorch/module/base.py +8 -2

No files found.
--- a/transformer_engine/pytorch/module/base.py
+++ b/transformer_engine/pytorch/module/base.py
@@ -225,9 +225,15 @@ def initialize_ub(
                flush=True,
            )

-    # Increase the workspace by the number of maximum concurrent streams
+    # Allocate cuBLAS workspace with expanded size for chunking in overlapping GEMM calls
    global _cublas_workspace
-    _cublas_workspace = get_workspace().repeat(_NUM_MAX_UB_STREAMS)
+    if _cublas_workspace is None:
+        _cublas_workspace = get_workspace().repeat(_NUM_MAX_UB_STREAMS)
+    elif _cublas_workspace.numel() != get_cublas_workspace_size_bytes() * _NUM_MAX_UB_STREAMS:
+        # This ensures we don't do `.repeat()` on an already expanded workspace
+        _cublas_workspace = torch.empty(
+            get_cublas_workspace_size_bytes(), dtype=torch.uint8, device="cuda"
+        ).repeat(_NUM_MAX_UB_STREAMS)

    # Default buffer precision: AllGather buffers use fp8 when using fp8 recipe
    layers_all_gather_overlap = [