Unverified commit 08b49976, authored by Tian Zheng, committed by GitHub

[Paddle] Fix device memory leak (#1029)



* i
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>

* .
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>

---------
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>
parent 9edcaf0e
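The leak mechanism: before this change, `TransformerEngineBaseLayer.__init__` unconditionally created a `current_step_id` tensor and registered a pipeline-parallel forward-begin hook whose closure captures `self`. The hook registry outlives the layer, so a deleted layer, and with it the device memory of its parameters, could never be garbage-collected. Below is a minimal sketch of the failure mode; the module-level `_PP_FWD_BEGIN_HOOKS` list is a hypothetical stand-in for Paddle's hook registry, not the actual implementation.

```python
import gc
import weakref

# Hypothetical stand-in for the pipeline-parallel hook registry:
# a module-level list that lives for the whole process.
_PP_FWD_BEGIN_HOOKS = []

def register_pp_fwd_begin_hook(fn):
    _PP_FWD_BEGIN_HOOKS.append(fn)

class LeakyLayer:
    def __init__(self):
        # Stand-in for device parameters held by the layer.
        self.params = bytearray(8 * 1024 * 1024)

        def current_step_id_callback(step_id=None, **kwargs):
            self.current_step_id = step_id  # closure captures `self`

        # The process-lifetime registry now holds a strong reference
        # to `self` through the closure.
        register_pp_fwd_begin_hook(current_step_id_callback)

layer = LeakyLayer()
ref = weakref.ref(layer)
del layer
gc.collect()
assert ref() is not None  # the layer survives deletion: memory leaks
```

The fix moves both the tensor allocation and the hook registration into the forward path, behind `is_cudagraph_enabled()` and `is_pp_enabled()` guards, so layers that never take that path stay collectable. The NOTE in the diff acknowledges that layers which do register the hook still pin their parameters.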
```diff
@@ -1355,14 +1355,13 @@ void amax_and_scale_update_inplace(paddle::Tensor &amax_history,  // NOLINT
                                      static_cast<NVTEDType>(fp8_dtype), margin, amax_history.stream());
 }
 
-void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history,  // NOLINT
-                                          paddle::Tensor &scale,         // NOLINT
-                                          paddle::Tensor &scale_inv,     // NOLINT
-                                          const paddle::Tensor &non_weight_mask,
-                                          const paddle::Tensor &current_step_id_tensor,
-                                          bool update_weight_scale_inv, bool fwd_update,
-                                          float fp8_max, float margin,
-                                          const std::string &amax_compute) {
+void amax_and_scale_update_inplace_legacy(
+    paddle::Tensor &amax_history,  // NOLINT
+    paddle::Tensor &scale,         // NOLINT
+    paddle::Tensor &scale_inv,     // NOLINT
+    const paddle::Tensor &non_weight_mask,
+    const paddle::optional<paddle::Tensor> &current_step_id_tensor, bool update_weight_scale_inv,
+    bool fwd_update, float fp8_max, float margin, const std::string &amax_compute) {
 #if PADDLE_VERSION > 261
   NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent");
@@ -1380,8 +1379,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history,  // NOLINT
   auto amax_numel = amax.numel();
   size_t num_blocks = (amax_history_numel + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-  const int *current_step_id_ptr = nullptr;
-  if (fwd_update) current_step_id_ptr = current_step_id_tensor.data<int>();
+  const int *current_step_id_ptr = GetOptionalDataPtr<int>(current_step_id_tensor);
   auto parameterSetter = [current_step_id_ptr,
                           fwd_update](phi::backends::gpu::CUDAKernelParams &params) {
     if (fwd_update) {
@@ -1758,7 +1756,8 @@ PD_BUILD_OP(te_scaled_upper_triang_masked_softmax_backward)
         PD_KERNEL(transformer_engine::paddle_ext::te_scaled_upper_triang_masked_softmax_backward));
 
 PD_BUILD_OP(amax_and_scale_update_inplace_legacy)
-    .Inputs({"_amax_history", "_scale", "_scale_inv", "non_weight_mask", "current_step_id_tensor"})
+    .Inputs({"_amax_history", "_scale", "_scale_inv", "non_weight_mask",
+             paddle::Optional("current_step_id_tensor")})
    .Outputs({"amax_history", "scale", "scale_inv"})
    .SetInplaceMap({{"_amax_history", "amax_history"},
                    {"_scale", "scale"},
```
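On the C++ side, `paddle::optional` plus the `GetOptionalDataPtr<int>` helper let the op run without any step-id tensor: an empty optional simply yields a null pointer. Here is a pure-Python model of that contract; every name below is illustrative, not the repo's actual binding.

```python
from typing import Optional, Sequence

def get_optional_data_ptr(tensor: Optional[Sequence[int]]) -> Optional[Sequence[int]]:
    """Model of GetOptionalDataPtr<int>: empty optional in, null pointer out."""
    return tensor if tensor is not None else None

def amax_and_scale_update_inplace_legacy(amax_history,
                                         current_step_id_tensor=None,
                                         fwd_update=False):
    step_ptr = get_optional_data_ptr(current_step_id_tensor)
    if fwd_update:
        # Only the forward-update path dereferences the step id.
        assert step_ptr is not None, "fwd_update needs a step-id tensor"
    # ... kernel launch would consume step_ptr here ...

# Eager path: no persistent step-id tensor has to be kept alive.
amax_and_scale_update_inplace_legacy(amax_history=[0.0], fwd_update=False)
# CUDA-graph path: a persistent CPU tensor supplies the step id.
amax_and_scale_update_inplace_legacy(amax_history=[0.0],
                                     current_step_id_tensor=[1], fwd_update=True)
```

Previously the op's signature required a real tensor, which is why `__init__` had to allocate one for every layer regardless of whether CUDA graphs were in use, as the Python diff below removes.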
```diff
@@ -84,15 +84,7 @@ class TransformerEngineBaseLayer(paddle.nn.Layer, ABC):
         self.fp8_weights = []
         self.fp8_weight_cache = {}
         self.registered_pp_start_callback = False
-        self.current_step_id = paddle.to_tensor([1], dtype=paddle.int32, place=paddle.CPUPlace())
-
-        def current_step_id_callback(step_id=None, **kwargs):  # pylint: disable=unused-argument
-            self.current_step_id.copy_(
-                paddle.to_tensor([step_id], dtype=paddle.int32, place=paddle.CPUPlace()), True
-            )
-
-        register_pp_fwd_begin_hook(current_step_id_callback)
+        self.current_step_id = None
 
     def set_activation_dtype(self, inp: paddle.Tensor) -> None:
         """Get activation data type for AMP."""
@@ -301,6 +293,27 @@ class TransformerEngineBaseLayer(paddle.nn.Layer, ABC):
         if self.fp8_meta.get("update_amax_and_scale_fwd", False):
             global_fp8_fwd_buffer = get_global_fp8_state().get_fp8_fwd_buffer()
             global_fp8_fwd_buffer.wait()
+
+            # Register PP forward begin hook when CUDAGraph is enabled.
+            # NOTE(tizheng): register_pp_fwd_begin_hook prevents layer parameters from being freed
+            # when the layer object is deleted. Need to find a better way.
+            if get_global_fp8_state().is_cudagraph_enabled() and self.current_step_id is None:
+                self.current_step_id = paddle.to_tensor(
+                    [1], dtype=paddle.int32, place=paddle.CPUPlace()
+                )
+
+                def current_step_id_callback(
+                    step_id=None, **kwargs
+                ):  # pylint: disable=unused-argument
+                    self.current_step_id.copy_(
+                        paddle.to_tensor(
+                            [step_id], dtype=paddle.int32, place=paddle.CPUPlace()
+                        ),
+                        True,
+                    )
+
+                if is_pp_enabled():
+                    register_pp_fwd_begin_hook(current_step_id_callback)
+
             if self.fp8_meta["recipe"].reduce_amax:
                 global_fp8_fwd_buffer.copy_amax_from_buffer(self.fp8_meta)
                 amax_and_scale_update(
```
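A quick way to see the intent of the fix is a collectability check, sketched here as a hypothetical test; it is not part of this PR, and `Linear` stands in for any TransformerEngine Paddle layer.

```python
import gc
import weakref

from transformer_engine.paddle import Linear

def test_layer_is_collectable():
    layer = Linear(1024, 1024)
    ref = weakref.ref(layer)
    del layer
    gc.collect()
    # Before this change, the hook registered in __init__ kept a closure
    # over every layer, so ref() never died and each re-created layer
    # leaked its parameters' device memory. With registration deferred
    # (and skipped entirely outside the CUDA-graph + PP path), the layer
    # is garbage-collected normally.
    assert ref() is None
```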