Unverified commit 08b49976, authored by Tian Zheng, committed by GitHub

[Paddle] Fix device memory leak (#1029)



* i
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>

* .
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>

---------
Signed-off-by: Tian Zheng (Engrg-Hardware 1) <tizheng@nvidia.com>
parent 9edcaf0e
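The leak mechanism: before this change, `TransformerEngineBaseLayer.__init__` unconditionally created a `current_step_id` tensor and registered a pipeline-parallel forward-begin hook whose closure captures `self`. The hook registry outlives the layer, so a deleted layer, and with it the device memory of its parameters, could never be garbage-collected. Below is a minimal sketch of the failure mode; the module-level `_PP_FWD_BEGIN_HOOKS` list is a hypothetical stand-in for Paddle's hook registry, not the actual implementation.

```python
import gc
import weakref

# Hypothetical stand-in for the pipeline-parallel hook registry:
# a module-level list that lives for the whole process.
_PP_FWD_BEGIN_HOOKS = []

def register_pp_fwd_begin_hook(fn):
    _PP_FWD_BEGIN_HOOKS.append(fn)

class LeakyLayer:
    def __init__(self):
        # Stand-in for device parameters held by the layer.
        self.params = bytearray(8 * 1024 * 1024)

        def current_step_id_callback(step_id=None, **kwargs):
            self.current_step_id = step_id  # closure captures `self`

        # The process-lifetime registry now holds a strong reference
        # to `self` through the closure.
        register_pp_fwd_begin_hook(current_step_id_callback)

layer = LeakyLayer()
ref = weakref.ref(layer)
del layer
gc.collect()
assert ref() is not None  # the layer survives deletion: memory leaks
```

The fix moves both the tensor allocation and the hook registration into the forward path, behind `is_cudagraph_enabled()` and `is_pp_enabled()` guards, so layers that never take that path stay collectable. The NOTE in the diff acknowledges that layers which do register the hook still pin their parameters.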
```diff
@@ -1355,14 +1355,13 @@ void amax_and_scale_update_inplace(paddle::Tensor &amax_history,  // NOLINT
                                      static_cast<NVTEDType>(fp8_dtype), margin, amax_history.stream());
 }
 
-void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history,  // NOLINT
-                                          paddle::Tensor &scale,         // NOLINT
-                                          paddle::Tensor &scale_inv,     // NOLINT
-                                          const paddle::Tensor &non_weight_mask,
-                                          const paddle::Tensor &current_step_id_tensor,
-                                          bool update_weight_scale_inv, bool fwd_update,
-                                          float fp8_max, float margin,
-                                          const std::string &amax_compute) {
+void amax_and_scale_update_inplace_legacy(
+    paddle::Tensor &amax_history,  // NOLINT
+    paddle::Tensor &scale,         // NOLINT
+    paddle::Tensor &scale_inv,     // NOLINT
+    const paddle::Tensor &non_weight_mask,
+    const paddle::optional<paddle::Tensor> &current_step_id_tensor, bool update_weight_scale_inv,
+    bool fwd_update, float fp8_max, float margin, const std::string &amax_compute) {
 #if PADDLE_VERSION > 261
   NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent");
@@ -1380,8 +1379,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history,  // NOLINT
   auto amax_numel = amax.numel();
   size_t num_blocks = (amax_history_numel + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-  const int *current_step_id_ptr = nullptr;
-  if (fwd_update) current_step_id_ptr = current_step_id_tensor.data<int>();
+  const int *current_step_id_ptr = GetOptionalDataPtr<int>(current_step_id_tensor);
   auto parameterSetter = [current_step_id_ptr,
                           fwd_update](phi::backends::gpu::CUDAKernelParams &params) {
     if (fwd_update) {
@@ -1758,7 +1756,8 @@ PD_BUILD_OP(te_scaled_upper_triang_masked_softmax_backward)
         PD_KERNEL(transformer_engine::paddle_ext::te_scaled_upper_triang_masked_softmax_backward));
 
 PD_BUILD_OP(amax_and_scale_update_inplace_legacy)
-    .Inputs({"_amax_history", "_scale", "_scale_inv", "non_weight_mask", "current_step_id_tensor"})
+    .Inputs({"_amax_history", "_scale", "_scale_inv", "non_weight_mask",
+             paddle::Optional("current_step_id_tensor")})
    .Outputs({"amax_history", "scale", "scale_inv"})
    .SetInplaceMap({{"_amax_history", "amax_history"},
                    {"_scale", "scale"},
```
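On the C++ side, `paddle::optional` plus the `GetOptionalDataPtr<int>` helper let the op run without any step-id tensor: an empty optional simply yields a null pointer. Here is a pure-Python model of that contract; every name below is illustrative, not the repo's actual binding.

```python
from typing import Optional, Sequence

def get_optional_data_ptr(tensor: Optional[Sequence[int]]) -> Optional[Sequence[int]]:
    """Model of GetOptionalDataPtr<int>: empty optional in, null pointer out."""
    return tensor if tensor is not None else None

def amax_and_scale_update_inplace_legacy(amax_history,
                                         current_step_id_tensor=None,
                                         fwd_update=False):
    step_ptr = get_optional_data_ptr(current_step_id_tensor)
    if fwd_update:
        # Only the forward-update path dereferences the step id.
        assert step_ptr is not None, "fwd_update needs a step-id tensor"
    # ... kernel launch would consume step_ptr here ...

# Eager path: no persistent step-id tensor has to be kept alive.
amax_and_scale_update_inplace_legacy(amax_history=[0.0], fwd_update=False)
# CUDA-graph path: a persistent CPU tensor supplies the step id.
amax_and_scale_update_inplace_legacy(amax_history=[0.0],
                                     current_step_id_tensor=[1], fwd_update=True)
```

Previously the op's signature required a real tensor, which is why `__init__` had to allocate one for every layer regardless of whether CUDA graphs were in use, as the Python diff below removes.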
```diff
@@ -84,15 +84,7 @@ class TransformerEngineBaseLayer(paddle.nn.Layer, ABC):
         self.fp8_weights = []
         self.fp8_weight_cache = {}
         self.registered_pp_start_callback = False
-        self.current_step_id = paddle.to_tensor([1], dtype=paddle.int32, place=paddle.CPUPlace())
-
-        def current_step_id_callback(step_id=None, **kwargs):  # pylint: disable=unused-argument
-            self.current_step_id.copy_(
-                paddle.to_tensor([step_id], dtype=paddle.int32, place=paddle.CPUPlace()), True
-            )
-
-        register_pp_fwd_begin_hook(current_step_id_callback)
+        self.current_step_id = None
 
     def set_activation_dtype(self, inp: paddle.Tensor) -> None:
         """Get activation data type for AMP."""
@@ -301,6 +293,27 @@ class TransformerEngineBaseLayer(paddle.nn.Layer, ABC):
         if self.fp8_meta.get("update_amax_and_scale_fwd", False):
             global_fp8_fwd_buffer = get_global_fp8_state().get_fp8_fwd_buffer()
             global_fp8_fwd_buffer.wait()
+
+            # Register PP forward begin hook when CUDAGraph is enabled.
+            # NOTE(tizheng): register_pp_fwd_begin_hook prevents layer parameters from being freed
+            # when the layer object is deleted. Need to find a better way.
+            if get_global_fp8_state().is_cudagraph_enabled() and self.current_step_id is None:
+                self.current_step_id = paddle.to_tensor(
+                    [1], dtype=paddle.int32, place=paddle.CPUPlace()
+                )
+
+                def current_step_id_callback(
+                    step_id=None, **kwargs
+                ):  # pylint: disable=unused-argument
+                    self.current_step_id.copy_(
+                        paddle.to_tensor(
+                            [step_id], dtype=paddle.int32, place=paddle.CPUPlace()
+                        ),
+                        True,
+                    )
+
+                if is_pp_enabled():
+                    register_pp_fwd_begin_hook(current_step_id_callback)
+
             if self.fp8_meta["recipe"].reduce_amax:
                 global_fp8_fwd_buffer.copy_amax_from_buffer(self.fp8_meta)
                 amax_and_scale_update(
```
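A quick way to see the intent of the fix is a collectability check, sketched here as a hypothetical test; it is not part of this PR, and `Linear` stands in for any TransformerEngine Paddle layer.

```python
import gc
import weakref

from transformer_engine.paddle import Linear

def test_layer_is_collectable():
    layer = Linear(1024, 1024)
    ref = weakref.ref(layer)
    del layer
    gc.collect()
    # Before this change, the hook registered in __init__ kept a closure
    # over every layer, so ref() never died and each re-created layer
    # leaked its parameters' device memory. With registration deferred
    # (and skipped entirely outside the CUDA-graph + PP path), the layer
    # is garbage-collected normally.
    assert ref() is None
```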