Unverified Commit 10a1018c authored by Zhaodong Bing's avatar Zhaodong Bing Committed by GitHub
Browse files

[ROCm] fix sleep mode not releasing GPU memory problem on ROCm (#37533)


Signed-off-by: default avatarbingzhaodong <aaab8b@gmail.com>
Co-authored-by: default avatarTJian <tunjian.tan@embeddedllm.com>
parent aec2dc6c
......@@ -232,6 +232,28 @@ void unmap_and_release(unsigned long long device, ssize_t size,
}
}
// ROCm workaround: hipMemRelease does not return physical VRAM to the
// free pool while the virtual-address reservation is still held.
// Cycling cuMemAddressFree → cuMemAddressReserve (at the same address)
// forces the driver to actually release the physical pages while keeping
// the same VA available for a later create_and_map.
if (first_error == no_error) {
first_error = cuMemAddressFree(d_mem, size);
if (first_error == no_error) {
CUdeviceptr d_mem_new = 0;
first_error = cuMemAddressReserve(&d_mem_new, size, 0, d_mem, 0);
if (first_error == no_error && d_mem_new != d_mem) {
cuMemAddressFree(d_mem_new, size);
snprintf(error_msg, sizeof(error_msg),
"ROCm: VA re-reserve got %p instead of %p", (void*)d_mem_new,
(void*)d_mem);
error_code = CUresult(1);
std::cerr << error_msg << std::endl;
return;
}
}
}
if (first_error != no_error) {
CUDA_CHECK(first_error);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment