Unverified commit 79eb9369, authored by Peter Pan, committed by GitHub
Browse files

fix CUDAGraph memory being counted twice (#37426)


Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
parent e80cfe57
......@@ -417,9 +417,7 @@ class Worker(WorkerBase):
)
self.non_torch_memory = profile_result.non_torch_increase
self.peak_activation_memory = (
profile_result.torch_peak_increase + cudagraph_memory_estimate_applied
)
self.peak_activation_memory = profile_result.torch_peak_increase
self.cudagraph_memory_estimate = cudagraph_memory_estimate
free_gpu_memory = profile_result.after_profile.free_memory
......@@ -638,6 +636,7 @@ class Worker(WorkerBase):
# slightly underestimate the memory consumption.
# So leave a small buffer (=150MiB) to avoid OOM.
redundancy_buffer_memory = 150 * (1 << 20)
non_kv_cache_memory = (
self.model_runner.model_memory_usage
+ self.peak_activation_memory
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment