Unverified commit 1ffc4dee, authored by Hz, Ji and committed by GitHub
Browse files

enable memory tracker metrics for npu (#27280)

parent d7dcfa89
...@@ -459,6 +459,11 @@ class TrainerMemoryTracker: ...@@ -459,6 +459,11 @@ class TrainerMemoryTracker:
elif is_torch_xpu_available(): elif is_torch_xpu_available():
import torch import torch
self.torch = torch
self.gpu = {}
elif is_torch_npu_available():
import torch
self.torch = torch self.torch = torch
self.gpu = {} self.gpu = {}
else: else:
...@@ -517,6 +522,9 @@ class TrainerMemoryTracker: ...@@ -517,6 +522,9 @@ class TrainerMemoryTracker:
elif is_torch_xpu_available(): elif is_torch_xpu_available():
self.torch.xpu.reset_peak_memory_stats() self.torch.xpu.reset_peak_memory_stats()
self.torch.xpu.empty_cache() self.torch.xpu.empty_cache()
elif is_torch_npu_available():
self.torch.npu.reset_peak_memory_stats()
self.torch.npu.empty_cache()
# gpu # gpu
if self.torch is not None: if self.torch is not None:
...@@ -524,6 +532,8 @@ class TrainerMemoryTracker: ...@@ -524,6 +532,8 @@ class TrainerMemoryTracker:
self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated() self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
elif is_torch_xpu_available(): elif is_torch_xpu_available():
self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated() self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
elif is_torch_npu_available():
self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()
# cpu # cpu
self.cpu_mem_used_at_start = self.cpu_mem_used() self.cpu_mem_used_at_start = self.cpu_mem_used()
...@@ -551,6 +561,8 @@ class TrainerMemoryTracker: ...@@ -551,6 +561,8 @@ class TrainerMemoryTracker:
self.torch.cuda.empty_cache() self.torch.cuda.empty_cache()
elif is_torch_xpu_available(): elif is_torch_xpu_available():
self.torch.xpu.empty_cache() self.torch.xpu.empty_cache()
elif is_torch_npu_available():
self.torch.npu.empty_cache()
# concepts: # concepts:
# - alloc_delta: the difference of allocated memory between the end and the start # - alloc_delta: the difference of allocated memory between the end and the start
...@@ -565,6 +577,9 @@ class TrainerMemoryTracker: ...@@ -565,6 +577,9 @@ class TrainerMemoryTracker:
elif is_torch_xpu_available(): elif is_torch_xpu_available():
self.gpu_mem_used_now = self.torch.xpu.memory_allocated() self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated() self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
elif is_torch_npu_available():
self.gpu_mem_used_now = self.torch.npu.memory_allocated()
self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated()
else: else:
raise ValueError("No available GPU device found!") raise ValueError("No available GPU device found!")
......
...@@ -1944,18 +1944,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): ...@@ -1944,18 +1944,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
metrics = trainer.train().metrics metrics = trainer.train().metrics
check_func("init_mem_cpu_alloc_delta", metrics) check_func("init_mem_cpu_alloc_delta", metrics)
check_func("train_mem_cpu_alloc_delta", metrics) check_func("train_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0: if backend_device_count(torch_device) > 0:
check_func("init_mem_gpu_alloc_delta", metrics) check_func("init_mem_gpu_alloc_delta", metrics)
check_func("train_mem_gpu_alloc_delta", metrics) check_func("train_mem_gpu_alloc_delta", metrics)
metrics = trainer.evaluate() metrics = trainer.evaluate()
check_func("eval_mem_cpu_alloc_delta", metrics) check_func("eval_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0: if backend_device_count(torch_device) > 0:
check_func("eval_mem_gpu_alloc_delta", metrics) check_func("eval_mem_gpu_alloc_delta", metrics)
metrics = trainer.predict(RegressionDataset()).metrics metrics = trainer.predict(RegressionDataset()).metrics
check_func("test_mem_cpu_alloc_delta", metrics) check_func("test_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0: if backend_device_count(torch_device) > 0:
check_func("test_mem_gpu_alloc_delta", metrics) check_func("test_mem_gpu_alloc_delta", metrics)
def test_mem_metrics(self): def test_mem_metrics(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment