chenpangpang / transformers · Commit 1ffc4dee (unverified)

enable memory tracker metrics for npu (#27280)

Authored by Hz, Ji on Nov 06, 2023 · Committed via GitHub on Nov 06, 2023
Parent: d7dcfa89
Showing 2 changed files with 18 additions and 3 deletions

  src/transformers/trainer_utils.py  +15 -0
  tests/trainer/test_trainer.py       +3 -3
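For context, TrainerMemoryTracker only reports these metrics when a run opts in with skip_memory_metrics=False. Below is a minimal sketch of how the tracked metrics surface to user code; the toy model and dataset are illustrative stand-ins, not code from this commit:

    import torch
    from torch import nn
    from torch.utils.data import Dataset

    from transformers import Trainer, TrainingArguments


    class ToyDataset(Dataset):
        # Eight random regression samples, just enough to drive one train() call.
        def __len__(self):
            return 8

        def __getitem__(self, i):
            return {"x": torch.randn(4), "labels": torch.randn(1)}


    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(4, 1)

        def forward(self, x, labels=None):
            logits = self.linear(x)
            loss = nn.functional.mse_loss(logits, labels) if labels is not None else None
            return {"loss": loss, "logits": logits}


    args = TrainingArguments(output_dir="out", skip_memory_metrics=False, num_train_epochs=1)
    metrics = Trainer(model=ToyModel(), args=args, train_dataset=ToyDataset()).train().metrics
    # On CUDA, XPU, and (after this change) Ascend NPU, the *_mem_gpu_* keys
    # are populated alongside the *_mem_cpu_* ones checked in the test below.
    print({k: v for k, v in metrics.items() if "_mem_" in k})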
src/transformers/trainer_utils.py
@@ -459,6 +459,11 @@ class TrainerMemoryTracker:
         elif is_torch_xpu_available():
             import torch

             self.torch = torch
             self.gpu = {}
+        elif is_torch_npu_available():
+            import torch
+
+            self.torch = torch
+            self.gpu = {}
         else:
@@ -517,6 +522,9 @@ class TrainerMemoryTracker:
         elif is_torch_xpu_available():
             self.torch.xpu.reset_peak_memory_stats()
             self.torch.xpu.empty_cache()
+        elif is_torch_npu_available():
+            self.torch.npu.reset_peak_memory_stats()
+            self.torch.npu.empty_cache()

         # gpu
         if self.torch is not None:
@@ -524,6 +532,8 @@
             self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated()
         elif is_torch_xpu_available():
             self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated()
+        elif is_torch_npu_available():
+            self.gpu_mem_used_at_start = self.torch.npu.memory_allocated()

         # cpu
         self.cpu_mem_used_at_start = self.cpu_mem_used()
@@ -551,6 +561,8 @@
             self.torch.cuda.empty_cache()
         elif is_torch_xpu_available():
             self.torch.xpu.empty_cache()
+        elif is_torch_npu_available():
+            self.torch.npu.empty_cache()

         # concepts:
         # - alloc_delta: the difference of allocated memory between the end and the start
@@ -565,6 +577,9 @@
         elif is_torch_xpu_available():
             self.gpu_mem_used_now = self.torch.xpu.memory_allocated()
             self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated()
+        elif is_torch_npu_available():
+            self.gpu_mem_used_now = self.torch.npu.memory_allocated()
+            self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated()
         else:
             raise ValueError("No available GPU device found!")
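All five hunks above follow one pattern: once torch_npu is imported, it registers a torch.npu namespace whose memory API mirrors torch.cuda and torch.xpu, so each availability branch can make the same calls. A rough sketch of that symmetry, assuming the matching torch build is installed (peak_alloc is a hypothetical helper, not from the diff):

    import torch

    def peak_alloc(backend: str) -> int:
        # backend is one of "cuda", "xpu", "npu"; each namespace exposes the
        # same reset_peak_memory_stats / max_memory_allocated surface, which
        # is why the tracker's branches are line-for-line identical.
        mod = getattr(torch, backend)
        mod.reset_peak_memory_stats()
        return mod.max_memory_allocated()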
tests/trainer/test_trainer.py
@@ -1944,18 +1944,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         metrics = trainer.train().metrics
         check_func("init_mem_cpu_alloc_delta", metrics)
         check_func("train_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("init_mem_gpu_alloc_delta", metrics)
             check_func("train_mem_gpu_alloc_delta", metrics)

         metrics = trainer.evaluate()
         check_func("eval_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("eval_mem_gpu_alloc_delta", metrics)

         metrics = trainer.predict(RegressionDataset()).metrics
         check_func("test_mem_cpu_alloc_delta", metrics)
-        if torch.cuda.device_count() > 0:
+        if backend_device_count(torch_device) > 0:
             check_func("test_mem_gpu_alloc_delta", metrics)

     def test_mem_metrics(self):
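The test change replaces a CUDA-only guard with the device-agnostic helpers from transformers.testing_utils. A usage sketch based on the names in the diff; the behavior described in the comments is inferred from how the test uses them:

    from transformers.testing_utils import backend_device_count, torch_device

    # torch_device resolves to the available accelerator ("cuda", "npu",
    # "xpu", ...) falling back to "cpu", and backend_device_count(torch_device)
    # dispatches to the matching device_count(), so the gpu-metric checks in
    # the hunk above now run on Ascend NPU machines instead of being skipped.
    if backend_device_count(torch_device) > 0:
        print(f"accelerator present: {torch_device}")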