Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenPCDet
Commits
25d9f503
Commit
25d9f503
authored
Aug 21, 2022
by
Shaoshuai Shi
Browse files
Support showing GPU utilization/memory during training
parent
d35088b6
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
14 additions
and
4 deletions
+14
-4
tools/train.py
tools/train.py
+5
-1
tools/train_utils/train_utils.py
tools/train_utils/train_utils.py
+9
-3
No files found.
tools/train.py
View file @
25d9f503
...
...
@@ -47,6 +47,8 @@ def parse_config():
# Logging / checkpointing cadence options added by this commit.
# Defaults preserve the previous behavior (file logging on, gpustat shown).
parser.add_argument('--use_tqdm_to_record', action='store_true', default=False,
                    help='if True, the intermediate losses will not be logged to file, only tqdm will be used')
parser.add_argument('--logger_iter_interval', type=int, default=50,
                    help='number of training iterations between two intermediate log records')
parser.add_argument('--ckpt_save_time_interval', type=int, default=300,
                    help='in terms of seconds')
parser.add_argument('--wo_gpu_stat', action='store_true',
                    help='if set, do not periodically print GPU utilization/memory via gpustat')
args = parser.parse_args()
...
...
@@ -162,6 +164,7 @@ def main():
# -----------------------start training---------------------------
logger
.
info
(
'**********************Start training %s/%s(%s)**********************'
%
(
cfg
.
EXP_GROUP_PATH
,
cfg
.
TAG
,
args
.
extra_tag
))
train_model
(
model
,
optimizer
,
...
...
@@ -183,7 +186,8 @@ def main():
logger
=
logger
,
logger_iter_interval
=
args
.
logger_iter_interval
,
ckpt_save_time_interval
=
args
.
ckpt_save_time_interval
,
use_logger_to_record
=
not
args
.
use_tqdm_to_record
use_logger_to_record
=
not
args
.
use_tqdm_to_record
,
show_gpu_stat
=
not
args
.
wo_gpu_stat
)
if
hasattr
(
train_set
,
'use_shared_memory'
)
and
train_set
.
use_shared_memory
:
...
...
tools/train_utils/train_utils.py
View file @
25d9f503
...
...
@@ -11,7 +11,7 @@ from pcdet.utils import common_utils, commu_utils
def
train_one_epoch
(
model
,
optimizer
,
train_loader
,
model_func
,
lr_scheduler
,
accumulated_iter
,
optim_cfg
,
rank
,
tbar
,
total_it_each_epoch
,
dataloader_iter
,
tb_log
=
None
,
leave_pbar
=
False
,
use_logger_to_record
=
False
,
logger
=
None
,
logger_iter_interval
=
50
,
cur_epoch
=
None
,
total_epochs
=
None
,
ckpt_save_dir
=
None
,
ckpt_save_time_interval
=
300
):
total_epochs
=
None
,
ckpt_save_dir
=
None
,
ckpt_save_time_interval
=
300
,
show_gpu_stat
=
False
):
if
total_it_each_epoch
==
len
(
train_loader
):
dataloader_iter
=
iter
(
train_loader
)
...
...
@@ -93,6 +93,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
f
'time_cost(epoch):
{
tbar
.
format_interval
(
trained_time_each_epoch
)
}
/
{
tbar
.
format_interval
(
remaining_second_each_epoch
)
}
, '
f
'time_cost(all):
{
tbar
.
format_interval
(
trained_time_past_all
)
}
/
{
tbar
.
format_interval
(
remaining_second_all
)
}
, '
f
'
{
disp_str
}
'
)
if show_gpu_stat and accumulated_iter % (3 * logger_iter_interval) == 0:
    # Periodically (every 3 logging intervals) dump per-GPU utilization/memory.
    # NOTE: os.system() does not raise when the command is missing — it returns
    # the shell's non-zero exit status — so a try/except around it can never
    # catch a missing 'gpustat' binary. Key the install hint off the status.
    if os.system('gpustat') != 0:
        print('To show the GPU utilization, please install gpustat through "pip install gpustat"')
else
:
pbar
.
update
()
pbar
.
set_postfix
(
dict
(
total_it
=
accumulated_iter
))
...
...
@@ -124,7 +129,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
start_epoch
,
total_epochs
,
start_iter
,
rank
,
tb_log
,
ckpt_save_dir
,
train_sampler
=
None
,
lr_warmup_scheduler
=
None
,
ckpt_save_interval
=
1
,
max_ckpt_save_num
=
50
,
merge_all_iters_to_one_epoch
=
False
,
use_logger_to_record
=
False
,
logger
=
None
,
logger_iter_interval
=
None
,
ckpt_save_time_interval
=
None
):
use_logger_to_record
=
False
,
logger
=
None
,
logger_iter_interval
=
None
,
ckpt_save_time_interval
=
None
,
show_gpu_stat
=
False
):
accumulated_iter
=
start_iter
with
tqdm
.
trange
(
start_epoch
,
total_epochs
,
desc
=
'epochs'
,
dynamic_ncols
=
True
,
leave
=
(
rank
==
0
))
as
tbar
:
total_it_each_epoch
=
len
(
train_loader
)
...
...
@@ -155,7 +160,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
cur_epoch
=
cur_epoch
,
total_epochs
=
total_epochs
,
use_logger_to_record
=
use_logger_to_record
,
logger
=
logger
,
logger_iter_interval
=
logger_iter_interval
,
ckpt_save_dir
=
ckpt_save_dir
,
ckpt_save_time_interval
=
ckpt_save_time_interval
ckpt_save_dir
=
ckpt_save_dir
,
ckpt_save_time_interval
=
ckpt_save_time_interval
,
show_gpu_stat
=
show_gpu_stat
)
# save trained model
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment