"tools/vscode:/vscode.git/clone" did not exist on "226bf4507f7b6af385f8b6d75bb85d15de5750b0"
Commit 25d9f503 authored by Shaoshuai Shi
Browse files

support to show gpu utilization/memory when training

parent d35088b6
...@@ -47,6 +47,8 @@ def parse_config(): ...@@ -47,6 +47,8 @@ def parse_config():
parser.add_argument('--use_tqdm_to_record', action='store_true', default=False, help='if True, the intermediate losses will not be logged to file, only tqdm will be used') parser.add_argument('--use_tqdm_to_record', action='store_true', default=False, help='if True, the intermediate losses will not be logged to file, only tqdm will be used')
parser.add_argument('--logger_iter_interval', type=int, default=50, help='') parser.add_argument('--logger_iter_interval', type=int, default=50, help='')
parser.add_argument('--ckpt_save_time_interval', type=int, default=300, help='in terms of seconds') parser.add_argument('--ckpt_save_time_interval', type=int, default=300, help='in terms of seconds')
parser.add_argument('--wo_gpu_stat', action='store_true', help='')
args = parser.parse_args() args = parser.parse_args()
...@@ -162,6 +164,7 @@ def main(): ...@@ -162,6 +164,7 @@ def main():
# -----------------------start training--------------------------- # -----------------------start training---------------------------
logger.info('**********************Start training %s/%s(%s)**********************' logger.info('**********************Start training %s/%s(%s)**********************'
% (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag)) % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
train_model( train_model(
model, model,
optimizer, optimizer,
...@@ -183,7 +186,8 @@ def main(): ...@@ -183,7 +186,8 @@ def main():
logger=logger, logger=logger,
logger_iter_interval=args.logger_iter_interval, logger_iter_interval=args.logger_iter_interval,
ckpt_save_time_interval=args.ckpt_save_time_interval, ckpt_save_time_interval=args.ckpt_save_time_interval,
use_logger_to_record=not args.use_tqdm_to_record use_logger_to_record=not args.use_tqdm_to_record,
show_gpu_stat=not args.wo_gpu_stat
) )
if hasattr(train_set, 'use_shared_memory') and train_set.use_shared_memory: if hasattr(train_set, 'use_shared_memory') and train_set.use_shared_memory:
......
...@@ -11,7 +11,7 @@ from pcdet.utils import common_utils, commu_utils ...@@ -11,7 +11,7 @@ from pcdet.utils import common_utils, commu_utils
def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, accumulated_iter, optim_cfg, def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, accumulated_iter, optim_cfg,
rank, tbar, total_it_each_epoch, dataloader_iter, tb_log=None, leave_pbar=False, rank, tbar, total_it_each_epoch, dataloader_iter, tb_log=None, leave_pbar=False,
use_logger_to_record=False, logger=None, logger_iter_interval=50, cur_epoch=None, use_logger_to_record=False, logger=None, logger_iter_interval=50, cur_epoch=None,
total_epochs=None, ckpt_save_dir=None, ckpt_save_time_interval=300): total_epochs=None, ckpt_save_dir=None, ckpt_save_time_interval=300, show_gpu_stat=False):
if total_it_each_epoch == len(train_loader): if total_it_each_epoch == len(train_loader):
dataloader_iter = iter(train_loader) dataloader_iter = iter(train_loader)
...@@ -93,6 +93,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac ...@@ -93,6 +93,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
f'time_cost(epoch): {tbar.format_interval(trained_time_each_epoch)}/{tbar.format_interval(remaining_second_each_epoch)}, ' f'time_cost(epoch): {tbar.format_interval(trained_time_each_epoch)}/{tbar.format_interval(remaining_second_each_epoch)}, '
f'time_cost(all): {tbar.format_interval(trained_time_past_all)}/{tbar.format_interval(remaining_second_all)}, ' f'time_cost(all): {tbar.format_interval(trained_time_past_all)}/{tbar.format_interval(remaining_second_all)}, '
f'{disp_str}') f'{disp_str}')
if show_gpu_stat and accumulated_iter % (3 * logger_iter_interval) == 0:
try:
os.system('gpustat')
except:
print('To show the GPU utilization, please install gpustat through "pip install gpustat"')
else: else:
pbar.update() pbar.update()
pbar.set_postfix(dict(total_it=accumulated_iter)) pbar.set_postfix(dict(total_it=accumulated_iter))
...@@ -124,7 +129,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_ ...@@ -124,7 +129,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
start_epoch, total_epochs, start_iter, rank, tb_log, ckpt_save_dir, train_sampler=None, start_epoch, total_epochs, start_iter, rank, tb_log, ckpt_save_dir, train_sampler=None,
lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50, lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50,
merge_all_iters_to_one_epoch=False, merge_all_iters_to_one_epoch=False,
use_logger_to_record=False, logger=None, logger_iter_interval=None, ckpt_save_time_interval=None): use_logger_to_record=False, logger=None, logger_iter_interval=None, ckpt_save_time_interval=None, show_gpu_stat=False):
accumulated_iter = start_iter accumulated_iter = start_iter
with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar: with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar:
total_it_each_epoch = len(train_loader) total_it_each_epoch = len(train_loader)
...@@ -155,7 +160,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_ ...@@ -155,7 +160,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
cur_epoch=cur_epoch, total_epochs=total_epochs, cur_epoch=cur_epoch, total_epochs=total_epochs,
use_logger_to_record=use_logger_to_record, use_logger_to_record=use_logger_to_record,
logger=logger, logger_iter_interval=logger_iter_interval, logger=logger, logger_iter_interval=logger_iter_interval,
ckpt_save_dir=ckpt_save_dir, ckpt_save_time_interval=ckpt_save_time_interval ckpt_save_dir=ckpt_save_dir, ckpt_save_time_interval=ckpt_save_time_interval,
show_gpu_stat=show_gpu_stat
) )
# save trained model # save trained model
......
Markdown is supported
Attach a file by drag & drop or click to upload
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment