Unverified Commit e602fac4 authored by Yan haixu, committed by GitHub

Use timm style to record log by logger (#1229)



* use timm style to record log by logger
Co-authored-by: hova88 <yanhaixu@senior.auto>
parent 4713332c
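The "timm style" the commit refers to is the convention of tracking running statistics with an `AverageMeter` and emitting fixed-width, aligned log lines. The repo's meter lives in `pcdet.utils.common_utils`; the sketch below is an assumption based on timm's convention, not the repo's exact code:

```python
class AverageMeter:
    """Track the most recent value and a running, count-weighted average."""

    def __init__(self):
        self.val = 0.0   # last value seen
        self.avg = 0.0   # running average
        self.sum = 0.0   # weighted sum of all values
        self.count = 0   # total weight (e.g. number of samples)

    def update(self, val, n=1):
        n = 1 if n is None else n  # batch_size may arrive as None
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
```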
@@ -7,7 +7,7 @@ from easydict import EasyDict
 def log_config_to_file(cfg, pre='cfg', logger=None):
     for key, val in cfg.items():
         if isinstance(cfg[key], EasyDict):
-            logger.info('\n%s.%s = edict()' % (pre, key))
+            logger.info('----------- %s -----------' % (key))
             log_config_to_file(cfg[key], pre=pre + '.' + key, logger=logger)
             continue
         logger.info('%s.%s: %s' % (pre, key, val))
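For reference, the updated helper now prints a dashed banner per nested sub-config instead of the old `edict()` line. A hedged usage sketch, assuming `log_config_to_file` from the patched config module is in scope (the toy config is illustrative, not from the repo):

```python
import logging

from easydict import EasyDict

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

cfg = EasyDict({'CLASS_NAMES': ['Car'], 'OPTIMIZATION': EasyDict({'LR': 0.003})})
log_config_to_file(cfg, logger=logger)
# Expected output:
# cfg.CLASS_NAMES: ['Car']
# ----------- OPTIMIZATION -----------
# cfg.OPTIMIZATION.LR: 0.003
```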
@@ -92,7 +92,7 @@ def main():
     output_dir.mkdir(parents=True, exist_ok=True)
     ckpt_dir.mkdir(parents=True, exist_ok=True)
-    log_file = output_dir / ('log_train_%s.txt' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
+    log_file = output_dir / ('train_%s.log' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
     logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)
     # log to file
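`common_utils.create_logger` is expected to write to both the console and the new `.log` file while silencing non-zero ranks in distributed runs. A minimal sketch of that pattern, as an approximation rather than the repo's exact implementation:

```python
import logging

def create_logger(log_file=None, rank=0, log_level=logging.INFO):
    # Approximation of the repo helper: every rank gets a logger, but only
    # rank 0 logs at full verbosity; other ranks are raised to ERROR.
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level if rank == 0 else logging.ERROR)
    formatter = logging.Formatter('%(asctime)s  %(levelname)5s  %(message)s')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)
    if log_file is not None:
        file_handler = logging.FileHandler(str(log_file))
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
```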
@@ -101,7 +101,10 @@ def main():
     logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)
     if dist_train:
-        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
+        logger.info('Training in distributed mode : total_batch_size: %d' % (total_gpus * args.batch_size))
+    else:
+        logger.info('Training with a single process')
     for key, val in vars(args).items():
         logger.info('{:16} {}'.format(key, val))
     log_config_to_file(cfg, logger=logger)
@@ -110,7 +113,7 @@ def main():
     tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None
-    # -----------------------create dataloader & network & optimizer---------------------------
+    logger.info("----------- Create dataloader & network & optimizer -----------")
     train_set, train_loader, train_sampler = build_dataloader(
         dataset_cfg=cfg.DATA_CONFIG,
         class_names=cfg.CLASS_NAMES,
@@ -157,6 +160,7 @@ def main():
     model.train()  # set train mode before wrapping in DistributedDataParallel, to support fixing some parameters
     if dist_train:
         model = nn.parallel.DistributedDataParallel(model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
+    logger.info(f'----------- Model {cfg.MODEL.NAME} created, param count: {sum([m.numel() for m in model.parameters()])} -----------')
     logger.info(model)
     lr_scheduler, lr_warmup_scheduler = build_scheduler(
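The added banner counts parameters with `sum([m.numel() for m in model.parameters()])`. A quick standalone check of that expression on a toy module:

```python
import torch.nn as nn

model = nn.Linear(4, 2)  # toy stand-in for the detector
param_count = sum(m.numel() for m in model.parameters())
print(param_count)  # 10: a 2x4 weight matrix plus a 2-element bias
```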
@@ -25,6 +25,7 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
     data_time = common_utils.AverageMeter()
     batch_time = common_utils.AverageMeter()
     forward_time = common_utils.AverageMeter()
+    losses_m = common_utils.AverageMeter()
     end = time.time()
     for cur_it in range(start_it, total_it_each_epoch):
@@ -73,9 +74,12 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
         # log to console and tensorboard
         if rank == 0:
+            batch_size = batch.get('batch_size', None)
             data_time.update(avg_data_time)
             forward_time.update(avg_forward_time)
             batch_time.update(avg_batch_time)
+            losses_m.update(loss.item(), batch_size)
             disp_dict.update({
                 'loss': loss.item(), 'lr': cur_lr, 'd_time': f'{data_time.val:.2f}({data_time.avg:.2f})',
@@ -90,14 +94,28 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
             trained_time_each_epoch = pbar.format_dict['elapsed']
             remaining_second_each_epoch = second_each_iter * (total_it_each_epoch - cur_it)
             remaining_second_all = second_each_iter * ((total_epochs - cur_epoch) * total_it_each_epoch - cur_it)
-            disp_str = ', '.join([f'{key}={val}' for key, val in disp_dict.items() if key != 'lr'])
-            disp_str += f', lr={disp_dict["lr"]}'
-            batch_size = batch.get('batch_size', None)
-            logger.info(f'epoch: {cur_epoch}/{total_epochs}, acc_iter={accumulated_iter}, cur_iter={cur_it}/{total_it_each_epoch}, batch_size={batch_size}, '
-                        f'time_cost(epoch): {tbar.format_interval(trained_time_each_epoch)}/{tbar.format_interval(remaining_second_each_epoch)}, '
-                        f'time_cost(all): {tbar.format_interval(trained_time_past_all)}/{tbar.format_interval(remaining_second_all)}, '
-                        f'{disp_str}')
+            logger.info(
+                'Train: {:>4d}/{} ({:>3.0f}%) [{:>4d}/{} ({:>3.0f}%)] '
+                'Loss: {loss.val:#.4g} ({loss.avg:#.3g}) '
+                'LR: {lr:.3e} '
+                f'Time cost: {tbar.format_interval(trained_time_each_epoch)}/{tbar.format_interval(remaining_second_each_epoch)} '
+                f'[{tbar.format_interval(trained_time_past_all)}/{tbar.format_interval(remaining_second_all)}] '
+                'Acc_iter {acc_iter:<10d} '
+                'Data time: {data_time.val:.2f}({data_time.avg:.2f}) '
+                'Forward time: {forward_time.val:.2f}({forward_time.avg:.2f}) '
+                'Batch time: {batch_time.val:.2f}({batch_time.avg:.2f})'.format(
+                    cur_epoch + 1, total_epochs, 100. * (cur_epoch + 1) / total_epochs,
+                    cur_it, total_it_each_epoch, 100. * cur_it / total_it_each_epoch,
+                    loss=losses_m,
+                    lr=cur_lr,
+                    acc_iter=accumulated_iter,
+                    data_time=data_time,
+                    forward_time=forward_time,
+                    batch_time=batch_time
+                )
+            )
             if show_gpu_stat and accumulated_iter % (3 * logger_iter_interval) == 0:
                 # To show the GPU utilization, please install gpustat through "pip install gpustat"
                 gpu_info = os.popen('gpustat').read()
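The new log line leans on two `str.format` features worth noting: a field name can dereference attributes of the passed object (`{loss.val}` reads `.val` off the meter), and the `#` flag on the `g` type keeps the decimal point and trailing zeros so loss columns stay aligned across lines. A small illustration with a hypothetical stand-in meter:

```python
class Meter:
    """Stand-in exposing the two attributes the format string reads."""
    def __init__(self, val, avg):
        self.val = val
        self.avg = avg

m = Meter(val=0.25, avg=0.2468)
# '#.4g' renders four significant digits and forces the decimal point
print('Loss: {loss.val:#.4g} ({loss.avg:#.3g})'.format(loss=m))
# Loss: 0.2500 (0.247)
```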