Commit 405730dd authored by Shaoshuai Shi's avatar Shaoshuai Shi
Browse files

add training and testing tool codes

parent 635f1e94
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import tqdm
import time
import pickle
import numpy as np
import torch
from mmpcdet.utils import common_utils
def statistics_info(cfg, ret_dict, metric, disp_dict):
for cur_thresh in cfg.MODEL.POST_PROCESSING.RECALL_THRESH_LIST:
metric['recall_roi_%s' % str(cur_thresh)] += ret_dict['roi_%s' % str(cur_thresh)]
metric['recall_rcnn_%s' % str(cur_thresh)] += ret_dict['rcnn_%s' % str(cur_thresh)]
metric['gt_num'] += ret_dict['gt']
min_thresh = cfg.MODEL.POST_PROCESSING.RECALL_THRESH_LIST[0]
disp_dict['recall_%s' % str(min_thresh)] = \
'(%d, %d) / %d' % (metric['recall_roi_%s' % str(min_thresh)], metric['recall_rcnn_%s' % str(min_thresh)], metric['gt_num'])
def eval_one_epoch(cfg, model, dataloader, epoch_id, logger, dist_test=False, save_to_file=False, result_dir=None):
result_dir.mkdir(parents=True, exist_ok=True)
if save_to_file:
final_output_dir = result_dir / 'final_result' / 'data'
final_output_dir.mkdir(parents=True, exist_ok=True)
else:
final_output_dir = None
metric = {
'gt_num': 0,
}
for cur_thresh in cfg.MODEL.POST_PROCESSING.RECALL_THRESH_LIST:
metric['recall_roi_%s' % str(cur_thresh)] = 0
metric['recall_rcnn_%s' % str(cur_thresh)] = 0
dataset = dataloader.dataset
class_names = dataset.class_names
det_annos = []
logger.info('*************** EPOCH %s EVALUATION *****************' % epoch_id)
if dist_test:
raise NotImplementedError
model.eval()
if cfg.LOCAL_RANK == 0:
progress_bar = tqdm.tqdm(total=len(dataloader), leave=True, desc='eval', dynamic_ncols=True)
start_time = time.time()
for i, batch_dict in enumerate(dataloader):
for key, val in batch_dict.items():
if not isinstance(val, np.ndarray):
continue
if key in ['frame_id', 'calib', 'image_shape']:
continue
batch_dict[key] = torch.from_numpy(val).float().cuda()
with torch.no_grad():
pred_dicts, ret_dict = model(batch_dict)
disp_dict = {}
statistics_info(cfg, ret_dict, metric, disp_dict)
annos = dataset.generate_prediction_dicts(
batch_dict, pred_dicts, class_names,
output_path=final_output_dir if save_to_file else None
)
det_annos += annos
if cfg.LOCAL_RANK == 0:
progress_bar.set_postfix(disp_dict)
progress_bar.update()
if cfg.LOCAL_RANK == 0:
progress_bar.close()
if dist_test:
rank, world_size = common_utils.get_dist_info()
raise NotImplementedError
logger.info('*************** Performance of EPOCH %s *****************' % epoch_id)
sec_per_example = (time.time() - start_time) / len(dataloader.dataset)
logger.info('Generate label finished(sec_per_example: %.4f second).' % sec_per_example)
if cfg.LOCAL_RANK != 0:
return {}
ret_dict = {}
if dist_test:
raise NotImplementedError
gt_num_cnt = metric['gt_num']
for cur_thresh in cfg.MODEL.POST_PROCESSING.RECALL_THRESH_LIST:
cur_roi_recall = metric['recall_roi_%s' % str(cur_thresh)] / max(gt_num_cnt, 1)
cur_rcnn_recall = metric['recall_rcnn_%s' % str(cur_thresh)] / max(gt_num_cnt, 1)
logger.info('recall_roi_%s: %f' % (cur_thresh, cur_roi_recall))
logger.info('recall_rcnn_%s: %f' % (cur_thresh, cur_rcnn_recall))
ret_dict['recall_roi_%s' % str(cur_thresh)] = cur_roi_recall
ret_dict['recall_rcnn_%s' % str(cur_thresh)] = cur_rcnn_recall
total_pred_objects = 0
for anno in det_annos:
total_pred_objects += anno['name'].__len__()
logger.info('Average predicted number of objects(%d samples): %.3f'
% (len(det_annos), total_pred_objects / max(1, len(det_annos))))
with open(result_dir / 'result.pkl', 'wb') as f:
pickle.dump(det_annos, f)
result_str, result_dict = dataset.evaluation(
det_annos, class_names,
eval_metric=cfg.MODEL.POST_PROCESSING.EVAL_METRIC
)
logger.info(result_str)
ret_dict.update(result_dict)
logger.info('Result is save to %s' % result_dir)
logger.info('****************Evaluation done.*****************')
return ret_dict
if __name__ == '__main__':
pass
import os
import torch
from tensorboardX import SummaryWriter
import time
import glob
import re
import datetime
import argparse
from pathlib import Path
import torch.distributed as dist
from mmpcdet.datasets import build_dataloader
from mmpcdet.models import build_network
from mmpcdet.utils import common_utils
from mmpcdet.config import cfg, cfg_from_list, cfg_from_yaml_file, log_config_to_file
from eval_utils import eval_utils
def parge_config():
parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--cfg_file', type=str, default=None, help='specify the config for training')
parser.add_argument('--batch_size', type=int, default=16, required=False, help='batch size for training')
parser.add_argument('--epochs', type=int, default=80, required=False, help='Number of epochs to train for')
parser.add_argument('--workers', type=int, default=4, help='number of workers for dataloader')
parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment')
parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
parser.add_argument('--mgpus', action='store_true', default=False, help='whether to use multiple gpu')
parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
parser.add_argument('--tcp_port', type=int, default=18888, help='tcp port for distrbuted training')
parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed training')
parser.add_argument('--set', dest='set_cfgs', default=None, nargs=argparse.REMAINDER,
help='set extra config keys if needed')
parser.add_argument('--max_waiting_mins', type=int, default=30, help='max waiting minutes')
parser.add_argument('--start_epoch', type=int, default=0, help='')
parser.add_argument('--eval_tag', type=str, default='default', help='eval tag for this experiment')
parser.add_argument('--eval_all', action='store_true', default=False, help='whether to evaluate all checkpoints')
parser.add_argument('--ckpt_dir', type=str, default=None, help='specify a ckpt directory to be evaluated if needed')
parser.add_argument('--save_to_file', action='store_true', default=False, help='')
args = parser.parse_args()
cfg_from_yaml_file(args.cfg_file, cfg)
cfg.TAG = Path(args.cfg_file).stem
cfg.EXP_GROUP_PATH = '/'.join(args.cfg_file.split('/')[1:-1]) # remove 'cfgs' and 'xxxx.yaml'
if args.set_cfgs is not None:
cfg_from_list(args.set_cfgs, cfg)
return args, cfg
def eval_single_ckpt(model, test_loader, args, eval_output_dir, logger, epoch_id, dist_test=False):
# load checkpoint
model.load_params_from_file(filename=args.ckpt, logger=logger, to_cpu=dist_test)
model.cuda()
# start evaluation
eval_utils.eval_one_epoch(
cfg, model, test_loader, epoch_id, logger, dist_test=dist_test,
result_dir=eval_output_dir, save_to_file=args.save_to_file
)
def get_no_evaluated_ckpt(ckpt_dir, ckpt_record_file, args):
ckpt_list = glob.glob(os.path.join(ckpt_dir, '*checkpoint_epoch_*.pth'))
ckpt_list.sort(key=os.path.getmtime)
evaluated_ckpt_list = [float(x.strip()) for x in open(ckpt_record_file, 'r').readlines()]
for cur_ckpt in ckpt_list:
num_list = re.findall('checkpoint_epoch_(.*).pth', cur_ckpt)
if num_list.__len__() == 0:
continue
epoch_id = num_list[-1]
if 'optim' in epoch_id:
continue
if float(epoch_id) not in evaluated_ckpt_list and int(float(epoch_id)) >= args.start_epoch:
return epoch_id, cur_ckpt
return -1, None
def repeat_eval_ckpt(model, test_loader, args, eval_output_dir, logger, ckpt_dir, dist_test=False):
# evaluated ckpt record
ckpt_record_file = eval_output_dir / ('eval_list_%s.txt' % cfg.DATA_CONFIG.DATA_SPLIT['test'])
with open(ckpt_record_file, 'a'):
pass
# tensorboard log
if cfg.LOCAL_RANK == 0:
tb_log = SummaryWriter(log_dir=str(eval_output_dir / ('tensorboard_%s' % cfg.DATA_CONFIG.DATA_SPLIT['test'])))
total_time = 0
first_eval = True
while True:
# check whether there is checkpoint which is not evaluated
cur_epoch_id, cur_ckpt = get_no_evaluated_ckpt(ckpt_dir, ckpt_record_file, args)
if cur_epoch_id == -1 or int(float(cur_epoch_id)) < args.start_epoch:
wait_second = 30
print('Wait %s seconds for next check (progress: %.1f / %d minutes): %s \r'
% (wait_second, total_time * 1.0 / 60, args.max_waiting_mins, ckpt_dir), end='', flush=True)
time.sleep(wait_second)
total_time += 30
if total_time > args.max_waiting_mins * 60 and (first_eval is False):
break
continue
total_time = 0
first_eval = False
model.load_params_from_file(filename=cur_ckpt, logger=logger, to_cpu=dist_test)
model.cuda()
# start evaluation
cur_result_dir = eval_output_dir / ('epoch_%s' % cur_epoch_id) / cfg.DATA_CONFIG.DATA_SPLIT['test']
tb_dict = eval_utils.eval_one_epoch(
cfg, model, test_loader, cur_epoch_id, logger, dist_test=dist_test,
result_dir=cur_result_dir, save_to_file=args.save_to_file
)
if cfg.LOCAL_RANK == 0:
for key, val in tb_dict.items():
tb_log.add_scalar(key, val, cur_epoch_id)
# record this epoch which has been evaluated
with open(ckpt_record_file, 'a') as f:
print('%s' % cur_epoch_id, file=f)
logger.info('Epoch %s has been evaluated' % cur_epoch_id)
def main():
args, cfg = parge_config()
if args.launcher == 'none':
dist_test = False
else:
args.batch_size, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
args.batch_size, args.tcp_port, args.local_rank, backend='nccl'
)
dist_test = True
output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
output_dir.mkdir(parents=True, exist_ok=True)
eval_output_dir = output_dir / 'eval'
if not args.eval_all:
num_list = re.findall(r'\d+', args.ckpt) if args.ckpt is not None else []
epoch_id = num_list[-1] if num_list.__len__() > 0 else 'no_number'
eval_output_dir = eval_output_dir / ('epoch_%s' % epoch_id) / cfg.DATA_CONFIG.DATA_SPLIT['test']
else:
eval_output_dir = eval_output_dir / 'eval_all_default'
if args.eval_tag is not None:
eval_output_dir = eval_output_dir / args.eval_tag
eval_output_dir.mkdir(parents=True, exist_ok=True)
log_file = eval_output_dir / ('log_eval_%s.txt' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)
# log to file
logger.info('**********************Start logging**********************')
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys() else 'ALL'
logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)
if dist_test:
total_gpus = dist.get_world_size()
logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
for key, val in vars(args).items():
logger.info('{:16} {}'.format(key, val))
log_config_to_file(cfg, logger=logger)
ckpt_dir = args.ckpt_dir if args.ckpt_dir is not None else output_dir / 'ckpt'
test_set, test_loader, sampler = build_dataloader(
dataset_cfg=cfg.DATA_CONFIG,
class_names=cfg.CLASS_NAMES,
batch_size=args.batch_size,
dist=dist_test, workers=args.workers, logger=logger, training=False
)
model = build_network(model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), dataset=test_set)
with torch.no_grad():
if args.eval_all:
repeat_eval_ckpt(model, test_loader, args, eval_output_dir, logger, ckpt_dir, dist_test=dist_test)
else:
eval_single_ckpt(model, test_loader, args, eval_output_dir, logger, epoch_id, dist_test=dist_test)
if __name__ == '__main__':
main()
import os
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter
from pcdet.config import cfg, log_config_to_file, cfg_from_list, cfg_from_yaml_file
from pcdet.utils import common_utils
from pcdet.datasets import build_dataloader
from pcdet.models import build_network, model_fn_decorator
from train_utils.optimization import build_optimizer, build_scheduler
from train_utils.train_utils import train_model
import torch.distributed as dist
from pathlib import Path
import argparse
import datetime
import glob
def parge_config():
parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--cfg_file', type=str, default=None, help='specify the config for training')
parser.add_argument('--batch_size', type=int, default=16, required=False, help='batch size for training')
parser.add_argument('--epochs', type=int, default=30, required=False, help='number of epochs to train for')
parser.add_argument('--workers', type=int, default=4, help='number of workers for dataloader')
parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment')
parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
parser.add_argument('--pretrained_model', type=str, default=None, help='pretrained_model')
parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
parser.add_argument('--tcp_port', type=int, default=18888, help='tcp port for distrbuted training')
parser.add_argument('--sync_bn', action='store_true', default=False, help='whether to use sync bn')
parser.add_argument('--fix_random_seed', action='store_true', default=False, help='')
parser.add_argument('--ckpt_save_interval', type=int, default=1, help='number of training epochs')
parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed training')
parser.add_argument('--max_ckpt_save_num', type=int, default=30, help='max number of saved checkpoint')
parser.add_argument('--set', dest='set_cfgs', default=None, nargs=argparse.REMAINDER,
help='set extra config keys if needed')
parser.add_argument('--max_waiting_mins', type=int, default=0, help='max waiting minutes')
parser.add_argument('--start_epoch', type=int, default=0, help='')
parser.add_argument('--save_to_file', action='store_true', default=False, help='')
args = parser.parse_args()
cfg_from_yaml_file(args.cfg_file, cfg)
cfg.TAG = Path(args.cfg_file).stem
cfg.EXP_GROUP_PATH = '/'.join(args.cfg_file.split('/')[1:-1]) # remove 'cfgs' and 'xxxx.yaml'
if args.set_cfgs is not None:
cfg_from_list(args.set_cfgs, cfg)
return args, cfg
def main():
args, cfg = parge_config()
if args.launcher == 'none':
dist_train = False
else:
args.batch_size, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
args.batch_size, args.tcp_port, args.local_rank, backend='nccl'
)
dist_train = True
if args.fix_random_seed:
common_utils.set_random_seed(666)
output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
ckpt_dir = output_dir / 'ckpt'
output_dir.mkdir(parents=True, exist_ok=True)
ckpt_dir.mkdir(parents=True, exist_ok=True)
log_file = output_dir / ('log_train_%s.txt' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)
# log to file
logger.info('**********************Start logging**********************')
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys() else 'ALL'
logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)
if dist_train:
total_gpus = dist.get_world_size()
logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
for key, val in vars(args).items():
logger.info('{:16} {}'.format(key, val))
log_config_to_file(cfg, logger=logger)
if cfg.LOCAL_RANK == 0:
os.system('cp %s %s' % (args.cfg_file, output_dir))
tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None
# -----------------------create dataloader & network & optimizer---------------------------
train_set, train_loader, train_sampler = build_dataloader(
dataset_cfg=cfg.DATA_CONFIG,
class_names=cfg.CLASS_NAMES,
batch_size=args.batch_size,
dist=dist_train, workers=args.workers,
logger=logger,
training=True
)
model = build_network(model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), dataset=train_set)
if args.sync_bn:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
model.cuda()
optimizer = build_optimizer(model, cfg.OPTIMIZATION)
# load checkpoint if it is possible
start_epoch = it = 0
last_epoch = -1
if args.pretrained_model is not None:
model.load_params_from_file(filename=args.pretrained_model, to_cpu=dist, logger=logger)
if args.ckpt is not None:
it, start_epoch = model.load_params_with_optimizer(args.ckpt, to_cpu=dist, optimizer=optimizer, logger=logger)
last_epoch = start_epoch + 1
else:
ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
if len(ckpt_list) > 0:
ckpt_list.sort(key=os.path.getmtime)
it, start_epoch = model.load_params_with_optimizer(
ckpt_list[-1], to_cpu=dist, optimizer=optimizer, logger=logger
)
last_epoch = start_epoch + 1
model.train() # before wrap to DistributedDataParallel to support fixed some parameters
if dist_train:
model = nn.parallel.DistributedDataParallel(model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
logger.info(model)
total_iters_each_epoch = len(train_loader) if not args.merge_all_iters_to_one_epoch else len(train_loader) // args.epochs
lr_scheduler, lr_warmup_scheduler = build_scheduler(
optimizer, total_iters_each_epoch=total_iters_each_epoch, total_epochs=args.epochs,
last_epoch=last_epoch, optim_cfg=cfg.OPTIMIZATION
)
# -----------------------start training---------------------------
logger.info('**********************Start training %s/%s(%s)**********************'
% (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
train_model(
model,
optimizer,
train_loader,
model_func=model_fn_decorator(),
lr_scheduler=lr_scheduler,
optim_cfg=cfg.OPTIMIZATION,
start_epoch=start_epoch,
total_epochs=args.epochs,
start_iter=it,
rank=cfg.LOCAL_RANK,
tb_log=tb_log,
ckpt_save_dir=ckpt_dir,
train_sampler=train_sampler,
lr_warmup_scheduler=lr_warmup_scheduler,
ckpt_save_interval=args.ckpt_save_interval,
max_ckpt_save_num=args.max_ckpt_save_num,
merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch
)
logger.info('**********************End training %s/%s(%s)**********************\n\n\n'
% (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
if __name__ == '__main__':
main()
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_sched
from functools import partial
from .fastai_optim import OptimWrapper
from .learning_schedules_fastai import OneCycle, CosineWarmupLR
def build_optimizer(model, optim_cfg):
if optim_cfg.OPTIMIZER == 'adam':
optimizer = optim.Adam(model.parameters(), lr=optim_cfg.LR, weight_decay=optim_cfg.WEIGHT_DECAY)
elif optim_cfg.OPTIMIZER == 'sgd':
optimizer = optim.SGD(
model.parameters(), lr=optim_cfg.LR, weight_decay=optim_cfg.WEIGHT_DECAY,
momentum=optim_cfg.MOMENTUM
)
elif optim_cfg.OPTIMIZER == 'adam_onecycle':
def children(m: nn.Module):
return list(m.children())
def num_children(m: nn.Module) -> int:
return len(children(m))
flatten_model = lambda m: sum(map(flatten_model, m.children()), []) if num_children(m) else [m]
get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]
optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
optimizer = OptimWrapper.create(
optimizer_func, 3e-3, get_layer_groups(model), wd=optim_cfg.WEIGHT_DECAY, true_wd=True, bn_wd=True
)
else:
raise NotImplementedError
return optimizer
def build_scheduler(optimizer, total_iters_each_epoch, total_epochs, last_epoch, optim_cfg):
decay_steps = [x * total_iters_each_epoch for x in optim_cfg.DECAY_STEP_LIST]
def lr_lbmd(cur_epoch):
cur_decay = 1
for decay_step in decay_steps:
if cur_epoch >= decay_step:
cur_decay = cur_decay * optim_cfg.LR_DECAY
return max(cur_decay, optim_cfg.LR_CLIP / optim_cfg.LR)
lr_warmup_scheduler = None
total_steps = total_iters_each_epoch * total_epochs
if optim_cfg.OPTIMIZER == 'adam_onecycle':
lr_scheduler = OneCycle(
optimizer, total_steps, optim_cfg.LR, list(optim_cfg.MOMS), optim_cfg.DIV_FACTOR, optim_cfg.PCT_START
)
else:
lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lbmd, last_epoch=last_epoch)
if optim_cfg.LR_WARMUP:
lr_warmup_scheduler = CosineWarmupLR(
optimizer, T_max=optim_cfg.WARMUP_EPOCH * len(total_iters_each_epoch),
eta_min=optim_cfg.LR / optim_cfg.DIV_FACTOR
)
return lr_scheduler, lr_warmup_scheduler
# This file is modified from https://github.com/traveller59/second.pytorch
from collections import Iterable
import torch
from torch import nn
from torch.nn.utils import parameters_to_vector
from torch._utils import _unflatten_dense_tensors
bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
def split_bn_bias(layer_groups):
"Split the layers in `layer_groups` into batchnorm (`bn_types`) and non-batchnorm groups."
split_groups = []
for l in layer_groups:
l1, l2 = [], []
for c in l.children():
if isinstance(c, bn_types):
l2.append(c)
else:
l1.append(c)
split_groups += [nn.Sequential(*l1), nn.Sequential(*l2)]
return split_groups
def get_master(layer_groups, flat_master: bool = False):
"Return two lists, one for the model parameters in FP16 and one for the master parameters in FP32."
split_groups = split_bn_bias(layer_groups)
model_params = [[param for param in lg.parameters() if param.requires_grad] for lg in split_groups]
if flat_master:
master_params = []
for lg in model_params:
if len(lg) != 0:
mp = parameters_to_vector([param.data.float() for param in lg])
mp = torch.nn.Parameter(mp, requires_grad=True)
if mp.grad is None: mp.grad = mp.new(*mp.size())
master_params.append([mp])
else:
master_params.append([])
return model_params, master_params
else:
master_params = [[param.clone().float().detach() for param in lg] for lg in model_params]
for mp in master_params:
for param in mp: param.requires_grad = True
return model_params, master_params
def model_g2master_g(model_params, master_params, flat_master: bool = False) -> None:
"Copy the `model_params` gradients to `master_params` for the optimizer step."
if flat_master:
for model_group, master_group in zip(model_params, master_params):
if len(master_group) != 0:
master_group[0].grad.data.copy_(parameters_to_vector([p.grad.data.float() for p in model_group]))
else:
for model_group, master_group in zip(model_params, master_params):
for model, master in zip(model_group, master_group):
if model.grad is not None:
if master.grad is None: master.grad = master.data.new(*master.data.size())
master.grad.data.copy_(model.grad.data)
else:
master.grad = None
def master2model(model_params, master_params, flat_master: bool = False) -> None:
"Copy `master_params` to `model_params`."
if flat_master:
for model_group, master_group in zip(model_params, master_params):
if len(model_group) != 0:
for model, master in zip(model_group, _unflatten_dense_tensors(master_group[0].data, model_group)):
model.data.copy_(master)
else:
for model_group, master_group in zip(model_params, master_params):
for model, master in zip(model_group, master_group): model.data.copy_(master.data)
def listify(p=None, q=None):
"Make `p` listy and the same length as `q`."
if p is None:
p = []
elif isinstance(p, str):
p = [p]
elif not isinstance(p, Iterable):
p = [p]
n = q if type(q) == int else len(p) if q is None else len(q)
if len(p) == 1: p = p * n
assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
return list(p)
def trainable_params(m: nn.Module):
"Return list of trainable params in `m`."
res = filter(lambda p: p.requires_grad, m.parameters())
return res
def is_tuple(x) -> bool: return isinstance(x, tuple)
# copy from fastai.
class OptimWrapper():
"Basic wrapper around `opt` to simplify hyper-parameters changes."
def __init__(self, opt, wd, true_wd: bool = False, bn_wd: bool = True):
self.opt, self.true_wd, self.bn_wd = opt, true_wd, bn_wd
self.opt_keys = list(self.opt.param_groups[0].keys())
self.opt_keys.remove('params')
self.read_defaults()
self.wd = wd
@classmethod
def create(cls, opt_func, lr,
layer_groups, **kwargs):
"Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`."
split_groups = split_bn_bias(layer_groups)
opt = opt_func([{'params': trainable_params(l), 'lr': 0} for l in split_groups])
opt = cls(opt, **kwargs)
opt.lr, opt.opt_func = listify(lr, layer_groups), opt_func
return opt
def new(self, layer_groups):
"Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters."
opt_func = getattr(self, 'opt_func', self.opt.__class__)
split_groups = split_bn_bias(layer_groups)
opt = opt_func([{'params': trainable_params(l), 'lr': 0} for l in split_groups])
return self.create(opt_func, self.lr, layer_groups, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd)
def __repr__(self) -> str:
return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}'
# Pytorch optimizer methods
def step(self) -> None:
"Set weight decay and step optimizer."
# weight decay outside of optimizer step (AdamW)
if self.true_wd:
for lr, wd, pg1, pg2 in zip(self._lr, self._wd, self.opt.param_groups[::2], self.opt.param_groups[1::2]):
for p in pg1['params']:
# When some parameters are fixed: Shaoshuai Shi
if p.requires_grad is False:
continue
p.data.mul_(1 - wd * lr)
if self.bn_wd:
for p in pg2['params']:
# When some parameters are fixed: Shaoshuai Shi
if p.requires_grad is False:
continue
p.data.mul_(1 - wd * lr)
self.set_val('weight_decay', listify(0, self._wd))
self.opt.step()
def zero_grad(self) -> None:
"Clear optimizer gradients."
self.opt.zero_grad()
# Passthrough to the inner opt.
def __getattr__(self, k: str):
return getattr(self.opt, k, None)
def clear(self):
"Reset the state of the inner optimizer."
sd = self.state_dict()
sd['state'] = {}
self.load_state_dict(sd)
# Hyperparameters as properties
@property
def lr(self) -> float:
return self._lr[-1]
@lr.setter
def lr(self, val: float) -> None:
self._lr = self.set_val('lr', listify(val, self._lr))
@property
def mom(self) -> float:
return self._mom[-1]
@mom.setter
def mom(self, val: float) -> None:
if 'momentum' in self.opt_keys:
self.set_val('momentum', listify(val, self._mom))
elif 'betas' in self.opt_keys:
self.set_val('betas', (listify(val, self._mom), self._beta))
self._mom = listify(val, self._mom)
@property
def beta(self) -> float:
return None if self._beta is None else self._beta[-1]
@beta.setter
def beta(self, val: float) -> None:
"Set beta (or alpha as makes sense for given optimizer)."
if val is None: return
if 'betas' in self.opt_keys:
self.set_val('betas', (self._mom, listify(val, self._beta)))
elif 'alpha' in self.opt_keys:
self.set_val('alpha', listify(val, self._beta))
self._beta = listify(val, self._beta)
@property
def wd(self) -> float:
return self._wd[-1]
@wd.setter
def wd(self, val: float) -> None:
"Set weight decay."
if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd)
self._wd = listify(val, self._wd)
# Helper functions
def read_defaults(self) -> None:
"Read the values inside the optimizer for the hyper-parameters."
self._beta = None
if 'lr' in self.opt_keys: self._lr = self.read_val('lr')
if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum')
if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha')
if 'betas' in self.opt_keys: self._mom, self._beta = self.read_val('betas')
if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay')
def set_val(self, key: str, val, bn_groups: bool = True):
"Set `val` inside the optimizer dictionary at `key`."
if is_tuple(val): val = [(v1, v2) for v1, v2 in zip(*val)]
for v, pg1, pg2 in zip(val, self.opt.param_groups[::2], self.opt.param_groups[1::2]):
pg1[key] = v
if bn_groups: pg2[key] = v
return val
def read_val(self, key: str):
"Read a hyperparameter `key` in the optimizer dictionary."
val = [pg[key] for pg in self.opt.param_groups[::2]]
if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val]
return val
class FastAIMixedOptim(OptimWrapper):
@classmethod
def create(cls, opt_func, lr,
layer_groups, model, flat_master=False, loss_scale=512.0, **kwargs):
"Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`."
opt = OptimWrapper.create(opt_func, lr, layer_groups, **kwargs)
opt.model_params, opt.master_params = get_master(layer_groups, flat_master)
opt.flat_master = flat_master
opt.loss_scale = loss_scale
opt.model = model
# Changes the optimizer so that the optimization step is done in FP32.
# opt = self.learn.opt
mom, wd, beta = opt.mom, opt.wd, opt.beta
lrs = [lr for lr in opt._lr for _ in range(2)]
opt_params = [{'params': mp, 'lr': lr} for mp, lr in zip(opt.master_params, lrs)]
opt.opt = opt_func(opt_params)
opt.mom, opt.wd, opt.beta = mom, wd, beta
return opt
def step(self):
model_g2master_g(self.model_params, self.master_params, self.flat_master)
for group in self.master_params:
for param in group: param.grad.div_(self.loss_scale)
super(FastAIMixedOptim, self).step()
self.model.zero_grad()
# Update the params from master to model.
master2model(self.model_params, self.master_params, self.flat_master)
# This file is modified from https://github.com/traveller59/second.pytorch
import numpy as np
import math
from functools import partial
import torch.optim.lr_scheduler as lr_sched
from .fastai_optim import OptimWrapper
class LRSchedulerStep(object):
def __init__(self, fai_optimizer: OptimWrapper, total_step, lr_phases,
mom_phases):
# if not isinstance(fai_optimizer, OptimWrapper):
# raise TypeError('{} is not a fastai OptimWrapper'.format(
# type(fai_optimizer).__name__))
self.optimizer = fai_optimizer
self.total_step = total_step
self.lr_phases = []
for i, (start, lambda_func) in enumerate(lr_phases):
if len(self.lr_phases) != 0:
assert self.lr_phases[-1][0] < start
if isinstance(lambda_func, str):
lambda_func = eval(lambda_func)
if i < len(lr_phases) - 1:
self.lr_phases.append((int(start * total_step), int(lr_phases[i + 1][0] * total_step), lambda_func))
else:
self.lr_phases.append((int(start * total_step), total_step, lambda_func))
assert self.lr_phases[0][0] == 0
self.mom_phases = []
for i, (start, lambda_func) in enumerate(mom_phases):
if len(self.mom_phases) != 0:
assert self.mom_phases[-1][0] < start
if isinstance(lambda_func, str):
lambda_func = eval(lambda_func)
if i < len(mom_phases) - 1:
self.mom_phases.append((int(start * total_step), int(mom_phases[i + 1][0] * total_step), lambda_func))
else:
self.mom_phases.append((int(start * total_step), total_step, lambda_func))
assert self.mom_phases[0][0] == 0
def step(self, step):
for start, end, func in self.lr_phases:
if step >= start:
self.optimizer.lr = func((step - start) / (end - start))
for start, end, func in self.mom_phases:
if step >= start:
self.optimizer.mom = func((step - start) / (end - start))
def annealing_cos(start, end, pct):
# print(pct, start, end)
"Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
cos_out = np.cos(np.pi * pct) + 1
return end + (start - end) / 2 * cos_out
class OneCycle(LRSchedulerStep):
def __init__(self, fai_optimizer, total_step, lr_max, moms, div_factor,
pct_start):
self.lr_max = lr_max
self.moms = moms
self.div_factor = div_factor
self.pct_start = pct_start
a1 = int(total_step * self.pct_start)
a2 = total_step - a1
low_lr = self.lr_max / self.div_factor
lr_phases = ((0, partial(annealing_cos, low_lr, self.lr_max)),
(self.pct_start,
partial(annealing_cos, self.lr_max, low_lr / 1e4)))
mom_phases = ((0, partial(annealing_cos, *self.moms)),
(self.pct_start, partial(annealing_cos,
*self.moms[::-1])))
fai_optimizer.lr, fai_optimizer.mom = low_lr, self.moms[0]
super().__init__(fai_optimizer, total_step, lr_phases, mom_phases)
class CosineWarmupLR(lr_sched._LRScheduler):
def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1):
self.T_max = T_max
self.eta_min = eta_min
super(CosineWarmupLR, self).__init__(optimizer, last_epoch)
def get_lr(self):
return [self.eta_min + (base_lr - self.eta_min) *
(1 - math.cos(math.pi * self.last_epoch / self.T_max)) / 2
for base_lr in self.base_lrs]
class FakeOptim:
def __init__(self):
self.lr = 0
self.mom = 0
if __name__ == "__main__":
import matplotlib.pyplot as plt
opt = FakeOptim() # 3e-3, wd=0.4, div_factor=10
schd = OneCycle(opt, 100, 3e-3, (0.95, 0.85), 10.0, 0.1)
lrs = []
moms = []
for i in range(100):
schd.step(i)
lrs.append(opt.lr)
moms.append(opt.mom)
plt.plot(lrs)
# plt.plot(moms)
plt.show()
plt.plot(moms)
plt.show()
import torch
import os
import glob
import tqdm
from torch.nn.utils import clip_grad_norm_
def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, accumulated_iter, optim_cfg,
rank, tbar, total_it_each_epoch, dataloader_iter, tb_log=None, leave_pbar=False):
if total_it_each_epoch == len(train_loader):
dataloader_iter = iter(train_loader)
if rank == 0:
pbar = tqdm.tqdm(total=total_it_each_epoch, leave=leave_pbar, desc='train', dynamic_ncols=True)
for cur_it in range(total_it_each_epoch):
try:
batch = next(dataloader_iter)
except StopIteration:
dataloader_iter = iter(train_loader)
batch = next(dataloader_iter)
print('new iters')
lr_scheduler.step(accumulated_iter)
try:
cur_lr = float(optimizer.lr)
except:
cur_lr = optimizer.param_groups[0]['lr']
if tb_log is not None:
tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
model.train()
optimizer.zero_grad()
loss, tb_dict, disp_dict = model_func(model, batch)
loss.backward()
clip_grad_norm_(model.parameters(), optim_cfg.GRAD_NORM_CLIP)
optimizer.step()
accumulated_iter += 1
disp_dict.update({'loss': loss.item(), 'lr': cur_lr})
# log to console and tensorboard
if rank == 0:
pbar.update()
pbar.set_postfix(dict(total_it=accumulated_iter))
tbar.set_postfix(disp_dict)
tbar.refresh()
if tb_log is not None:
tb_log.add_scalar('train_loss', loss, accumulated_iter)
tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
for key, val in tb_dict.items():
tb_log.add_scalar('train_' + key, val, accumulated_iter)
if rank == 0:
pbar.close()
return accumulated_iter
def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_cfg,
start_epoch, total_epochs, start_iter, rank, tb_log, ckpt_save_dir, train_sampler=None,
lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50,
merge_all_iters_to_one_epoch=False):
accumulated_iter = start_iter
with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar:
total_it_each_epoch = len(train_loader)
if merge_all_iters_to_one_epoch:
assert hasattr(train_loader.dataset, 'merge_all_iters_to_one_epoch')
train_loader.dataset.merge_all_iters_to_one_epoch(merge=True, epochs=total_epochs)
total_it_each_epoch = len(train_loader) // max(total_epochs, 1)
dataloader_iter = iter(train_loader)
for cur_epoch in tbar:
if train_sampler is not None:
train_sampler.set_epoch(cur_epoch)
# train one epoch
if lr_warmup_scheduler is not None and cur_epoch < optim_cfg.WARMUP_EPOCH:
cur_scheduler = lr_warmup_scheduler
else:
cur_scheduler = lr_scheduler
accumulated_iter = train_one_epoch(
model, optimizer, train_loader, model_func,
lr_scheduler=cur_scheduler,
accumulated_iter=accumulated_iter, optim_cfg=optim_cfg,
rank=rank, tbar=tbar, tb_log=tb_log,
leave_pbar=(cur_epoch + 1 == total_epochs),
total_it_each_epoch=total_it_each_epoch,
dataloader_iter=dataloader_iter
)
# save trained model
trained_epoch = cur_epoch + 1
if trained_epoch % ckpt_save_interval == 0 and rank == 0:
ckpt_list = glob.glob(str(ckpt_save_dir / 'checkpoint_epoch_*.pth'))
ckpt_list.sort(key=os.path.getmtime)
if ckpt_list.__len__() >= max_ckpt_save_num:
for cur_file_idx in range(0, len(ckpt_list) - max_ckpt_save_num + 1):
os.remove(ckpt_list[cur_file_idx])
ckpt_name = ckpt_save_dir / ('checkpoint_epoch_%d' % trained_epoch)
save_checkpoint(
checkpoint_state(model, optimizer, trained_epoch, accumulated_iter), filename=ckpt_name,
)
def model_state_to_cpu(model_state):
model_state_cpu = type(model_state)() # ordered dict
for key, val in model_state.items():
model_state_cpu[key] = val.cpu()
return model_state_cpu
def checkpoint_state(model=None, optimizer=None, epoch=None, it=None):
optim_state = optimizer.state_dict() if optimizer is not None else None
if model is not None:
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
model_state = model_state_to_cpu(model.module.state_dict())
else:
model_state = model.state_dict()
else:
model_state = None
try:
import pcdet
version = 'pcdet+' + pcdet.__version__
except:
version = 'none'
return {'epoch': epoch, 'it': it, 'model_state': model_state, 'optimizer_state': optim_state, 'version': version}
def save_checkpoint(state, filename='checkpoint'):
if False and 'optimizer_state' in state:
optimizer_state = state['optimizer_state']
state.pop('optimizer_state', None)
optimizer_filename = '{}_optim.pth'.format(filename)
torch.save({'optimizer_state': optimizer_state}, optimizer_filename)
filename = '{}.pth'.format(filename)
torch.save(state, filename)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment