train.py 4.77 KB
Newer Older
dingchang's avatar
dingchang committed
1
# Copyright (c) OpenMMLab. All rights reserved.
zhangwenwei's avatar
zhangwenwei committed
2
import argparse
ChaimZhu's avatar
add amp  
ChaimZhu committed
3
import logging
zhangwenwei's avatar
zhangwenwei committed
4
import os
5
import os.path as osp
6

7
from mmengine.config import Config, DictAction
ChaimZhu's avatar
add amp  
ChaimZhu committed
8
from mmengine.logging import print_log
9
from mmengine.registry import RUNNERS
10
from mmengine.runner import Runner
zhangwenwei's avatar
zhangwenwei committed
11

12
from mmdet3d.utils import replace_ceph_backend
13

zhangwenwei's avatar
zhangwenwei committed
14
15

def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Besides parsing, this exports the parsed local rank to the
    ``LOCAL_RANK`` environment variable when that variable is not
    already set.

    Returns:
        argparse.Namespace: The parsed command-line arguments.
    """
    resume_help = ('If specify checkpoint path, resume from it, while if not '
                   'specify, try to auto resume from the latest checkpoint '
                   'in the work directory.')
    cfg_options_help = (
        'override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')

    cli = argparse.ArgumentParser(description='Train a 3D detector')
    cli.add_argument('config', help='train config file path')
    cli.add_argument('--work-dir', help='the dir to save logs and models')
    # `store_true` flags already default to False.
    cli.add_argument(
        '--amp',
        action='store_true',
        help='enable automatic-mixed-precision training')
    cli.add_argument(
        '--auto-scale-lr',
        action='store_true',
        help='enable automatically scaling LR.')
    # A bare `--resume` yields the sentinel 'auto'; `--resume PATH` yields
    # PATH; omitting the flag yields None.
    cli.add_argument(
        '--resume', nargs='?', type=str, const='auto', help=resume_help)
    cli.add_argument(
        '--ceph', action='store_true', help='Use ceph as data storage backend')
    cli.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help=cfg_options_help)
    cli.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
    # will pass the `--local-rank` parameter to `tools/train.py` instead
    # of `--local_rank`.
    cli.add_argument('--local_rank', '--local-rank', type=int, default=0)

    parsed = cli.parse_args()
    # Only fill in LOCAL_RANK when the environment has not provided it.
    os.environ.setdefault('LOCAL_RANK', str(parsed.local_rank))
    return parsed


def main():
    """Assemble the training config from the CLI arguments and run training.

    The config file is loaded first, then adjusted in this order: optional
    ceph backend replacement, launcher, ``--cfg-options`` overrides, work
    directory, automatic mixed precision, automatic LR scaling and resume
    settings. Finally a runner is built from the config and ``train()`` is
    called on it.

    Raises:
        RuntimeError: If ``--amp`` is given but the configured optimizer
            wrapper type is neither ``OptimWrapper`` nor
            ``AmpOptimWrapper``, or if ``--auto-scale-lr`` is given but the
            config lacks ``auto_scale_lr``, ``auto_scale_lr.enable`` or
            ``auto_scale_lr.base_batch_size``.
    """
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)

    # TODO: We will unify the ceph support approach with other OpenMMLab repos
    if args.ceph:
        cfg = replace_ceph_backend(cfg)

    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp:
        optim_wrapper = cfg.optim_wrapper.type
        if optim_wrapper == 'AmpOptimWrapper':
            print_log(
                'AMP training is already enabled in your config.',
                logger='current',
                level=logging.WARNING)
        elif optim_wrapper == 'OptimWrapper':
            cfg.optim_wrapper.type = 'AmpOptimWrapper'
            cfg.optim_wrapper.loss_scale = 'dynamic'
        else:
            # Raise explicitly rather than `assert`: asserts are stripped
            # under `python -O`, which would let an unsupported wrapper type
            # be silently overwritten with `AmpOptimWrapper`.
            raise RuntimeError(
                '`--amp` is only supported when the optimizer wrapper type is '
                f'`OptimWrapper` but got {optim_wrapper}.')

    # enable automatically scaling LR
    if args.auto_scale_lr:
        if 'auto_scale_lr' in cfg and \
                'enable' in cfg.auto_scale_lr and \
                'base_batch_size' in cfg.auto_scale_lr:
            cfg.auto_scale_lr.enable = True
        else:
            raise RuntimeError('Can not find "auto_scale_lr" or '
                               '"auto_scale_lr.enable" or '
                               '"auto_scale_lr.base_batch_size" in your'
                               ' configuration file.')

    # resume is determined in this priority: resume from > auto_resume
    if args.resume == 'auto':
        # Bare `--resume`: enable resuming and clear any configured
        # checkpoint path.
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        # `--resume PATH`: resume from the given checkpoint.
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # start training
    runner.train()
zhangwenwei's avatar
zhangwenwei committed
132
133
134
135


# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()