train.py 7.39 KB
Newer Older
1
# Copyright (c) OpenMMLab. All rights reserved.
unknown's avatar
unknown committed
2
3
4
5
6
import argparse
import copy
import os
import os.path as osp
import time
7
import warnings
unknown's avatar
unknown committed
8
9
10

import mmcv
import torch
11
import torch.distributed as dist
unknown's avatar
unknown committed
12
13
14
15
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist

from mmcls import __version__
16
from mmcls.apis import init_random_seed, set_random_seed, train_model
unknown's avatar
unknown committed
17
18
from mmcls.datasets import build_dataset
from mmcls.models import build_classifier
19
20
from mmcls.utils import (auto_select_device, collect_env, get_root_logger,
                         setup_multi_processes)
unknown's avatar
unknown committed
21
22
23
24
25
26
27
28
29
30
31
32
33


def parse_args():
    """Build the command-line parser for the training script and parse argv.

    Besides parsing, this exports ``LOCAL_RANK`` into ``os.environ`` (taken
    from ``--local_rank``) when the variable is not already set.

    Returns:
        argparse.Namespace: The parsed command-line options.
    """
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')

    # --device, --gpus, --gpu-ids and --gpu-id are mutually exclusive.
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--device', help='device used for training. (Deprecated)')
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='(Deprecated, please use --gpu-id) number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='(Deprecated, please use --gpu-id) ids of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='id of gpu to use '
        '(only applicable to non-distributed training)')

    parser.add_argument(
        '--ipu-replicas',
        type=int,
        default=None,
        help='num of ipu replicas to use')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--diff-seed',
        action='store_true',
        help='Whether or not set different seeds for different ranks')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Defaults are plain ints so they match the declared ``type=int``.
    # NOTE(review): these two options are not read anywhere else in the
    # visible part of this script, and a default rank equal to the default
    # world size looks like a placeholder - confirm the intended values.
    parser.add_argument(
        '--world_size',
        type=int,
        default=128,
        help='world_size')
    parser.add_argument(
        '--rank',
        type=int,
        default=128,
        help='rank')
    parser.add_argument('--local_rank', type=int, default=0)

    args = parser.parse_args()
    # Keep an existing LOCAL_RANK untouched; otherwise publish the value
    # passed on the command line.
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))

    return args


def main():
    """Entry point: load the config, set up the run and start training.

    Steps, in order: parse CLI options, load and override the config,
    resolve the work directory and GPU/device selection, optionally
    initialise the distributed environment, create the work directory and
    logger, seed the random number generators, build the classifier and the
    dataset(s), and finally hand everything to ``train_model``.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # set multi-process settings
    setup_multi_processes(cfg)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    # Resolve the GPU list; the deprecated flags are reduced to one GPU.
    if args.gpus is not None:
        cfg.gpu_ids = range(1)
        warnings.warn('`--gpus` is deprecated because we only support '
                      'single GPU mode in non-distributed training. '
                      'Use `gpus=1` now.')
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed training. Use the first GPU '
                      'in `gpu_ids` now.')
    if args.gpus is None and args.gpu_ids is None:
        cfg.gpu_ids = [args.gpu_id]

    # Requesting IPU replicas forces the device to 'ipu'.
    if args.ipu_replicas is not None:
        cfg.ipu_replicas = args.ipu_replicas
        args.device = 'ipu'

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # dump config
    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    cfg.device = args.device or auto_select_device()
    seed = init_random_seed(args.seed, device=cfg.device)
    if args.diff_seed:
        # Take the rank from get_dist_info() rather than
        # torch.distributed.get_rank(): the latter raises when no process
        # group has been initialised (i.e. with `--launcher none`).
        rank, _ = get_dist_info()
        seed = seed + rank
    logger.info(f'Set random seed to {seed}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(seed, deterministic=args.deterministic)
    cfg.seed = seed
    meta['seed'] = seed

    model = build_classifier(cfg.model)
    model.init_weights()

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        # A two-stage workflow also needs the val split, built with the
        # training pipeline.
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))

    # save mmcls version, config file content and class names in
    # runner as meta data
    meta.update(
        dict(
            mmcls_version=__version__,
            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES))

    # launch training
    train_model(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        device=cfg.device,
        meta=meta)


# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()