"vscode:/vscode.git/clone" did not exist on "77a268128ab0658d6ff0d5d80ee7f8b6e8e75923"
run.py 16.5 KB
Newer Older
gaotongxiao's avatar
gaotongxiao committed
1
2
3
4
5
6
7
8
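"""Entry point for launching OpenCompass evaluation tasks.

Example invocations (the config path below is illustrative; substitute your
own config file):

    # run inference and evaluation locally
    python run.py configs/eval_demo.py

    # run on a Slurm cluster (requires a partition name)
    python run.py configs/eval_demo.py --slurm -p <partition>

    # reuse the latest outputs under work_dir and only re-run evaluation
    python run.py configs/eval_demo.py -r -m eval
"""
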
import argparse
import getpass
import os
import os.path as osp
from datetime import datetime

from mmengine.config import Config

from opencompass.partitioners import (MultimodalNaivePartitioner,
                                      NaivePartitioner, SizePartitioner)
from opencompass.registry import PARTITIONERS, RUNNERS
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.utils import LarkReporter, Summarizer, get_logger


def parse_args():
    parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Evaluation config file path')
    # add mutually exclusive args `--slurm` and `--dlc`; the local runner is
    # used by default if "infer" or "eval" is not specified in the config
    launch_method = parser.add_mutually_exclusive_group()
    launch_method.add_argument('--slurm',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run with srun. '
                               'If True, `--partition(-p)` must be set. '
                               'Defaults to False')
    launch_method.add_argument('--dlc',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run on dlc. If '
                               'True, `--aliyun-cfg` must be set. Defaults'
                               ' to False')
    # add general args
    parser.add_argument('--debug',
                        help='Debug mode, in which the scheduler will run '
                        'tasks in a single process, and the output will not '
                        'be redirected to files',
                        action='store_true',
                        default=False)
    parser.add_argument('--mm-eval',
                        help='Whether to enable multimodal evaluation',
                        action='store_true',
                        default=False)
    parser.add_argument('--dry-run',
                        help='Dry run mode, in which the scheduler will not '
                        'actually run the tasks, but only print the commands '
                        'to run',
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--mode',
                        help='Running mode. You can choose "infer" if you '
                        'only want the inference results, or "eval" if you '
                        'already have the results and want to evaluate them, '
                        'or "viz" if you want to visualize the results.',
                        choices=['all', 'infer', 'eval', 'viz'],
                        default='all',
                        type=str)
    parser.add_argument('-r',
                        '--reuse',
                        nargs='?',
                        type=str,
                        const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs present in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument can '
                        'also be a specific timestamp, e.g. 20230516_144254')
    parser.add_argument('-w',
                        '--work-dir',
                        help='Work path, all the outputs will be '
                        'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
                        'If not specified, the work_dir will be set to '
                        './outputs/default.',
                        default=None,
                        type=str)
    parser.add_argument('-l',
                        '--lark',
                        help='Report the running status to lark bot',
                        action='store_true',
                        default=False)
    parser.add_argument('--max-partition-size',
                        help='The maximum size of an infer task. Only '
                        'effective when "infer" is missing from the config.',
                        type=int,
                        default=2000)
    parser.add_argument(
        '--gen-task-coef',
        help='The dataset cost measurement coefficient for generation tasks. '
        'Only effective when "infer" is missing from the config.',
        type=int,
        default=20)
    parser.add_argument('--max-num-workers',
                        help='Max number of workers to run in parallel. '
                        'Will be overridden by the "max_num_workers" argument '
                        'in the config.',
                        type=int,
                        default=32)
    parser.add_argument('--max-workers-per-gpu',
                        help='Max number of tasks to run in parallel on one '
                        'GPU. It will only be used in the local runner.',
                        type=int,
                        default=32)
    parser.add_argument(
        '--retry',
        help='Number of retries if the job failed when using slurm or dlc. '
        'Will be overridden by the "retry" argument in the config.',
        type=int,
        default=2)
    # set srun args
    slurm_parser = parser.add_argument_group('slurm_args')
    parse_slurm_args(slurm_parser)
    # set dlc args
    dlc_parser = parser.add_argument_group('dlc_args')
    parse_dlc_args(dlc_parser)
    args = parser.parse_args()
    if args.slurm:
        assert args.partition is not None, (
            '--partition(-p) must be set if you want to use slurm')
    if args.dlc:
        # expand "~" so the default "~/.aliyun.cfg" path resolves correctly
        args.aliyun_cfg = osp.expanduser(args.aliyun_cfg)
        assert osp.exists(args.aliyun_cfg), (
            'When launching tasks using dlc, the Aliyun config must exist '
            'in "~/.aliyun.cfg", or be specified with '
            '"--aliyun-cfg <config_path>" to use another path.')
    return args


def parse_slurm_args(slurm_parser):
    """These args are all for slurm launch."""
    slurm_parser.add_argument('-p',
                              '--partition',
                              help='Slurm partition name',
                              default=None,
                              type=str)
    slurm_parser.add_argument('-q',
                              '--quotatype',
                              help='Slurm quota type',
                              default=None,
                              type=str)
    slurm_parser.add_argument('--qos',
                              help='Slurm quality of service',
                              default=None,
                              type=str)


def parse_dlc_args(dlc_parser):
    """These args are all for dlc launch."""
    dlc_parser.add_argument('--aliyun-cfg',
                            help='The path to the Aliyun config file',
                            default='~/.aliyun.cfg',
                            type=str)


def main():
    args = parse_args()
    if args.dry_run:
        args.debug = True
    # initialize logger
    logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')

    cfg = Config.fromfile(args.config, format_python_code=False)
    if args.work_dir is not None:
        cfg['work_dir'] = args.work_dir
    else:
        cfg.setdefault('work_dir', './outputs/default/')

    # cfg_time_str defaults to the current time
    cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if args.reuse:
        if args.reuse == 'latest':
            if not os.path.exists(cfg.work_dir) or not os.listdir(
                    cfg.work_dir):
                logger.warning('No previous results to reuse!')
            else:
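                # run dirs are named by timestamp (%Y%m%d_%H%M%S), so the
                # lexicographically largest entry is the most recent run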
                dirs = os.listdir(cfg.work_dir)
                dir_time_str = sorted(dirs)[-1]
        else:
            dir_time_str = args.reuse
        logger.info(f'Reusing experiments from {dir_time_str}')
    elif args.mode in ['eval', 'viz']:
        raise ValueError('You must specify -r or --reuse when running in eval '
                         'or viz mode!')

    # update "actual" work_dir
    cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
    os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)

    # dump config
    output_config_path = osp.join(cfg.work_dir, 'configs',
                                  f'{cfg_time_str}.py')
    cfg.dump(output_config_path)
    # The config is intentionally reloaded from the dumped file, since
    # already-initialized types cannot be serialized
    cfg = Config.fromfile(output_config_path, format_python_code=False)

    # report to lark bot if --lark is specified
    if not args.lark:
        cfg['lark_bot_url'] = None
    elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()}\'s task has been launched!'
        LarkReporter(cfg['lark_bot_url']).post(content)

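    # inference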
    if args.mode in ['all', 'infer']:
        # When the user has specified --slurm or --dlc, or has not set
        # "infer" in the config, we provide a default configuration
        # for infer
        if (args.dlc or args.slurm) and cfg.get('infer', None):
            logger.warning('You have set "infer" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "infer" configuration will be overridden by '
                           'your runtime arguments.')
        # Check whether to run multimodal evaluation
        if args.mm_eval:
            partitioner = MultimodalNaivePartitioner(
                osp.join(cfg['work_dir'], 'predictions/'))
            tasks = partitioner(cfg)
            exec_mm_infer_runner(tasks, args, cfg)
            return
        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
            # Use SizePartitioner to split into subtasks
            partitioner = SizePartitioner(
                osp.join(cfg['work_dir'], 'predictions/'),
                max_task_size=args.max_partition_size,
                gen_task_coef=args.gen_task_coef)
            tasks = partitioner(cfg)
            if args.dry_run:
                return
            # execute the infer subtasks
            exec_infer_runner(tasks, args, cfg)
        # If they have specified "infer" in config and haven't used --slurm
        # or --dlc, just follow the config
        else:
            if args.partition is not None:
                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                    cfg.infer.runner.partition = args.partition
                    cfg.infer.runner.quotatype = args.quotatype
                else:
                    logger.warning('SlurmRunner is not used, so the partition '
                                   'argument is ignored.')
            if args.debug:
                cfg.infer.runner.debug = True
            if args.lark:
                cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
            cfg.infer.partitioner['out_dir'] = osp.join(
                cfg['work_dir'], 'predictions/')
            partitioner = PARTITIONERS.build(cfg.infer.partitioner)
            tasks = partitioner(cfg)
            if args.dry_run:
                return
            runner = RUNNERS.build(cfg.infer.runner)
            runner(tasks)

    # evaluate
    if args.mode in ['all', 'eval']:
        # When the user has specified --slurm or --dlc, or has not set
        # "eval" in the config, we provide a default configuration
        # for eval
        if (args.dlc or args.slurm) and cfg.get('eval', None):
            logger.warning('You have set "eval" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')
        if args.dlc or args.slurm or cfg.get('eval', None) is None:
            # Use NaivePartitioner, which does not split tasks
            partitioner = NaivePartitioner(
                osp.join(cfg['work_dir'], 'results/'))
            tasks = partitioner(cfg)
            if args.dry_run:
                return
            # execute the eval tasks
            exec_eval_runner(tasks, args, cfg)
        # If they have specified "eval" in config and haven't used --slurm
        # or --dlc, just follow the config
        else:
            if args.partition is not None:
                if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
                    cfg.eval.runner.partition = args.partition
                    cfg.eval.runner.quotatype = args.quotatype
                else:
                    logger.warning('SlurmRunner is not used, so the partition '
                                   'argument is ignored.')
            if args.debug:
                cfg.eval.runner.debug = True
            if args.lark:
                cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
            cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
                                                       'results/')
            partitioner = PARTITIONERS.build(cfg.eval.partitioner)
            tasks = partitioner(cfg)
            if args.dry_run:
                return
            runner = RUNNERS.build(cfg.eval.runner)
            runner(tasks)

    # visualize
    if args.mode in ['all', 'eval', 'viz']:
        summarizer = Summarizer(cfg)
        summarizer.summarize(time_str=cfg_time_str)


def exec_mm_infer_runner(tasks, args, cfg):
    """execute multimodal infer runner according to args."""
    if args.slurm:
        runner = SlurmRunner(dict(type='MultimodalInferTask'),
                             max_num_workers=args.max_num_workers,
                             partition=args.partition,
                             quotatype=args.quotatype,
                             retry=args.retry,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    elif args.dlc:
        raise NotImplementedError('Currently, we do not support evaluating '
                                  'multimodal models on dlc.')
    else:
        runner = LocalRunner(task=dict(type='MultimodalInferTask'),
                             max_num_workers=args.max_num_workers,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    runner(tasks)


def exec_infer_runner(tasks, args, cfg):
    """execute infer runner according to args."""
    if args.slurm:
        runner = SlurmRunner(dict(type='OpenICLInferTask'),
                             max_num_workers=args.max_num_workers,
                             partition=args.partition,
                             quotatype=args.quotatype,
                             qos=args.qos,
                             retry=args.retry,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    elif args.dlc:
        runner = DLCRunner(dict(type='OpenICLInferTask'),
                           max_num_workers=args.max_num_workers,
                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
                           retry=args.retry,
                           debug=args.debug,
                           lark_bot_url=cfg['lark_bot_url'])
    else:
        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
                             max_num_workers=args.max_num_workers,
                             max_workers_per_gpu=args.max_workers_per_gpu,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    runner(tasks)


def exec_eval_runner(tasks, args, cfg):
    """execute infer runner according to args."""
    if args.slurm:
        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
                             max_num_workers=args.max_num_workers,
                             partition=args.partition,
                             quotatype=args.quotatype,
                             qos=args.qos,
                             retry=args.retry,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    elif args.dlc:
        runner = DLCRunner(dict(type='OpenICLEvalTask'),
                           max_num_workers=args.max_num_workers,
                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
                           retry=args.retry,
                           debug=args.debug,
                           lark_bot_url=cfg['lark_bot_url'])
    else:
        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
                             max_num_workers=args.max_num_workers,
                             debug=args.debug,
                             lark_bot_url=cfg['lark_bot_url'])
    runner(tasks)


if __name__ == '__main__':
    main()