[Sync] Use finally to clean up temp files (#337)

ce65d339 · Tong Gao · GitHub · 2cd994c3 · ce65d339 · ce65d339
Unverified commit ce65d339 — authored Sep 04, 2023 by Tong Gao; committed Sep 04, 2023 by GitHub
5 changed files
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -86,65 +86,70 @@ class DLCRunner(BaseRunner):
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_params.py'
-        task_cfg.dump(param_file)
+        try:
+            task_cfg.dump(param_file)
-        # Build up DLC command
-        pwd = os.getcwd()
+            # Build up DLC command
-        shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
+            pwd = os.getcwd()
-                     f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+            shell_cmd = (
-                     f'cd {pwd}; '
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
-                     '{task_cmd}')
+                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+                f'cd {pwd}; '
-        tmpl = ('dlc create job'
+                '{task_cmd}')
-                f" --command '{shell_cmd}'"
-                f' --name {task_name[:512]}'
+            tmpl = ('dlc create job'
-                ' --kind BatchJob'
+                    f" --command '{shell_cmd}'"
-                f" -c {self.aliyun_cfg['dlc_config_path']}"
+                    f' --name {task_name[:512]}'
-                f" --workspace_id {self.aliyun_cfg['workspace_id']}"
+                    ' --kind BatchJob'
-                ' --worker_count 1'
+                    f" -c {self.aliyun_cfg['dlc_config_path']}"
-                f' --worker_cpu {max(num_gpus * 6, 8)}'
+                    f" --workspace_id {self.aliyun_cfg['workspace_id']}"
-                f' --worker_gpu {num_gpus}'
+                    ' --worker_count 1'
-                f' --worker_memory {max(num_gpus * 32, 48)}'
+                    f' --worker_cpu {max(num_gpus * 6, 8)}'
-                f" --worker_image {self.aliyun_cfg['worker_image']}"
+                    f' --worker_gpu {num_gpus}'
-                ' --interactive')
+                    f' --worker_memory {max(num_gpus * 32, 48)}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+                    f" --worker_image {self.aliyun_cfg['worker_image']}"
-        cmd = get_cmd()
+                    ' --interactive')
+            get_cmd = partial(task.get_command,
-        logger = get_logger()
+                              cfg_path=param_file,
-        logger.debug(f'Running command: {cmd}')
+                              template=tmpl)
+            cmd = get_cmd()
-        # Run command with retry
-        if self.debug:
+            logger = get_logger()
-            stdout = None
+            logger.debug(f'Running command: {cmd}')
-        else:
-            out_path = task.get_log_path(file_extension='out')
+            # Run command with retry
-            mmengine.mkdir_or_exist(osp.split(out_path)[0])
+            if self.debug:
-            stdout = open(out_path, 'w', encoding='utf-8')
+                stdout = None
+            else:
-        if random_sleep:
+                out_path = task.get_log_path(file_extension='out')
-            time.sleep(random.randint(0, 10))
+                mmengine.mkdir_or_exist(osp.split(out_path)[0])
-        result = subprocess.run(cmd,
+                stdout = open(out_path, 'w', encoding='utf-8')
-                                shell=True,
-                                text=True,
-                                stdout=stdout,
-                                stderr=stdout)
-        retry = self.retry
-        output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
-            retry -= 1
            if random_sleep:
                time.sleep(random.randint(0, 10))
-            # Re-generate command to refresh ports.
-            cmd = get_cmd()
            result = subprocess.run(cmd,
                                    shell=True,
                                    text=True,
                                    stdout=stdout,
                                    stderr=stdout)
-        # Clean up
+            retry = self.retry
-        os.remove(param_file)
+            output_paths = task.get_output_paths()
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
+                retry -= 1
+                if random_sleep:
+                    time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
+                result = subprocess.run(cmd,
+                                        shell=True,
+                                        text=True,
+                                        stdout=stdout,
+                                        stderr=stdout)
+        finally:
+            # Clean up
+            os.remove(param_file)
        return task_name, result.returncode
    def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:

--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@@ -62,15 +62,17 @@ class LocalRunner(BaseRunner):
                # get cmd
                mmengine.mkdir_or_exist('tmp/')
                param_file = f'tmp/{os.getpid()}_params.py'
-                task.cfg.dump(param_file)
+                try:
-                cmd = task.get_command(cfg_path=param_file,
+                    task.cfg.dump(param_file)
-                                       template='{task_cmd}')
+                    cmd = task.get_command(cfg_path=param_file,
-                # run in subprocess if starts with torchrun etc.
+                                           template='{task_cmd}')
-                if cmd.startswith('python'):
+                    # run in subprocess if starts with torchrun etc.
-                    task.run()
+                    if cmd.startswith('python'):
-                else:
+                        task.run()
-                    subprocess.run(cmd, shell=True, text=True)
+                    else:
-                os.remove(param_file)
+                        subprocess.run(cmd, shell=True, text=True)
+                finally:
+                    os.remove(param_file)
                status.append((task_name, 0))
        else:
            import torch
@@ -141,31 +143,34 @@ class LocalRunner(BaseRunner):
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_{index}_params.py'
-        task.cfg.dump(param_file)
+        try:
+            task.cfg.dump(param_file)
-        # Build up slurm command
-        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+            # Build up slurm command
-        tmpl += ' {task_cmd}'
+            tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            tmpl += ' {task_cmd}'
-        cmd = get_cmd()
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
-        logger = get_logger()
+                              template=tmpl)
-        logger.debug(f'Running command: {cmd}')
+            cmd = get_cmd()
-        # Run command
+            logger = get_logger()
-        out_path = task.get_log_path(file_extension='out')
+            logger.debug(f'Running command: {cmd}')
-        mmengine.mkdir_or_exist(osp.split(out_path)[0])
-        stdout = open(out_path, 'w', encoding='utf-8')
+            # Run command
+            out_path = task.get_log_path(file_extension='out')
-        result = subprocess.run(cmd,
+            mmengine.mkdir_or_exist(osp.split(out_path)[0])
-                                shell=True,
+            stdout = open(out_path, 'w', encoding='utf-8')
-                                text=True,
-                                stdout=stdout,
+            result = subprocess.run(cmd,
-                                stderr=stdout)
+                                    shell=True,
+                                    text=True,
-        if result.returncode != 0:
+                                    stdout=stdout,
-            logger.warning(f'task {task_name} fail, see\n{out_path}')
+                                    stderr=stdout)
-        # Clean up
+            if result.returncode != 0:
-        os.remove(param_file)
+                logger.warning(f'task {task_name} fail, see\n{out_path}')
+        finally:
+            # Clean up
+            os.remove(param_file)
        return task_name, result.returncode
--- a/opencompass/runners/slurm.py
+++ b/opencompass/runners/slurm.py
@@ -91,60 +91,64 @@ class SlurmRunner(BaseRunner):
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_params.py'
-        task_cfg.dump(param_file)
+        try:
+            task_cfg.dump(param_file)
-        # Build up slurm command
-        tmpl = 'srun'
+            # Build up slurm command
-        if self.partition:
+            tmpl = 'srun'
-            tmpl += f' -p {self.partition}'
+            if self.partition:
-        if self.quotatype:
+                tmpl += f' -p {self.partition}'
-            tmpl += f' --quotatype={self.quotatype}'
+            if self.quotatype:
-        if self.qos:
+                tmpl += f' --quotatype={self.quotatype}'
-            tmpl += f' --qos={self.qos}'
+            if self.qos:
-        if num_gpus > 0:
+                tmpl += f' --qos={self.qos}'
-            tmpl += f' --gres=gpu:{num_gpus}'
+            if num_gpus > 0:
-        tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
+                tmpl += f' --gres=gpu:{num_gpus}'
-        get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-        cmd = get_cmd()
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
-        logger = get_logger()
+                              template=tmpl)
-        logger.debug(f'Running command: {cmd}')
+            cmd = get_cmd()
-        # Run command with retry
+            logger = get_logger()
-        if self.debug:
+            logger.debug(f'Running command: {cmd}')
-            stdout = None
-        else:
+            # Run command with retry
-            out_path = task.get_log_path(file_extension='out')
+            if self.debug:
-            mmengine.mkdir_or_exist(osp.split(out_path)[0])
+                stdout = None
-            stdout = open(out_path, 'w', encoding='utf-8')
+            else:
+                out_path = task.get_log_path(file_extension='out')
-        if random_sleep:
+                mmengine.mkdir_or_exist(osp.split(out_path)[0])
-            time.sleep(random.randint(0, 10))
+                stdout = open(out_path, 'w', encoding='utf-8')
-        result = subprocess.run(cmd,
-                                shell=True,
-                                text=True,
-                                stdout=stdout,
-                                stderr=stdout)
-        retry = self.retry
-        output_paths = task.get_output_paths()
-        while self._job_failed(result.returncode, output_paths) and retry > 0:
-            retry -= 1
            if random_sleep:
                time.sleep(random.randint(0, 10))
-            # Re-generate command to refresh ports.
-            cmd = get_cmd()
            result = subprocess.run(cmd,
                                    shell=True,
                                    text=True,
                                    stdout=stdout,
                                    stderr=stdout)
-        if result.returncode != 0 and not self.debug:
+            retry = self.retry
-            logger.warning(f'task {task_name} fail, see\n{out_path}')
+            output_paths = task.get_output_paths()
+            while self._job_failed(result.returncode,
-        # Clean up
+                                   output_paths) and retry > 0:
-        os.remove(param_file)
+                retry -= 1
+                if random_sleep:
+                    time.sleep(random.randint(0, 10))
+                # Re-generate command to refresh ports.
+                cmd = get_cmd()
+                result = subprocess.run(cmd,
+                                        shell=True,
+                                        text=True,
+                                        stdout=stdout,
+                                        stderr=stdout)
+            if result.returncode != 0 and not self.debug:
+                logger.warning(f'task {task_name} fail, see\n{out_path}')
+        finally:
+            # Clean up
+            os.remove(param_file)
        return task_name, result.returncode
    def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:

--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -3,7 +3,9 @@ from typing import List, Union
 import tabulate
 from mmengine.config import Config
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
    runner(tasks)
-def exec_infer_runner(tasks, args, cfg):
+def get_config_type(obj) -> str:
-    """execute infer runner according to args."""
+    return f'{obj.__module__}.{obj.__name__}'
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
-                             max_num_workers=args.max_num_workers,
+        new_cfg['infer']['runner']['partition'] = args.partition
-                             partition=args.partition,
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
-                             quotatype=args.quotatype,
+        new_cfg['infer']['runner']['qos'] = args.qos
-                             qos=args.qos,
+        new_cfg['infer']['runner']['retry'] = args.retry
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
-                           max_num_workers=args.max_num_workers,
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
+            args.aliyun_cfg)
-                           retry=args.retry,
+        new_cfg['infer']['runner']['retry'] = args.retry
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
    else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
-                             max_num_workers=args.max_num_workers,
+        new_cfg['infer']['runner'][
-                             max_workers_per_gpu=args.max_workers_per_gpu,
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
-                             debug=args.debug,
+    cfg.merge_from_dict(new_cfg)
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
-def exec_eval_runner(tasks, args, cfg):
+def fill_eval_cfg(cfg, args):
-    """execute infer runner according to args."""
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
-                             max_num_workers=args.max_num_workers,
+        new_cfg['eval']['runner']['partition'] = args.partition
-                             partition=args.partition,
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
-                             quotatype=args.quotatype,
+        new_cfg['eval']['runner']['qos'] = args.qos
-                             qos=args.qos,
+        new_cfg['eval']['runner']['retry'] = args.retry
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
    elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
-                           max_num_workers=args.max_num_workers,
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
+            args.aliyun_cfg)
-                           retry=args.retry,
+        new_cfg['eval']['runner']['retry'] = args.retry
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
    else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
-                             max_num_workers=args.max_num_workers,
+        new_cfg['eval']['runner'][
-                             debug=args.debug,
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
-                             lark_bot_url=cfg['lark_bot_url'])
+    cfg.merge_from_dict(new_cfg)
-    runner(tasks)
--- a/run.py
+++ b/run.py
@@ -6,13 +6,12 @@ from datetime import datetime
 from mmengine.config import Config, DictAction
-from opencompass.partitioners import (MultimodalNaivePartitioner,
+from opencompass.partitioners import MultimodalNaivePartitioner
-                                      NaivePartitioner, SizePartitioner)
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
-                                   exec_mm_infer_runner, get_config_from_arg)
+                                   fill_infer_cfg, get_config_from_arg)
 def parse_args():
@@ -245,39 +244,29 @@ def main():
            tasks = partitioner(cfg)
            exec_mm_infer_runner(tasks, args, cfg)
            return
-        elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-            # Use SizePartitioner to split into subtasks
+        if args.dlc or args.slurm or cfg.get('infer', None) is None:
-            partitioner = SizePartitioner(
+            fill_infer_cfg(cfg, args)
-                osp.join(cfg['work_dir'], 'predictions/'),
-                max_task_size=args.max_partition_size,
+        if args.partition is not None:
-                gen_task_coef=args.gen_task_coef)
+            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
-            tasks = partitioner(cfg)
+                cfg.infer.runner.partition = args.partition
-            if args.dry_run:
+                cfg.infer.runner.quotatype = args.quotatype
-                return
-            # execute the infer subtasks
-            exec_infer_runner(tasks, args, cfg)
-        # If they have specified "infer" in config and haven't used --slurm
-        # or --dlc, just follow the config
        else:
-            if args.partition is not None:
+            logger.warning('SlurmRunner is not used, so the partition '
-                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
+                           'argument is ignored.')
-                    cfg.infer.runner.partition = args.partition
+        if args.debug:
-                    cfg.infer.runner.quotatype = args.quotatype
+            cfg.infer.runner.debug = True
-            else:
+        if args.lark:
-                logger.warning('SlurmRunner is not used, so the partition '
+            cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-                               'argument is ignored.')
+        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-            if args.debug:
+                                                    'predictions/')
-                cfg.infer.runner.debug = True
+        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
-            if args.lark:
+        tasks = partitioner(cfg)
-                cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
+        if args.dry_run:
-            cfg.infer.partitioner['out_dir'] = osp.join(
+            return
-                cfg['work_dir'], 'predictions/')
+        runner = RUNNERS.build(cfg.infer.runner)
-            partitioner = PARTITIONERS.build(cfg.infer.partitioner)
+        runner(tasks)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            runner = RUNNERS.build(cfg.infer.runner)
-            runner(tasks)
    # evaluate
    if args.mode in ['all', 'eval']:
@@ -289,37 +278,28 @@ def main():
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')
        if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner，not split
+            fill_eval_cfg(cfg, args)
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
+        if args.partition is not None:
-            tasks = partitioner(cfg)
+            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
-            if args.dry_run:
+                cfg.eval.runner.partition = args.partition
-                return
+                cfg.eval.runner.quotatype = args.quotatype
-            # execute the eval tasks
+            else:
-            exec_eval_runner(tasks, args, cfg)
+                logger.warning('SlurmRunner is not used, so the partition '
-        # If they have specified "eval" in config and haven't used --slurm
+                               'argument is ignored.')
-        # or --dlc, just follow the config
+        if args.debug:
-        else:
+            cfg.eval.runner.debug = True
-            if args.partition is not None:
+        if args.lark:
-                if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
+            cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-                    cfg.eval.runner.partition = args.partition
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
-                    cfg.eval.runner.quotatype = args.quotatype
+        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
-                else:
+        tasks = partitioner(cfg)
-                    logger.warning('SlurmRunner is not used, so the partition '
+        if args.dry_run:
-                                   'argument is ignored.')
+            return
-            if args.debug:
+        runner = RUNNERS.build(cfg.eval.runner)
-                cfg.eval.runner.debug = True
+        runner(tasks)
-            if args.lark:
-                cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-            cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                       'results/')
-            partitioner = PARTITIONERS.build(cfg.eval.partitioner)
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            runner = RUNNERS.build(cfg.eval.runner)
-            runner(tasks)
    # visualize
    if args.mode in ['all', 'eval', 'viz']: