"git@developer.sourcefind.cn:dadigang/Ventoy.git" did not exist on "d1584c10b4fbfe23eda94b4fcedc6352990f23f8"
Unverified commit ce65d339, authored by Tong Gao, committed by GitHub

[Sync] Use finally to clean up temp files (#337)

parent 2cd994c3
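The pattern this commit applies to every runner (DLC, Local, Slurm) is the same: dump the task config to a temp file, run the task, and do the cleanup in a `finally` block so the temp file is removed even when the command raises or the process is interrupted. A minimal standalone sketch of the idiom (the `build_cmd` helper and file names are illustrative, not from the diff):

```python
import os
import subprocess

def run_with_param_file(task_cfg, build_cmd):
    """Dump config to a temp file, run the task, always clean up."""
    os.makedirs('tmp', exist_ok=True)
    param_file = f'tmp/{os.getpid()}_params.py'
    try:
        task_cfg.dump(param_file)
        result = subprocess.run(build_cmd(param_file), shell=True, text=True)
    finally:
        # Executes on success, on exception, and on KeyboardInterrupt,
        # so the temp file never leaks. Guard the remove in case dump()
        # failed before the file was created.
        if os.path.exists(param_file):
            os.remove(param_file)
    return result.returncode
```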
@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)

             # Build up DLC command
             pwd = os.getcwd()
-            shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
-                         f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
-                         f'cd {pwd}; '
-                         '{task_cmd}')
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
+                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+                f'cd {pwd}; '
+                '{task_cmd}')
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
                 f' --worker_memory {max(num_gpus * 32, 48)}'
                 f" --worker_image {self.aliyun_cfg['worker_image']}"
                 ' --interactive')
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
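Wrapping `task.get_command` in `functools.partial` pins `cfg_path` and `template` once, giving the retry logic further down a zero-argument callable that can rebuild the command without repeating its arguments. A minimal sketch of the idiom (the function body and paths are illustrative):

```python
from functools import partial

def get_command(cfg_path, template):  # illustrative stand-in
    return template.format(task_cmd=f'python main.py {cfg_path}')

get_cmd = partial(get_command,
                  cfg_path='tmp/123_params.py',
                  template='time {task_cmd}')
cmd = get_cmd()  # 'time python main.py tmp/123_params.py'
# The same zero-argument callable can be invoked again on each retry.
```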
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
                     text=True,
                     stdout=stdout,
                     stderr=stdout)
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
+                try:
                     task.cfg.dump(param_file)
                     cmd = task.get_command(cfg_path=param_file,
                                            template='{task_cmd}')
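The param file is not an opaque blob: `mmengine.Config.dump` writes the config out as an importable `.py` file, and the spawned process reads it back with `Config.fromfile`. A quick round-trip sketch with demo values (not the actual OpenCompass config schema):

```python
import mmengine
from mmengine.config import Config

mmengine.mkdir_or_exist('tmp/')
cfg = Config(dict(models=[dict(type='HuggingFace', path='gpt2')]))
cfg.dump('tmp/demo_params.py')          # serialized as plain Python

reloaded = Config.fromfile('tmp/demo_params.py')
assert reloaded.models[0].path == 'gpt2'
```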
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
                         task.run()
                     else:
                         subprocess.run(cmd, shell=True, text=True)
+                finally:
                     os.remove(param_file)
                 status.append((task_name, 0))
         else:
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
+        try:
             task.cfg.dump(param_file)

             # Build up slurm command
             tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
             tmpl += ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):
             if result.returncode != 0:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
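The GPU pinning here is purely environmental: prefixing the command with `CUDA_VISIBLE_DEVICES` restricts which devices the child process can see, so several subtasks can share one host without colliding. The same template construction as in the hunk above, with illustrative values:

```python
gpu_ids = [2, 3]
tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
tmpl += ' {task_cmd}'
cmd = tmpl.format(task_cmd='python run_task.py tmp/123_0_params.py')
print(cmd)  # CUDA_VISIBLE_DEVICES=2,3 python run_task.py tmp/123_0_params.py
```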
@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)

             # Build up slurm command
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
             if num_gpus > 0:
                 tmpl += f' --gres=gpu:{num_gpus}'
             tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):
             if result.returncode != 0 and not self.debug:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
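For Slurm, the template accumulates scheduler flags before `{task_cmd}` is substituted; `task_name[:512]` keeps the `-J` job name under Slurm's length limit. A sketch of how the final command comes together, assuming the template starts with a plain `srun` prefix (the real prefix is built earlier, outside this hunk):

```python
num_gpus = 2
task_name = 'OpenICLInferTask-demo'  # illustrative
tmpl = 'srun'                        # assumed prefix; not shown in the hunk
if num_gpus > 0:
    tmpl += f' --gres=gpu:{num_gpus}'
tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
print(tmpl.format(task_cmd='python run_task.py tmp/123_params.py'))
# srun --gres=gpu:2 -N1 -J 'OpenICLInferTask-demo' python run_task.py tmp/123_params.py
```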
@@ -3,7 +3,9 @@ from typing import List, Union

 import tabulate
 from mmengine.config import Config

+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)


-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'
+
+
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
+    if args.slurm:
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
     elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
     else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)


-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
     elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
     else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
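Two pieces make this refactor work. `get_config_type` turns a class into its dotted import path, so the merged config carries a registry-resolvable string rather than a live object, and `Config.merge_from_dict` layers the runtime options onto the loaded config. A minimal sketch of both (`DummyRunner` is a stand-in class, not an OpenCompass type):

```python
from mmengine.config import Config

def get_config_type(obj) -> str:
    return f'{obj.__module__}.{obj.__name__}'

class DummyRunner:  # stand-in for LocalRunner / SlurmRunner / DLCRunner
    pass

cfg = Config(dict(work_dir='outputs/demo'))
cfg.merge_from_dict(
    dict(infer=dict(runner=dict(type=get_config_type(DummyRunner),
                                max_num_workers=4))))
print(cfg.infer.runner.type)  # '__main__.DummyRunner' when run as a script
```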
@@ -6,13 +6,12 @@ from datetime import datetime

 from mmengine.config import Config, DictAction

-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)
def parse_args():
@@ -245,20 +244,10 @@ def main():
         tasks = partitioner(cfg)
         exec_mm_infer_runner(tasks, args, cfg)
         return
-    elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-        # Use SizePartitioner to split into subtasks
-        partitioner = SizePartitioner(
-            osp.join(cfg['work_dir'], 'predictions/'),
-            max_task_size=args.max_partition_size,
-            gen_task_coef=args.gen_task_coef)
-        tasks = partitioner(cfg)
-        if args.dry_run:
-            return
-        # execute the infer subtasks
-        exec_infer_runner(tasks, args, cfg)
-    # If they have specified "infer" in config and haven't used --slurm
-    # or --dlc, just follow the config
-    else:
+    if args.dlc or args.slurm or cfg.get('infer', None) is None:
+        fill_infer_cfg(cfg, args)
+
     if args.partition is not None:
         if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
             cfg.infer.runner.partition = args.partition
@@ -270,8 +259,8 @@ def main():
         cfg.infer.runner.debug = True
     if args.lark:
         cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-    cfg.infer.partitioner['out_dir'] = osp.join(
-        cfg['work_dir'], 'predictions/')
+    cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                'predictions/')
     partitioner = PARTITIONERS.build(cfg.infer.partitioner)
     tasks = partitioner(cfg)
     if args.dry_run:
@@ -289,18 +278,10 @@ def main():
                 'also specified --slurm or --dlc. '
                 'The "eval" configuration will be overridden by '
                 'your runtime arguments.')
         if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner, not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+            fill_eval_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.eval.runner.partition = args.partition
@@ -312,8 +293,7 @@ def main():
             cfg.eval.runner.debug = True
         if args.lark:
             cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                   'results/')
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
         partitioner = PARTITIONERS.build(cfg.eval.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run:
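After the config is filled, `main()` goes through the mmengine registries (`PARTITIONERS.build`, `RUNNERS.get`) rather than instantiating classes directly, which is what lets the `type` strings above resolve back to classes. A toy sketch of that registry pattern (a demo registry, not OpenCompass's actual `RUNNERS`):

```python
from mmengine.registry import Registry

RUNNERS = Registry('runner')  # demo registry

@RUNNERS.register_module()
class EchoRunner:
    def __init__(self, max_num_workers=1):
        self.max_num_workers = max_num_workers

    def __call__(self, tasks):
        print(f'would launch {len(tasks)} tasks '
              f'with {self.max_num_workers} workers')

runner = RUNNERS.build(dict(type='EchoRunner', max_num_workers=4))
runner(['task_a', 'task_b'])
```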