"git@developer.sourcefind.cn:dadigang/Ventoy.git" did not exist on "d1584c10b4fbfe23eda94b4fcedc6352990f23f8"
Unverified commit ce65d339, authored by Tong Gao, committed by GitHub

[Sync] Use finally to clean up temp files (#337)

parent 2cd994c3
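The pattern this commit applies to every runner (DLC, Local, Slurm) is the same: dump the task config to a temp file, run the task, and do the cleanup in a `finally` block so the temp file is removed even when the command raises or the process is interrupted. A minimal standalone sketch of the idiom (the `build_cmd` helper and file names are illustrative, not from the diff):

```python
import os
import subprocess

def run_with_param_file(task_cfg, build_cmd):
    """Dump config to a temp file, run the task, always clean up."""
    os.makedirs('tmp', exist_ok=True)
    param_file = f'tmp/{os.getpid()}_params.py'
    try:
        task_cfg.dump(param_file)
        result = subprocess.run(build_cmd(param_file), shell=True, text=True)
    finally:
        # Executes on success, on exception, and on KeyboardInterrupt,
        # so the temp file never leaks. Guard the remove in case dump()
        # failed before the file was created.
        if os.path.exists(param_file):
            os.remove(param_file)
    return result.returncode
```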
@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)

             # Build up DLC command
             pwd = os.getcwd()
-            shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; '
-                         f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
-                         f'cd {pwd}; '
-                         '{task_cmd}')
+            shell_cmd = (
+                f'source {self.aliyun_cfg["bashrc_path"]}; '
+                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
+                f'cd {pwd}; '
+                '{task_cmd}')
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
                 f' --worker_memory {max(num_gpus * 32, 48)}'
                 f" --worker_image {self.aliyun_cfg['worker_image']}"
                 ' --interactive')
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
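Wrapping `task.get_command` in `functools.partial` pins `cfg_path` and `template` once, giving the retry logic further down a zero-argument callable that can rebuild the command without repeating its arguments. A minimal sketch of the idiom (the function body and paths are illustrative):

```python
from functools import partial

def get_command(cfg_path, template):  # illustrative stand-in
    return template.format(task_cmd=f'python main.py {cfg_path}')

get_cmd = partial(get_command,
                  cfg_path='tmp/123_params.py',
                  template='time {task_cmd}')
cmd = get_cmd()  # 'time python main.py tmp/123_params.py'
# The same zero-argument callable can be invoked again on each retry.
```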
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
                     text=True,
                     stdout=stdout,
                     stderr=stdout)
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
                 param_file = f'tmp/{os.getpid()}_params.py'
+                try:
                     task.cfg.dump(param_file)
                     cmd = task.get_command(cfg_path=param_file,
                                            template='{task_cmd}')
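The param file is not an opaque blob: `mmengine.Config.dump` writes the config out as an importable `.py` file, and the spawned process reads it back with `Config.fromfile`. A quick round-trip sketch with demo values (not the actual OpenCompass config schema):

```python
import mmengine
from mmengine.config import Config

mmengine.mkdir_or_exist('tmp/')
cfg = Config(dict(models=[dict(type='HuggingFace', path='gpt2')]))
cfg.dump('tmp/demo_params.py')          # serialized as plain Python

reloaded = Config.fromfile('tmp/demo_params.py')
assert reloaded.models[0].path == 'gpt2'
```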
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
                         task.run()
                     else:
                         subprocess.run(cmd, shell=True, text=True)
+                finally:
                     os.remove(param_file)
                 status.append((task_name, 0))
         else:
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_{index}_params.py'
+        try:
             task.cfg.dump(param_file)

             # Build up slurm command
             tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
             tmpl += ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):
             if result.returncode != 0:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
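The GPU pinning here is purely environmental: prefixing the command with `CUDA_VISIBLE_DEVICES` restricts which devices the child process can see, so several subtasks can share one host without colliding. The same template construction as in the hunk above, with illustrative values:

```python
gpu_ids = [2, 3]
tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
tmpl += ' {task_cmd}'
cmd = tmpl.format(task_cmd='python run_task.py tmp/123_0_params.py')
print(cmd)  # CUDA_VISIBLE_DEVICES=2,3 python run_task.py tmp/123_0_params.py
```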
@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
+        try:
             task_cfg.dump(param_file)

             # Build up slurm command
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
             if num_gpus > 0:
                 tmpl += f' --gres=gpu:{num_gpus}'
             tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
-            get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl)
+            get_cmd = partial(task.get_command,
+                              cfg_path=param_file,
+                              template=tmpl)
             cmd = get_cmd()

             logger = get_logger()
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):
             retry = self.retry
             output_paths = task.get_output_paths()
-            while self._job_failed(result.returncode, output_paths) and retry > 0:
+            while self._job_failed(result.returncode,
+                                   output_paths) and retry > 0:
                 retry -= 1
                 if random_sleep:
                     time.sleep(random.randint(0, 10))
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):
             if result.returncode != 0 and not self.debug:
                 logger.warning(f'task {task_name} fail, see\n{out_path}')
-
+        finally:
             # Clean up
             os.remove(param_file)
         return task_name, result.returncode
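For Slurm, the template accumulates scheduler flags before `{task_cmd}` is substituted; `task_name[:512]` keeps the `-J` job name under Slurm's length limit. A sketch of how the final command comes together, assuming the template starts with a plain `srun` prefix (the real prefix is built earlier, outside this hunk):

```python
num_gpus = 2
task_name = 'OpenICLInferTask-demo'  # illustrative
tmpl = 'srun'                        # assumed prefix; not shown in the hunk
if num_gpus > 0:
    tmpl += f' --gres=gpu:{num_gpus}'
tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
print(tmpl.format(task_cmd='python run_task.py tmp/123_params.py'))
# srun --gres=gpu:2 -N1 -J 'OpenICLInferTask-demo' python run_task.py tmp/123_params.py
```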
@@ -3,7 +3,9 @@ from typing import List, Union

 import tabulate
 from mmengine.config import Config

+from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
 from opencompass.utils import get_logger, match_files
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
     runner(tasks)


-def exec_infer_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
+def get_config_type(obj) -> str:
+    return f'{obj.__module__}.{obj.__name__}'
+
+
+def fill_infer_cfg(cfg, args):
+    new_cfg = dict(infer=dict(
+        partitioner=dict(type=get_config_type(SizePartitioner),
+                         max_task_size=args.max_partition_size,
+                         gen_task_coef=args.gen_task_coef),
+        runner=dict(
+            max_num_workers=args.max_num_workers,
+            debug=args.debug,
+            task=dict(type=get_config_type(OpenICLInferTask)),
+            lark_bot_url=cfg['lark_bot_url'],
+        )), )
+    if args.slurm:
+        new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['infer']['runner']['partition'] = args.partition
+        new_cfg['infer']['runner']['quotatype'] = args.quotatype
+        new_cfg['infer']['runner']['qos'] = args.qos
+        new_cfg['infer']['runner']['retry'] = args.retry
     elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLInferTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['infer']['runner']['retry'] = args.retry
     else:
-        runner = LocalRunner(task=dict(type='OpenICLInferTask'),
-                             max_num_workers=args.max_num_workers,
-                             max_workers_per_gpu=args.max_workers_per_gpu,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+        new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['infer']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)


-def exec_eval_runner(tasks, args, cfg):
-    """execute infer runner according to args."""
-    if args.slurm:
-        runner = SlurmRunner(dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             partition=args.partition,
-                             quotatype=args.quotatype,
-                             qos=args.qos,
-                             retry=args.retry,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
+def fill_eval_cfg(cfg, args):
+    new_cfg = dict(
+        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+                  runner=dict(
+                      max_num_workers=args.max_num_workers,
+                      debug=args.debug,
+                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      lark_bot_url=cfg['lark_bot_url'],
+                  )))
+    if args.slurm:
+        new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
+        new_cfg['eval']['runner']['partition'] = args.partition
+        new_cfg['eval']['runner']['quotatype'] = args.quotatype
+        new_cfg['eval']['runner']['qos'] = args.qos
+        new_cfg['eval']['runner']['retry'] = args.retry
     elif args.dlc:
-        runner = DLCRunner(dict(type='OpenICLEvalTask'),
-                           max_num_workers=args.max_num_workers,
-                           aliyun_cfg=Config.fromfile(args.aliyun_cfg),
-                           retry=args.retry,
-                           debug=args.debug,
-                           lark_bot_url=cfg['lark_bot_url'])
+        new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
+        new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
+            args.aliyun_cfg)
+        new_cfg['eval']['runner']['retry'] = args.retry
     else:
-        runner = LocalRunner(task=dict(type='OpenICLEvalTask'),
-                             max_num_workers=args.max_num_workers,
-                             debug=args.debug,
-                             lark_bot_url=cfg['lark_bot_url'])
-    runner(tasks)
+        new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
+        new_cfg['eval']['runner'][
+            'max_workers_per_gpu'] = args.max_workers_per_gpu
+    cfg.merge_from_dict(new_cfg)
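Two pieces make this refactor work. `get_config_type` turns a class into its dotted import path, so the merged config carries a registry-resolvable string rather than a live object, and `Config.merge_from_dict` layers the runtime options onto the loaded config. A minimal sketch of both (`DummyRunner` is a stand-in class, not an OpenCompass type):

```python
from mmengine.config import Config

def get_config_type(obj) -> str:
    return f'{obj.__module__}.{obj.__name__}'

class DummyRunner:  # stand-in for LocalRunner / SlurmRunner / DLCRunner
    pass

cfg = Config(dict(work_dir='outputs/demo'))
cfg.merge_from_dict(
    dict(infer=dict(runner=dict(type=get_config_type(DummyRunner),
                                max_num_workers=4))))
print(cfg.infer.runner.type)  # '__main__.DummyRunner' when run as a script
```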
@@ -6,13 +6,12 @@ from datetime import datetime

 from mmengine.config import Config, DictAction

-from opencompass.partitioners import (MultimodalNaivePartitioner,
-                                      NaivePartitioner, SizePartitioner)
+from opencompass.partitioners import MultimodalNaivePartitioner
 from opencompass.registry import PARTITIONERS, RUNNERS
 from opencompass.runners import SlurmRunner
 from opencompass.utils import LarkReporter, Summarizer, get_logger
-from opencompass.utils.run import (exec_eval_runner, exec_infer_runner,
-                                   exec_mm_infer_runner, get_config_from_arg)
+from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
+                                   fill_infer_cfg, get_config_from_arg)
def parse_args():
@@ -245,20 +244,10 @@ def main():
         tasks = partitioner(cfg)
         exec_mm_infer_runner(tasks, args, cfg)
         return
-    elif args.dlc or args.slurm or cfg.get('infer', None) is None:
-        # Use SizePartitioner to split into subtasks
-        partitioner = SizePartitioner(
-            osp.join(cfg['work_dir'], 'predictions/'),
-            max_task_size=args.max_partition_size,
-            gen_task_coef=args.gen_task_coef)
-        tasks = partitioner(cfg)
-        if args.dry_run:
-            return
-        # execute the infer subtasks
-        exec_infer_runner(tasks, args, cfg)
-    # If they have specified "infer" in config and haven't used --slurm
-    # or --dlc, just follow the config
-    else:
+    if args.dlc or args.slurm or cfg.get('infer', None) is None:
+        fill_infer_cfg(cfg, args)
+
     if args.partition is not None:
         if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
             cfg.infer.runner.partition = args.partition
@@ -270,8 +259,8 @@ def main():
         cfg.infer.runner.debug = True
     if args.lark:
         cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
-    cfg.infer.partitioner['out_dir'] = osp.join(
-        cfg['work_dir'], 'predictions/')
+    cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
+                                                'predictions/')
     partitioner = PARTITIONERS.build(cfg.infer.partitioner)
     tasks = partitioner(cfg)
     if args.dry_run:
@@ -289,18 +278,10 @@ def main():
                 'also specified --slurm or --dlc. '
                 'The "eval" configuration will be overridden by '
                 'your runtime arguments.')
         if args.dlc or args.slurm or cfg.get('eval', None) is None:
-            # Use NaivePartitioner, not split
-            partitioner = NaivePartitioner(
-                osp.join(cfg['work_dir'], 'results/'))
-            tasks = partitioner(cfg)
-            if args.dry_run:
-                return
-            # execute the eval tasks
-            exec_eval_runner(tasks, args, cfg)
-        # If they have specified "eval" in config and haven't used --slurm
-        # or --dlc, just follow the config
-        else:
+            fill_eval_cfg(cfg, args)
+
         if args.partition is not None:
             if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                 cfg.eval.runner.partition = args.partition
@@ -312,8 +293,7 @@ def main():
             cfg.eval.runner.debug = True
         if args.lark:
             cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
-        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
-                                                   'results/')
+        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
         partitioner = PARTITIONERS.build(cfg.eval.partitioner)
         tasks = partitioner(cfg)
         if args.dry_run:
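After the config is filled, `main()` goes through the mmengine registries (`PARTITIONERS.build`, `RUNNERS.get`) rather than instantiating classes directly, which is what lets the `type` strings above resolve back to classes. A toy sketch of that registry pattern (a demo registry, not OpenCompass's actual `RUNNERS`):

```python
from mmengine.registry import Registry

RUNNERS = Registry('runner')  # demo registry

@RUNNERS.register_module()
class EchoRunner:
    def __init__(self, max_num_workers=1):
        self.max_num_workers = max_num_workers

    def __call__(self, tasks):
        print(f'would launch {len(tasks)} tasks '
              f'with {self.max_num_workers} workers')

runner = RUNNERS.build(dict(type='EchoRunner', max_num_workers=4))
runner(['task_a', 'task_b'])
```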