"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "080cd3033436679702dc4446b98f9d10b35c9ae1"
Unverified commit ce65d339, authored by Tong Gao and committed by GitHub
Browse files

[Sync] Use finally to clean up temp files (#337)

parent 2cd994c3
...@@ -86,65 +86,70 @@ class DLCRunner(BaseRunner): ...@@ -86,65 +86,70 @@ class DLCRunner(BaseRunner):
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py' param_file = f'tmp/{os.getpid()}_params.py'
task_cfg.dump(param_file) try:
task_cfg.dump(param_file)
# Build up DLC command
pwd = os.getcwd() # Build up DLC command
shell_cmd = (f'source {self.aliyun_cfg["bashrc_path"]}; ' pwd = os.getcwd()
f'conda activate {self.aliyun_cfg["conda_env_name"]}; ' shell_cmd = (
f'cd {pwd}; ' f'source {self.aliyun_cfg["bashrc_path"]}; '
'{task_cmd}') f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
f'cd {pwd}; '
tmpl = ('dlc create job' '{task_cmd}')
f" --command '{shell_cmd}'"
f' --name {task_name[:512]}' tmpl = ('dlc create job'
' --kind BatchJob' f" --command '{shell_cmd}'"
f" -c {self.aliyun_cfg['dlc_config_path']}" f' --name {task_name[:512]}'
f" --workspace_id {self.aliyun_cfg['workspace_id']}" ' --kind BatchJob'
' --worker_count 1' f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --worker_cpu {max(num_gpus * 6, 8)}' f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f' --worker_gpu {num_gpus}' ' --worker_count 1'
f' --worker_memory {max(num_gpus * 32, 48)}' f' --worker_cpu {max(num_gpus * 6, 8)}'
f" --worker_image {self.aliyun_cfg['worker_image']}" f' --worker_gpu {num_gpus}'
' --interactive') f' --worker_memory {max(num_gpus * 32, 48)}'
get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) f" --worker_image {self.aliyun_cfg['worker_image']}"
cmd = get_cmd() ' --interactive')
get_cmd = partial(task.get_command,
logger = get_logger() cfg_path=param_file,
logger.debug(f'Running command: {cmd}') template=tmpl)
cmd = get_cmd()
# Run command with retry
if self.debug: logger = get_logger()
stdout = None logger.debug(f'Running command: {cmd}')
else:
out_path = task.get_log_path(file_extension='out') # Run command with retry
mmengine.mkdir_or_exist(osp.split(out_path)[0]) if self.debug:
stdout = open(out_path, 'w', encoding='utf-8') stdout = None
else:
if random_sleep: out_path = task.get_log_path(file_extension='out')
time.sleep(random.randint(0, 10)) mmengine.mkdir_or_exist(osp.split(out_path)[0])
result = subprocess.run(cmd, stdout = open(out_path, 'w', encoding='utf-8')
shell=True,
text=True,
stdout=stdout,
stderr=stdout)
retry = self.retry
output_paths = task.get_output_paths()
while self._job_failed(result.returncode, output_paths) and retry > 0:
retry -= 1
if random_sleep: if random_sleep:
time.sleep(random.randint(0, 10)) time.sleep(random.randint(0, 10))
# Re-generate command to refresh ports.
cmd = get_cmd()
result = subprocess.run(cmd, result = subprocess.run(cmd,
shell=True, shell=True,
text=True, text=True,
stdout=stdout, stdout=stdout,
stderr=stdout) stderr=stdout)
# Clean up retry = self.retry
os.remove(param_file) output_paths = task.get_output_paths()
while self._job_failed(result.returncode,
output_paths) and retry > 0:
retry -= 1
if random_sleep:
time.sleep(random.randint(0, 10))
# Re-generate command to refresh ports.
cmd = get_cmd()
result = subprocess.run(cmd,
shell=True,
text=True,
stdout=stdout,
stderr=stdout)
finally:
# Clean up
os.remove(param_file)
return task_name, result.returncode return task_name, result.returncode
def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
......
...@@ -62,15 +62,17 @@ class LocalRunner(BaseRunner): ...@@ -62,15 +62,17 @@ class LocalRunner(BaseRunner):
# get cmd # get cmd
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py' param_file = f'tmp/{os.getpid()}_params.py'
task.cfg.dump(param_file) try:
cmd = task.get_command(cfg_path=param_file, task.cfg.dump(param_file)
template='{task_cmd}') cmd = task.get_command(cfg_path=param_file,
# run in subprocess if starts with torchrun etc. template='{task_cmd}')
if cmd.startswith('python'): # run in subprocess if starts with torchrun etc.
task.run() if cmd.startswith('python'):
else: task.run()
subprocess.run(cmd, shell=True, text=True) else:
os.remove(param_file) subprocess.run(cmd, shell=True, text=True)
finally:
os.remove(param_file)
status.append((task_name, 0)) status.append((task_name, 0))
else: else:
import torch import torch
...@@ -141,31 +143,34 @@ class LocalRunner(BaseRunner): ...@@ -141,31 +143,34 @@ class LocalRunner(BaseRunner):
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_{index}_params.py' param_file = f'tmp/{os.getpid()}_{index}_params.py'
task.cfg.dump(param_file) try:
task.cfg.dump(param_file)
# Build up slurm command
tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids) # Build up slurm command
tmpl += ' {task_cmd}' tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) tmpl += ' {task_cmd}'
cmd = get_cmd() get_cmd = partial(task.get_command,
cfg_path=param_file,
logger = get_logger() template=tmpl)
logger.debug(f'Running command: {cmd}') cmd = get_cmd()
# Run command logger = get_logger()
out_path = task.get_log_path(file_extension='out') logger.debug(f'Running command: {cmd}')
mmengine.mkdir_or_exist(osp.split(out_path)[0])
stdout = open(out_path, 'w', encoding='utf-8') # Run command
out_path = task.get_log_path(file_extension='out')
result = subprocess.run(cmd, mmengine.mkdir_or_exist(osp.split(out_path)[0])
shell=True, stdout = open(out_path, 'w', encoding='utf-8')
text=True,
stdout=stdout, result = subprocess.run(cmd,
stderr=stdout) shell=True,
text=True,
if result.returncode != 0: stdout=stdout,
logger.warning(f'task {task_name} fail, see\n{out_path}') stderr=stdout)
# Clean up if result.returncode != 0:
os.remove(param_file) logger.warning(f'task {task_name} fail, see\n{out_path}')
finally:
# Clean up
os.remove(param_file)
return task_name, result.returncode return task_name, result.returncode
...@@ -91,60 +91,64 @@ class SlurmRunner(BaseRunner): ...@@ -91,60 +91,64 @@ class SlurmRunner(BaseRunner):
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py' param_file = f'tmp/{os.getpid()}_params.py'
task_cfg.dump(param_file) try:
task_cfg.dump(param_file)
# Build up slurm command
tmpl = 'srun' # Build up slurm command
if self.partition: tmpl = 'srun'
tmpl += f' -p {self.partition}' if self.partition:
if self.quotatype: tmpl += f' -p {self.partition}'
tmpl += f' --quotatype={self.quotatype}' if self.quotatype:
if self.qos: tmpl += f' --quotatype={self.quotatype}'
tmpl += f' --qos={self.qos}' if self.qos:
if num_gpus > 0: tmpl += f' --qos={self.qos}'
tmpl += f' --gres=gpu:{num_gpus}' if num_gpus > 0:
tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}' tmpl += f' --gres=gpu:{num_gpus}'
get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) tmpl += f" -N1 -J '{task_name[:512]}'" + ' {task_cmd}'
cmd = get_cmd() get_cmd = partial(task.get_command,
cfg_path=param_file,
logger = get_logger() template=tmpl)
logger.debug(f'Running command: {cmd}') cmd = get_cmd()
# Run command with retry logger = get_logger()
if self.debug: logger.debug(f'Running command: {cmd}')
stdout = None
else: # Run command with retry
out_path = task.get_log_path(file_extension='out') if self.debug:
mmengine.mkdir_or_exist(osp.split(out_path)[0]) stdout = None
stdout = open(out_path, 'w', encoding='utf-8') else:
out_path = task.get_log_path(file_extension='out')
if random_sleep: mmengine.mkdir_or_exist(osp.split(out_path)[0])
time.sleep(random.randint(0, 10)) stdout = open(out_path, 'w', encoding='utf-8')
result = subprocess.run(cmd,
shell=True,
text=True,
stdout=stdout,
stderr=stdout)
retry = self.retry
output_paths = task.get_output_paths()
while self._job_failed(result.returncode, output_paths) and retry > 0:
retry -= 1
if random_sleep: if random_sleep:
time.sleep(random.randint(0, 10)) time.sleep(random.randint(0, 10))
# Re-generate command to refresh ports.
cmd = get_cmd()
result = subprocess.run(cmd, result = subprocess.run(cmd,
shell=True, shell=True,
text=True, text=True,
stdout=stdout, stdout=stdout,
stderr=stdout) stderr=stdout)
if result.returncode != 0 and not self.debug: retry = self.retry
logger.warning(f'task {task_name} fail, see\n{out_path}') output_paths = task.get_output_paths()
while self._job_failed(result.returncode,
# Clean up output_paths) and retry > 0:
os.remove(param_file) retry -= 1
if random_sleep:
time.sleep(random.randint(0, 10))
# Re-generate command to refresh ports.
cmd = get_cmd()
result = subprocess.run(cmd,
shell=True,
text=True,
stdout=stdout,
stderr=stdout)
if result.returncode != 0 and not self.debug:
logger.warning(f'task {task_name} fail, see\n{out_path}')
finally:
# Clean up
os.remove(param_file)
return task_name, result.returncode return task_name, result.returncode
def _job_failed(self, return_code: int, output_paths: List[str]) -> bool: def _job_failed(self, return_code: int, output_paths: List[str]) -> bool:
......
...@@ -3,7 +3,9 @@ from typing import List, Union ...@@ -3,7 +3,9 @@ from typing import List, Union
import tabulate import tabulate
from mmengine.config import Config from mmengine.config import Config
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils import get_logger, match_files from opencompass.utils import get_logger, match_files
...@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg): ...@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
runner(tasks) runner(tasks)
def exec_infer_runner(tasks, args, cfg): def get_config_type(obj) -> str:
"""execute infer runner according to args.""" return f'{obj.__module__}.{obj.__name__}'
def fill_infer_cfg(cfg, args):
new_cfg = dict(infer=dict(
partitioner=dict(type=get_config_type(SizePartitioner),
max_task_size=args.max_partition_size,
gen_task_coef=args.gen_task_coef),
runner=dict(
max_num_workers=args.max_num_workers,
debug=args.debug,
task=dict(type=get_config_type(OpenICLInferTask)),
lark_bot_url=cfg['lark_bot_url'],
)), )
if args.slurm: if args.slurm:
runner = SlurmRunner(dict(type='OpenICLInferTask'), new_cfg['infer']['runner']['type'] = get_config_type(SlurmRunner)
max_num_workers=args.max_num_workers, new_cfg['infer']['runner']['partition'] = args.partition
partition=args.partition, new_cfg['infer']['runner']['quotatype'] = args.quotatype
quotatype=args.quotatype, new_cfg['infer']['runner']['qos'] = args.qos
qos=args.qos, new_cfg['infer']['runner']['retry'] = args.retry
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc: elif args.dlc:
runner = DLCRunner(dict(type='OpenICLInferTask'), new_cfg['infer']['runner']['type'] = get_config_type(DLCRunner)
max_num_workers=args.max_num_workers, new_cfg['infer']['runner']['aliyun_cfg'] = Config.fromfile(
aliyun_cfg=Config.fromfile(args.aliyun_cfg), args.aliyun_cfg)
retry=args.retry, new_cfg['infer']['runner']['retry'] = args.retry
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
else: else:
runner = LocalRunner(task=dict(type='OpenICLInferTask'), new_cfg['infer']['runner']['type'] = get_config_type(LocalRunner)
max_num_workers=args.max_num_workers, new_cfg['infer']['runner'][
max_workers_per_gpu=args.max_workers_per_gpu, 'max_workers_per_gpu'] = args.max_workers_per_gpu
debug=args.debug, cfg.merge_from_dict(new_cfg)
lark_bot_url=cfg['lark_bot_url'])
runner(tasks)
def exec_eval_runner(tasks, args, cfg): def fill_eval_cfg(cfg, args):
"""execute infer runner according to args.""" new_cfg = dict(
eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
runner=dict(
max_num_workers=args.max_num_workers,
debug=args.debug,
task=dict(type=get_config_type(OpenICLEvalTask)),
lark_bot_url=cfg['lark_bot_url'],
)))
if args.slurm: if args.slurm:
runner = SlurmRunner(dict(type='OpenICLEvalTask'), new_cfg['eval']['runner']['type'] = get_config_type(SlurmRunner)
max_num_workers=args.max_num_workers, new_cfg['eval']['runner']['partition'] = args.partition
partition=args.partition, new_cfg['eval']['runner']['quotatype'] = args.quotatype
quotatype=args.quotatype, new_cfg['eval']['runner']['qos'] = args.qos
qos=args.qos, new_cfg['eval']['runner']['retry'] = args.retry
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc: elif args.dlc:
runner = DLCRunner(dict(type='OpenICLEvalTask'), new_cfg['eval']['runner']['type'] = get_config_type(DLCRunner)
max_num_workers=args.max_num_workers, new_cfg['eval']['runner']['aliyun_cfg'] = Config.fromfile(
aliyun_cfg=Config.fromfile(args.aliyun_cfg), args.aliyun_cfg)
retry=args.retry, new_cfg['eval']['runner']['retry'] = args.retry
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
else: else:
runner = LocalRunner(task=dict(type='OpenICLEvalTask'), new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner)
max_num_workers=args.max_num_workers, new_cfg['eval']['runner'][
debug=args.debug, 'max_workers_per_gpu'] = args.max_workers_per_gpu
lark_bot_url=cfg['lark_bot_url']) cfg.merge_from_dict(new_cfg)
runner(tasks)
...@@ -6,13 +6,12 @@ from datetime import datetime ...@@ -6,13 +6,12 @@ from datetime import datetime
from mmengine.config import Config, DictAction from mmengine.config import Config, DictAction
from opencompass.partitioners import (MultimodalNaivePartitioner, from opencompass.partitioners import MultimodalNaivePartitioner
NaivePartitioner, SizePartitioner)
from opencompass.registry import PARTITIONERS, RUNNERS from opencompass.registry import PARTITIONERS, RUNNERS
from opencompass.runners import SlurmRunner from opencompass.runners import SlurmRunner
from opencompass.utils import LarkReporter, Summarizer, get_logger from opencompass.utils import LarkReporter, Summarizer, get_logger
from opencompass.utils.run import (exec_eval_runner, exec_infer_runner, from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
exec_mm_infer_runner, get_config_from_arg) fill_infer_cfg, get_config_from_arg)
def parse_args(): def parse_args():
...@@ -245,39 +244,29 @@ def main(): ...@@ -245,39 +244,29 @@ def main():
tasks = partitioner(cfg) tasks = partitioner(cfg)
exec_mm_infer_runner(tasks, args, cfg) exec_mm_infer_runner(tasks, args, cfg)
return return
elif args.dlc or args.slurm or cfg.get('infer', None) is None:
# Use SizePartitioner to split into subtasks if args.dlc or args.slurm or cfg.get('infer', None) is None:
partitioner = SizePartitioner( fill_infer_cfg(cfg, args)
osp.join(cfg['work_dir'], 'predictions/'),
max_task_size=args.max_partition_size, if args.partition is not None:
gen_task_coef=args.gen_task_coef) if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
tasks = partitioner(cfg) cfg.infer.runner.partition = args.partition
if args.dry_run: cfg.infer.runner.quotatype = args.quotatype
return
# execute the infer subtasks
exec_infer_runner(tasks, args, cfg)
# If they have specified "infer" in config and haven't used --slurm
# or --dlc, just follow the config
else: else:
if args.partition is not None: logger.warning('SlurmRunner is not used, so the partition '
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner: 'argument is ignored.')
cfg.infer.runner.partition = args.partition if args.debug:
cfg.infer.runner.quotatype = args.quotatype cfg.infer.runner.debug = True
else: if args.lark:
logger.warning('SlurmRunner is not used, so the partition ' cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
'argument is ignored.') cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
if args.debug: 'predictions/')
cfg.infer.runner.debug = True partitioner = PARTITIONERS.build(cfg.infer.partitioner)
if args.lark: tasks = partitioner(cfg)
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url'] if args.dry_run:
cfg.infer.partitioner['out_dir'] = osp.join( return
cfg['work_dir'], 'predictions/') runner = RUNNERS.build(cfg.infer.runner)
partitioner = PARTITIONERS.build(cfg.infer.partitioner) runner(tasks)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.infer.runner)
runner(tasks)
# evaluate # evaluate
if args.mode in ['all', 'eval']: if args.mode in ['all', 'eval']:
...@@ -289,37 +278,28 @@ def main(): ...@@ -289,37 +278,28 @@ def main():
'also specified --slurm or --dlc. ' 'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by ' 'The "eval" configuration will be overridden by '
'your runtime arguments.') 'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('eval', None) is None: if args.dlc or args.slurm or cfg.get('eval', None) is None:
# Use NaivePartitioner,not split fill_eval_cfg(cfg, args)
partitioner = NaivePartitioner(
osp.join(cfg['work_dir'], 'results/')) if args.partition is not None:
tasks = partitioner(cfg) if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
if args.dry_run: cfg.eval.runner.partition = args.partition
return cfg.eval.runner.quotatype = args.quotatype
# execute the eval tasks else:
exec_eval_runner(tasks, args, cfg) logger.warning('SlurmRunner is not used, so the partition '
# If they have specified "eval" in config and haven't used --slurm 'argument is ignored.')
# or --dlc, just follow the config if args.debug:
else: cfg.eval.runner.debug = True
if args.partition is not None: if args.lark:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner: cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.runner.partition = args.partition cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
cfg.eval.runner.quotatype = args.quotatype partitioner = PARTITIONERS.build(cfg.eval.partitioner)
else: tasks = partitioner(cfg)
logger.warning('SlurmRunner is not used, so the partition ' if args.dry_run:
'argument is ignored.') return
if args.debug: runner = RUNNERS.build(cfg.eval.runner)
cfg.eval.runner.debug = True runner(tasks)
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.eval.runner)
runner(tasks)
# visualize # visualize
if args.mode in ['all', 'eval', 'viz']: if args.mode in ['all', 'eval', 'viz']:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment