Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
ce65d339
"git@developer.sourcefind.cn:dadigang/Ventoy.git" did not exist on "d1584c10b4fbfe23eda94b4fcedc6352990f23f8"
Unverified
Commit
ce65d339
authored
Sep 04, 2023
by
Tong Gao
Committed by
GitHub
Sep 04, 2023
Browse files
[Sync] Use finally to clean up temp files (#337)
parent
2cd994c3
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
245 additions
and
242 deletions
+245
-242
opencompass/runners/dlc.py
opencompass/runners/dlc.py
+56
-51
opencompass/runners/local.py
opencompass/runners/local.py
+41
-36
opencompass/runners/slurm.py
opencompass/runners/slurm.py
+50
-46
opencompass/utils/run.py
opencompass/utils/run.py
+52
-43
run.py
run.py
+46
-66
No files found.
opencompass/runners/dlc.py
View file @
ce65d339
...
...
@@ -86,11 +86,13 @@ class DLCRunner(BaseRunner):
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
param_file
=
f
'tmp/
{
os
.
getpid
()
}
_params.py'
try
:
task_cfg
.
dump
(
param_file
)
# Build up DLC command
pwd
=
os
.
getcwd
()
shell_cmd
=
(
f
'source
{
self
.
aliyun_cfg
[
"bashrc_path"
]
}
; '
shell_cmd
=
(
f
'source
{
self
.
aliyun_cfg
[
"bashrc_path"
]
}
; '
f
'conda activate
{
self
.
aliyun_cfg
[
"conda_env_name"
]
}
; '
f
'cd
{
pwd
}
; '
'{task_cmd}'
)
...
...
@@ -107,7 +109,9 @@ class DLCRunner(BaseRunner):
f
' --worker_memory
{
max
(
num_gpus
*
32
,
48
)
}
'
f
" --worker_image
{
self
.
aliyun_cfg
[
'worker_image'
]
}
"
' --interactive'
)
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
...
...
@@ -131,7 +135,8 @@ class DLCRunner(BaseRunner):
retry
=
self
.
retry
output_paths
=
task
.
get_output_paths
()
while
self
.
_job_failed
(
result
.
returncode
,
output_paths
)
and
retry
>
0
:
while
self
.
_job_failed
(
result
.
returncode
,
output_paths
)
and
retry
>
0
:
retry
-=
1
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
...
...
@@ -142,7 +147,7 @@ class DLCRunner(BaseRunner):
text
=
True
,
stdout
=
stdout
,
stderr
=
stdout
)
finally
:
# Clean up
os
.
remove
(
param_file
)
return
task_name
,
result
.
returncode
...
...
opencompass/runners/local.py
View file @
ce65d339
...
...
@@ -62,6 +62,7 @@ class LocalRunner(BaseRunner):
# get cmd
mmengine
.
mkdir_or_exist
(
'tmp/'
)
param_file
=
f
'tmp/
{
os
.
getpid
()
}
_params.py'
try
:
task
.
cfg
.
dump
(
param_file
)
cmd
=
task
.
get_command
(
cfg_path
=
param_file
,
template
=
'{task_cmd}'
)
...
...
@@ -70,6 +71,7 @@ class LocalRunner(BaseRunner):
task
.
run
()
else
:
subprocess
.
run
(
cmd
,
shell
=
True
,
text
=
True
)
finally
:
os
.
remove
(
param_file
)
status
.
append
((
task_name
,
0
))
else
:
...
...
@@ -141,12 +143,15 @@ class LocalRunner(BaseRunner):
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
param_file
=
f
'tmp/
{
os
.
getpid
()
}
_
{
index
}
_params.py'
try
:
task
.
cfg
.
dump
(
param_file
)
# Build up slurm command
tmpl
=
'CUDA_VISIBLE_DEVICES='
+
','
.
join
(
str
(
i
)
for
i
in
gpu_ids
)
tmpl
+=
' {task_cmd}'
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
...
...
@@ -165,7 +170,7 @@ class LocalRunner(BaseRunner):
if
result
.
returncode
!=
0
:
logger
.
warning
(
f
'task
{
task_name
}
fail, see
\n
{
out_path
}
'
)
finally
:
# Clean up
os
.
remove
(
param_file
)
return
task_name
,
result
.
returncode
opencompass/runners/slurm.py
View file @
ce65d339
...
...
@@ -91,6 +91,7 @@ class SlurmRunner(BaseRunner):
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
param_file
=
f
'tmp/
{
os
.
getpid
()
}
_params.py'
try
:
task_cfg
.
dump
(
param_file
)
# Build up slurm command
...
...
@@ -104,7 +105,9 @@ class SlurmRunner(BaseRunner):
if
num_gpus
>
0
:
tmpl
+=
f
' --gres=gpu:
{
num_gpus
}
'
tmpl
+=
f
" -N1 -J '
{
task_name
[:
512
]
}
'"
+
' {task_cmd}'
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
...
...
@@ -128,7 +131,8 @@ class SlurmRunner(BaseRunner):
retry
=
self
.
retry
output_paths
=
task
.
get_output_paths
()
while
self
.
_job_failed
(
result
.
returncode
,
output_paths
)
and
retry
>
0
:
while
self
.
_job_failed
(
result
.
returncode
,
output_paths
)
and
retry
>
0
:
retry
-=
1
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
...
...
@@ -142,7 +146,7 @@ class SlurmRunner(BaseRunner):
if
result
.
returncode
!=
0
and
not
self
.
debug
:
logger
.
warning
(
f
'task
{
task_name
}
fail, see
\n
{
out_path
}
'
)
finally
:
# Clean up
os
.
remove
(
param_file
)
return
task_name
,
result
.
returncode
...
...
opencompass/utils/run.py
View file @
ce65d339
...
...
@@ -3,7 +3,9 @@ from typing import List, Union
import
tabulate
from
mmengine.config
import
Config
from
opencompass.partitioners
import
NaivePartitioner
,
SizePartitioner
from
opencompass.runners
import
DLCRunner
,
LocalRunner
,
SlurmRunner
from
opencompass.tasks
import
OpenICLEvalTask
,
OpenICLInferTask
from
opencompass.utils
import
get_logger
,
match_files
...
...
@@ -118,54 +120,61 @@ def exec_mm_infer_runner(tasks, args, cfg):
runner
(
tasks
)
def
exec_infer_runner
(
tasks
,
args
,
cfg
):
"""execute infer runner according to args."""
if
args
.
slurm
:
runner
=
SlurmRunner
(
dict
(
type
=
'OpenICLInferTask'
),
def
get_config_type
(
obj
)
->
str
:
return
f
'
{
obj
.
__module__
}
.
{
obj
.
__name__
}
'
def
fill_infer_cfg
(
cfg
,
args
):
new_cfg
=
dict
(
infer
=
dict
(
partitioner
=
dict
(
type
=
get_config_type
(
SizePartitioner
),
max_task_size
=
args
.
max_partition_size
,
gen_task_coef
=
args
.
gen_task_coef
),
runner
=
dict
(
max_num_workers
=
args
.
max_num_workers
,
partition
=
args
.
partition
,
quotatype
=
args
.
quotatype
,
qos
=
args
.
qos
,
retry
=
args
.
retry
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
task
=
dict
(
type
=
get_config_type
(
OpenICLInferTask
)),
lark_bot_url
=
cfg
[
'lark_bot_url'
],
)),
)
if
args
.
slurm
:
new_cfg
[
'infer'
][
'runner'
][
'type'
]
=
get_config_type
(
SlurmRunner
)
new_cfg
[
'infer'
][
'runner'
][
'partition'
]
=
args
.
partition
new_cfg
[
'infer'
][
'runner'
][
'quotatype'
]
=
args
.
quotatype
new_cfg
[
'infer'
][
'runner'
][
'qos'
]
=
args
.
qos
new_cfg
[
'infer'
][
'runner'
][
'retry'
]
=
args
.
retry
elif
args
.
dlc
:
runner
=
DLCRunner
(
dict
(
type
=
'OpenICLInferTask'
),
max_num_workers
=
args
.
max_num_workers
,
aliyun_cfg
=
Config
.
fromfile
(
args
.
aliyun_cfg
),
retry
=
args
.
retry
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
new_cfg
[
'infer'
][
'runner'
][
'type'
]
=
get_config_type
(
DLCRunner
)
new_cfg
[
'infer'
][
'runner'
][
'aliyun_cfg'
]
=
Config
.
fromfile
(
args
.
aliyun_cfg
)
new_cfg
[
'infer'
][
'runner'
][
'retry'
]
=
args
.
retry
else
:
runner
=
LocalRunner
(
task
=
dict
(
type
=
'OpenICLInferTask'
),
max_num_workers
=
args
.
max_num_workers
,
max_workers_per_gpu
=
args
.
max_workers_per_gpu
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
runner
(
tasks
)
new_cfg
[
'infer'
][
'runner'
][
'type'
]
=
get_config_type
(
LocalRunner
)
new_cfg
[
'infer'
][
'runner'
][
'max_workers_per_gpu'
]
=
args
.
max_workers_per_gpu
cfg
.
merge_from_dict
(
new_cfg
)
def
exec
_eval_
runner
(
tasks
,
args
,
cfg
):
"""execute infer runner according to args."""
if
args
.
slurm
:
runner
=
SlurmR
unner
(
dict
(
type
=
'OpenICLEvalTask'
),
def
fill
_eval_
cfg
(
cfg
,
args
):
new_cfg
=
dict
(
eval
=
dict
(
partitioner
=
dict
(
type
=
get_config_type
(
NaivePartitioner
)),
r
unner
=
dict
(
max_num_workers
=
args
.
max_num_workers
,
partition
=
args
.
partition
,
quotatype
=
args
.
quotatype
,
qos
=
args
.
qos
,
retry
=
args
.
retry
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
task
=
dict
(
type
=
get_config_type
(
OpenICLEvalTask
)),
lark_bot_url
=
cfg
[
'lark_bot_url'
],
)))
if
args
.
slurm
:
new_cfg
[
'eval'
][
'runner'
][
'type'
]
=
get_config_type
(
SlurmRunner
)
new_cfg
[
'eval'
][
'runner'
][
'partition'
]
=
args
.
partition
new_cfg
[
'eval'
][
'runner'
][
'quotatype'
]
=
args
.
quotatype
new_cfg
[
'eval'
][
'runner'
][
'qos'
]
=
args
.
qos
new_cfg
[
'eval'
][
'runner'
][
'retry'
]
=
args
.
retry
elif
args
.
dlc
:
runner
=
DLCRunner
(
dict
(
type
=
'OpenICLEvalTask'
),
max_num_workers
=
args
.
max_num_workers
,
aliyun_cfg
=
Config
.
fromfile
(
args
.
aliyun_cfg
),
retry
=
args
.
retry
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
new_cfg
[
'eval'
][
'runner'
][
'type'
]
=
get_config_type
(
DLCRunner
)
new_cfg
[
'eval'
][
'runner'
][
'aliyun_cfg'
]
=
Config
.
fromfile
(
args
.
aliyun_cfg
)
new_cfg
[
'eval'
][
'runner'
][
'retry'
]
=
args
.
retry
else
:
runner
=
LocalRunner
(
task
=
dict
(
type
=
'OpenICLEvalTask'
),
max_num_workers
=
args
.
max_num_workers
,
debug
=
args
.
debug
,
lark_bot_url
=
cfg
[
'lark_bot_url'
])
runner
(
tasks
)
new_cfg
[
'eval'
][
'runner'
][
'type'
]
=
get_config_type
(
LocalRunner
)
new_cfg
[
'eval'
][
'runner'
][
'max_workers_per_gpu'
]
=
args
.
max_workers_per_gpu
cfg
.
merge_from_dict
(
new_cfg
)
run.py
View file @
ce65d339
...
...
@@ -6,13 +6,12 @@ from datetime import datetime
from
mmengine.config
import
Config
,
DictAction
from
opencompass.partitioners
import
(
MultimodalNaivePartitioner
,
NaivePartitioner
,
SizePartitioner
)
from
opencompass.partitioners
import
MultimodalNaivePartitioner
from
opencompass.registry
import
PARTITIONERS
,
RUNNERS
from
opencompass.runners
import
SlurmRunner
from
opencompass.utils
import
LarkReporter
,
Summarizer
,
get_logger
from
opencompass.utils.run
import
(
exec_
eval_runner
,
exec_infer_runner
,
exec_mm
_infer_
runner
,
get_config_from_arg
)
from
opencompass.utils.run
import
(
exec_
mm_infer_runner
,
fill_eval_cfg
,
fill
_infer_
cfg
,
get_config_from_arg
)
def
parse_args
():
...
...
@@ -245,20 +244,10 @@ def main():
tasks
=
partitioner
(
cfg
)
exec_mm_infer_runner
(
tasks
,
args
,
cfg
)
return
elif
args
.
dlc
or
args
.
slurm
or
cfg
.
get
(
'infer'
,
None
)
is
None
:
# Use SizePartitioner to split into subtasks
partitioner
=
SizePartitioner
(
osp
.
join
(
cfg
[
'work_dir'
],
'predictions/'
),
max_task_size
=
args
.
max_partition_size
,
gen_task_coef
=
args
.
gen_task_coef
)
tasks
=
partitioner
(
cfg
)
if
args
.
dry_run
:
return
# execute the infer subtasks
exec_infer_runner
(
tasks
,
args
,
cfg
)
# If they have specified "infer" in config and haven't used --slurm
# or --dlc, just follow the config
else
:
if
args
.
dlc
or
args
.
slurm
or
cfg
.
get
(
'infer'
,
None
)
is
None
:
fill_infer_cfg
(
cfg
,
args
)
if
args
.
partition
is
not
None
:
if
RUNNERS
.
get
(
cfg
.
infer
.
runner
.
type
)
==
SlurmRunner
:
cfg
.
infer
.
runner
.
partition
=
args
.
partition
...
...
@@ -270,8 +259,8 @@ def main():
cfg
.
infer
.
runner
.
debug
=
True
if
args
.
lark
:
cfg
.
infer
.
runner
.
lark_bot_url
=
cfg
[
'lark_bot_url'
]
cfg
.
infer
.
partitioner
[
'out_dir'
]
=
osp
.
join
(
cfg
[
'work_dir'
],
'predictions/'
)
cfg
.
infer
.
partitioner
[
'out_dir'
]
=
osp
.
join
(
cfg
[
'work_dir'
],
'predictions/'
)
partitioner
=
PARTITIONERS
.
build
(
cfg
.
infer
.
partitioner
)
tasks
=
partitioner
(
cfg
)
if
args
.
dry_run
:
...
...
@@ -289,18 +278,10 @@ def main():
'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by '
'your runtime arguments.'
)
if
args
.
dlc
or
args
.
slurm
or
cfg
.
get
(
'eval'
,
None
)
is
None
:
# Use NaivePartitioner,not split
partitioner
=
NaivePartitioner
(
osp
.
join
(
cfg
[
'work_dir'
],
'results/'
))
tasks
=
partitioner
(
cfg
)
if
args
.
dry_run
:
return
# execute the eval tasks
exec_eval_runner
(
tasks
,
args
,
cfg
)
# If they have specified "eval" in config and haven't used --slurm
# or --dlc, just follow the config
else
:
fill_eval_cfg
(
cfg
,
args
)
if
args
.
partition
is
not
None
:
if
RUNNERS
.
get
(
cfg
.
infer
.
runner
.
type
)
==
SlurmRunner
:
cfg
.
eval
.
runner
.
partition
=
args
.
partition
...
...
@@ -312,8 +293,7 @@ def main():
cfg
.
eval
.
runner
.
debug
=
True
if
args
.
lark
:
cfg
.
eval
.
runner
.
lark_bot_url
=
cfg
[
'lark_bot_url'
]
cfg
.
eval
.
partitioner
[
'out_dir'
]
=
osp
.
join
(
cfg
[
'work_dir'
],
'results/'
)
cfg
.
eval
.
partitioner
[
'out_dir'
]
=
osp
.
join
(
cfg
[
'work_dir'
],
'results/'
)
partitioner
=
PARTITIONERS
.
build
(
cfg
.
eval
.
partitioner
)
tasks
=
partitioner
(
cfg
)
if
args
.
dry_run
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment