Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
805293a9
Unverified
Commit
805293a9
authored
Jul 07, 2023
by
Ma Zerun
Committed by
GitHub
Jul 07, 2023
Browse files
Auto re-generate port number during retry (#24)
* Auto re-generate port number during retry * Fix slurm command
parent
efdf116f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
62 additions
and
54 deletions
+62
-54
opencompass/runners/dlc.py
opencompass/runners/dlc.py
+19
-20
opencompass/runners/local.py
opencompass/runners/local.py
+6
-8
opencompass/runners/slurm.py
opencompass/runners/slurm.py
+11
-11
opencompass/tasks/base.py
opencompass/tasks/base.py
+6
-8
opencompass/tasks/openicl_eval.py
opencompass/tasks/openicl_eval.py
+4
-2
opencompass/tasks/openicl_infer.py
opencompass/tasks/openicl_infer.py
+16
-5
No files found.
opencompass/runners/dlc.py
View file @
805293a9
import
inspect
import
os
import
os.path
as
osp
import
random
import
subprocess
import
time
from
functools
import
partial
from
typing
import
Any
,
Dict
,
List
,
Tuple
import
mmengine
...
...
@@ -82,7 +82,6 @@ class DLCRunner(BaseRunner):
task
=
task_type
(
task_cfg
)
num_gpus
=
task
.
num_gpus
task_name
=
task
.
name
script_path
=
inspect
.
getsourcefile
(
task_type
)
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
...
...
@@ -90,28 +89,26 @@ class DLCRunner(BaseRunner):
task_cfg
.
dump
(
param_file
)
# Build up DLC command
task_cmd_template
=
task
.
get_command_template
()
task_cmd
=
task_cmd_template
.
replace
(
'{SCRIPT_PATH}'
,
script_path
).
replace
(
'{CFG_PATH}'
,
param_file
)
pwd
=
os
.
getcwd
()
shell_cmd
=
(
f
'source
{
self
.
aliyun_cfg
[
"bashrc_path"
]
}
; '
f
'conda activate
{
self
.
aliyun_cfg
[
"conda_env_name"
]
}
; '
f
'cd
{
pwd
}
; '
f
'
{
task_cmd
}
'
)
cmd
=
(
'dlc create job'
f
" --command '
{
shell_cmd
}
'"
f
' --name
{
task_name
[:
512
]
}
'
' --kind BatchJob'
f
" -c
{
self
.
aliyun_cfg
[
'dlc_config_path'
]
}
"
f
" --workspace_id
{
self
.
aliyun_cfg
[
'workspace_id'
]
}
"
' --worker_count 1'
f
' --worker_cpu
{
max
(
num_gpus
*
6
,
8
)
}
'
f
' --worker_gpu
{
num_gpus
}
'
f
' --worker_memory
{
max
(
num_gpus
*
32
,
48
)
}
'
f
" --worker_image
{
self
.
aliyun_cfg
[
'worker_image'
]
}
"
' --interactive'
)
'{task_cmd}'
)
tmpl
=
(
'dlc create job'
f
" --command '
{
shell_cmd
}
'"
f
' --name
{
task_name
[:
512
]
}
'
' --kind BatchJob'
f
" -c
{
self
.
aliyun_cfg
[
'dlc_config_path'
]
}
"
f
" --workspace_id
{
self
.
aliyun_cfg
[
'workspace_id'
]
}
"
' --worker_count 1'
f
' --worker_cpu
{
max
(
num_gpus
*
6
,
8
)
}
'
f
' --worker_gpu
{
num_gpus
}
'
f
' --worker_memory
{
max
(
num_gpus
*
32
,
48
)
}
'
f
" --worker_image
{
self
.
aliyun_cfg
[
'worker_image'
]
}
"
' --interactive'
)
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
logger
.
debug
(
f
'Running command:
{
cmd
}
'
)
...
...
@@ -138,6 +135,8 @@ class DLCRunner(BaseRunner):
retry
-=
1
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
# Re-generate command to refresh ports.
cmd
=
get_cmd
()
result
=
subprocess
.
run
(
cmd
,
shell
=
True
,
text
=
True
,
...
...
opencompass/runners/local.py
View file @
805293a9
import
inspect
import
os
import
os.path
as
osp
import
subprocess
import
time
from
concurrent.futures
import
ThreadPoolExecutor
from
functools
import
partial
from
threading
import
Lock
from
typing
import
Any
,
Dict
,
List
,
Tuple
...
...
@@ -108,7 +108,6 @@ class LocalRunner(BaseRunner):
"""
task_name
=
task
.
name
script_path
=
inspect
.
getsourcefile
(
type
(
task
))
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
...
...
@@ -116,12 +115,11 @@ class LocalRunner(BaseRunner):
task
.
cfg
.
dump
(
param_file
)
# Build up slurm command
task_cmd_template
=
task
.
get_command_template
()
task_cmd
=
task_cmd_template
.
replace
(
'{SCRIPT_PATH}'
,
script_path
).
replace
(
'{CFG_PATH}'
,
param_file
)
cmd
=
'CUDA_VISIBLE_DEVICES='
+
','
.
join
(
str
(
i
)
for
i
in
gpu_ids
)
+
' '
cmd
+=
task_cmd
tmpl
=
'CUDA_VISIBLE_DEVICES='
+
','
.
join
(
str
(
i
)
for
i
in
gpu_ids
)
tmpl
+=
' {task_cmd}'
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
logger
.
debug
(
f
'Running command:
{
cmd
}
'
)
...
...
opencompass/runners/slurm.py
View file @
805293a9
import
inspect
import
os
import
os.path
as
osp
import
random
import
subprocess
import
time
from
functools
import
partial
from
typing
import
Any
,
Dict
,
List
,
Tuple
import
mmengine
...
...
@@ -85,7 +85,6 @@ class SlurmRunner(BaseRunner):
task
=
task_type
(
task_cfg
)
num_gpus
=
task
.
num_gpus
task_name
=
task
.
name
script_path
=
inspect
.
getsourcefile
(
task_type
)
# Dump task config to file
mmengine
.
mkdir_or_exist
(
'tmp/'
)
...
...
@@ -93,18 +92,17 @@ class SlurmRunner(BaseRunner):
task_cfg
.
dump
(
param_file
)
# Build up slurm command
task_cmd_template
=
task
.
get_command_template
()
task_cmd
=
task_cmd_template
.
replace
(
'{SCRIPT_PATH}'
,
script_path
).
replace
(
'{CFG_PATH}'
,
param_file
)
cmd
=
'srun'
tmpl
=
'srun'
if
self
.
partition
:
cmd
+=
f
' -p
{
self
.
partition
}
'
tmpl
+=
f
' -p
{
self
.
partition
}
'
if
self
.
quotatype
:
cmd
+=
f
' --quotatype=
{
self
.
quotatype
}
'
tmpl
+=
f
' --quotatype=
{
self
.
quotatype
}
'
if
num_gpus
>
0
:
cmd
+=
f
' --gres=gpu:
{
num_gpus
}
'
cmd
+=
f
" -N1 -J '
{
task_name
[:
512
]
}
'
{
task_cmd
}
"
tmpl
+=
f
' --gres=gpu:
{
num_gpus
}
'
tmpl
+=
f
" -N1 -J '
{
task_name
[:
512
]
}
'"
+
' {task_cmd}'
get_cmd
=
partial
(
task
.
get_command
,
cfg_path
=
param_file
,
template
=
tmpl
)
cmd
=
get_cmd
()
logger
=
get_logger
()
logger
.
debug
(
f
'Running command:
{
cmd
}
'
)
...
...
@@ -130,6 +128,8 @@ class SlurmRunner(BaseRunner):
retry
-=
1
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
# Re-generate command to refresh ports.
cmd
=
get_cmd
()
result
=
subprocess
.
run
(
cmd
,
shell
=
True
,
text
=
True
,
...
...
opencompass/tasks/base.py
View file @
805293a9
...
...
@@ -10,7 +10,7 @@ from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
class
BaseTask
:
"""Base class for all tasks. There are two ways to run the task:
1. Directly by calling the `run` method.
2. Calling the `get_command
_template
` method to get the command
template
,
2. Calling the `get_command` method to get the command,
and then run the command in the shell.
Args:
...
...
@@ -35,15 +35,13 @@ class BaseTask:
"""Run the task."""
@
abstractmethod
def
get_command
_
template
(
self
)
->
str
:
def
get_command
(
self
,
cfg_path
,
template
)
->
str
:
"""Get the command template for the task.
The command template should
contain the following placeholders:
1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
the script file of the task.
2. ``{CFG_PATH}`` This placeholder will be replaced by the
path to the config file of the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template containing a '{task_cmd}' placeholder
that is filled in with the command.
"""
@
property
...
...
opencompass/tasks/openicl_eval.py
View file @
805293a9
...
...
@@ -31,8 +31,10 @@ class OpenICLEvalTask(BaseTask):
self
.
num_gpus
=
0
self
.
logger
=
get_logger
()
def
get_command_template
(
self
):
return
'python3 {SCRIPT_PATH} {CFG_PATH}'
def
get_command
(
self
,
cfg_path
,
template
):
script_path
=
__file__
command
=
f
'python3
{
script_path
}
{
cfg_path
}
'
return
template
.
format
(
task_cmd
=
command
)
def
run
(
self
):
for
model_cfg
,
dataset_cfgs
in
zip
(
self
.
model_cfgs
,
self
.
dataset_cfgs
):
...
...
opencompass/tasks/openicl_infer.py
View file @
805293a9
...
...
@@ -31,13 +31,24 @@ class OpenICLInferTask(BaseTask):
self
.
num_gpus
=
run_cfg
.
get
(
'num_gpus'
,
0
)
self
.
num_procs
=
run_cfg
.
get
(
'num_procs'
,
1
)
def
get_command_template
(
self
):
def
get_command
(
self
,
cfg_path
,
template
):
"""Get the command for the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template containing a '{task_cmd}' placeholder
that is filled in with the command.
"""
script_path
=
__file__
if
self
.
num_gpus
>
0
:
return
(
f
'torchrun --master_port=
{
random
.
randint
(
12000
,
32000
)
}
'
f
'--nproc_per_node
{
self
.
num_procs
}
'
'{SCRIPT_PATH} {CFG_PATH}'
)
port
=
random
.
randint
(
12000
,
32000
)
command
=
(
f
'torchrun --master_port=
{
port
}
'
f
'--nproc_per_node
{
self
.
num_procs
}
'
f
'
{
script_path
}
{
cfg_path
}
'
)
else
:
return
(
'python {SCRIPT_PATH} {CFG_PATH}'
)
command
=
'python {script_path} {cfg_path}'
return
template
.
format
(
task_cmd
=
command
)
def
run
(
self
):
for
model_cfg
,
dataset_cfgs
in
zip
(
self
.
model_cfgs
,
self
.
dataset_cfgs
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment