Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
4dd9a3fc
"examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py" did not exist on "f842396367d78bf4bbe294c4940db6940c2494d1"
Unverified
Commit
4dd9a3fc
authored
Oct 18, 2023
by
Leymore
Committed by
GitHub
Oct 18, 2023
Browse files
[Sync] sync with internal codes 20231019 (#488)
parent
2737249f
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1723 additions
and
1648 deletions
+1723
-1648
opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py
...pass/datasets/lawbench/utils/compare_m2_for_evaluation.py
+433
-433
opencompass/datasets/lawbench/utils/function_utils.py
opencompass/datasets/lawbench/utils/function_utils.py
+49
-49
opencompass/datasets/lawbench/utils/modules/alignment.py
opencompass/datasets/lawbench/utils/modules/alignment.py
+333
-333
opencompass/datasets/lawbench/utils/modules/annotator.py
opencompass/datasets/lawbench/utils/modules/annotator.py
+76
-76
opencompass/datasets/lawbench/utils/modules/classifier.py
opencompass/datasets/lawbench/utils/modules/classifier.py
+151
-151
opencompass/datasets/lawbench/utils/modules/merger.py
opencompass/datasets/lawbench/utils/modules/merger.py
+272
-272
opencompass/datasets/lawbench/utils/modules/tokenizer.py
opencompass/datasets/lawbench/utils/modules/tokenizer.py
+92
-92
opencompass/datasets/lawbench/utils/parallel_to_m2.py
opencompass/datasets/lawbench/utils/parallel_to_m2.py
+221
-221
opencompass/runners/dlc.py
opencompass/runners/dlc.py
+96
-21
No files found.
opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/function_utils.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/modules/alignment.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/modules/annotator.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/modules/classifier.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/modules/merger.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/modules/tokenizer.py
View file @
4dd9a3fc
opencompass/datasets/lawbench/utils/parallel_to_m2.py
View file @
4dd9a3fc
opencompass/runners/dlc.py
View file @
4dd9a3fc
import
datetime
import
os
import
os
import
os.path
as
osp
import
os.path
as
osp
import
random
import
random
import
re
import
subprocess
import
subprocess
import
sys
import
time
import
time
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
Dict
,
List
,
Tuple
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
mmengine
import
mmengine
from
mmengine.config
import
ConfigDict
from
mmengine.config
import
ConfigDict
...
@@ -43,6 +46,11 @@ class DLCRunner(BaseRunner):
...
@@ -43,6 +46,11 @@ class DLCRunner(BaseRunner):
self
.
max_num_workers
=
max_num_workers
self
.
max_num_workers
=
max_num_workers
self
.
retry
=
retry
self
.
retry
=
retry
logger
=
get_logger
()
logger
.
warning
(
'To ensure the integrity of the log results, the log displayed '
f
'by
{
self
.
__class__
.
__name__
}
has a 10-second delay.'
)
def
launch
(
self
,
tasks
:
List
[
Dict
[
str
,
Any
]])
->
List
[
Tuple
[
str
,
int
]]:
def
launch
(
self
,
tasks
:
List
[
Dict
[
str
,
Any
]])
->
List
[
Tuple
[
str
,
int
]]:
"""Launch multiple tasks.
"""Launch multiple tasks.
...
@@ -63,18 +71,23 @@ class DLCRunner(BaseRunner):
...
@@ -63,18 +71,23 @@ class DLCRunner(BaseRunner):
status
=
[
self
.
_launch
(
task
,
random_sleep
=
False
)
for
task
in
tasks
]
status
=
[
self
.
_launch
(
task
,
random_sleep
=
False
)
for
task
in
tasks
]
return
status
return
status
def
_launch
(
self
,
cfg
:
ConfigDict
,
random_sleep
:
bool
=
Tru
e
):
def
_launch
(
self
,
cfg
:
ConfigDict
,
random_sleep
:
Optional
[
bool
]
=
Non
e
):
"""Launch a single task.
"""Launch a single task.
Args:
Args:
cfg (ConfigDict): Task config.
cfg (ConfigDict): Task config.
random_sleep (bool): Whether to sleep for a random time before
random_sleep (bool): Whether to sleep for a random time before
running the command. This avoids cluster error when launching
running the command. When Aliyun has many tasks to schedule,
multiple tasks at the same time. Default: True.
its stability decreases. Therefore, when we need to submit a
large number of tasks at once, we adopt the "random_sleep"
strategy. Tasks that would have been submitted all at once are
now evenly spread out over a 10-second period. Default: None.
Returns:
Returns:
tuple[str, int]: Task name and exit code.
tuple[str, int]: Task name and exit code.
"""
"""
if
random_sleep
is
None
:
random_sleep
=
(
self
.
max_num_workers
>
32
)
task
=
TASKS
.
build
(
dict
(
cfg
=
cfg
,
type
=
self
.
task_cfg
[
'type'
]))
task
=
TASKS
.
build
(
dict
(
cfg
=
cfg
,
type
=
self
.
task_cfg
[
'type'
]))
num_gpus
=
task
.
num_gpus
num_gpus
=
task
.
num_gpus
...
@@ -116,7 +129,7 @@ class DLCRunner(BaseRunner):
...
@@ -116,7 +129,7 @@ class DLCRunner(BaseRunner):
# Run command with retry
# Run command with retry
if
self
.
debug
:
if
self
.
debug
:
stdout
=
None
stdout
=
sys
.
stdout
else
:
else
:
out_path
=
task
.
get_log_path
(
file_extension
=
'out'
)
out_path
=
task
.
get_log_path
(
file_extension
=
'out'
)
mmengine
.
mkdir_or_exist
(
osp
.
split
(
out_path
)[
0
])
mmengine
.
mkdir_or_exist
(
osp
.
split
(
out_path
)[
0
])
...
@@ -124,30 +137,92 @@ class DLCRunner(BaseRunner):
...
@@ -124,30 +137,92 @@ class DLCRunner(BaseRunner):
if
random_sleep
:
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
time
.
sleep
(
random
.
randint
(
0
,
10
))
result
=
subprocess
.
run
(
cmd
,
def
_run_within_retry
():
try
:
process
=
subprocess
.
Popen
(
cmd
,
shell
=
True
,
text
=
True
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
job_id
=
None
job_allocated
=
False
job_finished
=
False
last_end_time
=
datetime
.
datetime
.
now
().
strftime
(
'%Y-%m-%dT%H:%M:%SZ'
)
while
True
:
if
not
job_allocated
:
line
=
process
.
stdout
.
readline
()
if
not
line
:
break
match
=
re
.
search
(
r
'(dlc[0-9a-z]+)'
,
line
)
if
match
and
job_id
is
None
:
job_id
=
match
.
group
(
1
)
stdout
.
write
(
line
)
match
=
re
.
search
(
r
'Job .* is \[Running\]'
,
line
)
if
match
:
job_allocated
=
True
else
:
try
:
process
.
wait
(
10
)
except
subprocess
.
TimeoutExpired
:
pass
else
:
job_finished
=
True
if
job_finished
:
this_end_time
=
datetime
.
datetime
.
now
(
).
strftime
(
'%Y-%m-%dT%H:%M:%SZ'
)
else
:
this_end_time
=
(
datetime
.
datetime
.
now
()
-
datetime
.
timedelta
(
seconds
=
10
)
).
strftime
(
'%Y-%m-%dT%H:%M:%SZ'
)
logs_cmd
=
(
'dlc logs'
f
'
{
job_id
}
{
job_id
}
-worker-0'
f
' --start_time
{
last_end_time
}
'
f
' --end_time
{
this_end_time
}
'
f
" -c
{
self
.
aliyun_cfg
[
'dlc_config_path'
]
}
"
)
log_process
=
subprocess
.
Popen
(
logs_cmd
,
shell
=
True
,
shell
=
True
,
text
=
True
,
text
=
True
,
stdout
=
stdout
,
stdout
=
subprocess
.
PIPE
,
stderr
=
stdout
)
stderr
=
subprocess
.
PIPE
)
log_output
,
log_err
=
log_process
.
communicate
()
log_output
=
'
\n
'
.
join
(
log_output
.
split
(
'
\n
'
)[
2
:])
stdout
.
write
(
log_output
)
last_end_time
=
this_end_time
stdout
.
flush
()
if
job_finished
:
break
process
.
wait
()
return
process
.
returncode
finally
:
if
job_id
is
not
None
:
cancel_cmd
=
(
'dlc stop job'
f
'
{
job_id
}
'
f
" -c
{
self
.
aliyun_cfg
[
'dlc_config_path'
]
}
"
' -f'
)
subprocess
.
run
(
cancel_cmd
,
shell
=
True
,
text
=
True
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
return_code
=
_run_within_retry
()
retry
=
self
.
retry
retry
=
self
.
retry
output_paths
=
task
.
get_output_paths
()
output_paths
=
task
.
get_output_paths
()
while
self
.
_job_failed
(
result
.
returncode
,
while
self
.
_job_failed
(
return_code
,
output_paths
)
and
retry
>
0
:
output_paths
)
and
retry
>
0
:
retry
-=
1
retry
-=
1
if
random_sleep
:
time
.
sleep
(
random
.
randint
(
0
,
10
))
# Re-generate command to refresh ports.
cmd
=
get_cmd
()
cmd
=
get_cmd
()
result
=
subprocess
.
run
(
cmd
,
return_code
=
_run_within_retry
()
shell
=
True
,
text
=
True
,
stdout
=
stdout
,
stderr
=
stdout
)
finally
:
finally
:
# Clean up
# Clean up
os
.
remove
(
param_file
)
os
.
remove
(
param_file
)
return
task_name
,
result
.
returncode
return
task_name
,
return_code
def
_job_failed
(
self
,
return_code
:
int
,
output_paths
:
List
[
str
])
->
bool
:
def
_job_failed
(
self
,
return_code
:
int
,
output_paths
:
List
[
str
])
->
bool
:
return
return_code
!=
0
or
not
all
(
return
return_code
!=
0
or
not
all
(
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment