OpenDAS / opencompass · Commits

Commit 8c85edd1 (Unverified)
[Sync] deprecate old mbpps (#1064)
Authored Apr 19, 2024 by Fengzhe Zhou; committed by GitHub on Apr 19, 2024
Parent: c1724013

Showing 15 changed files with 394 additions and 251 deletions (+394 -251)
docs/zh_cn/advanced_guides/code_eval.md                 +3    -3
opencompass/cli/main.py                                 +6    -0
opencompass/datasets/apps.py                            +1    -3
opencompass/datasets/mbpp.py                            +40   -36
opencompass/datasets/taco.py                            +1    -3
opencompass/models/openai_api.py                        +20   -10
opencompass/models/qwen_api.py                          +1    -1
opencompass/runners/dlc.py                              +10   -0
opencompass/runners/slurm_sequential.py                 +1    -0
opencompass/summarizers/needlebench.py                  +6    -3
opencompass/summarizers/subjective/compass_arena.py     +151  -144
opencompass/summarizers/subjective/mtbench.py           +51   -46
opencompass/tasks/openicl_eval.py                       +3    -2
opencompass/tasks/outer_eval/alpacaeval.py              +5    -0
opencompass/utils/run.py                                +95   -0
docs/zh_cn/advanced_guides/code_eval.md

@@ -4,7 +4,7 @@
 ## pass@1

-If you only need a single generated response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), and follow the general [quick start tutorial](../get_started/quick_start.md).
+If you only need a single generated response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), and follow the general [quick start tutorial](../get_started/quick_start.md).

 For multilingual evaluation, please refer to the [multilingual code evaluation tutorial](./code_eval_service.md).

@@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

 mbpp_datasets[0]['type'] = MBPPDataset_V2
 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator

@@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

 humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
 humaneval_datasets[0]['num_repeats'] = 10
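Putting the two documentation hunks together, a minimal pass@k-style config after this change might look like the sketch below. The surrounding `datasets` assembly is assumed from common OpenCompass config conventions and is not spelled out in this diff; only the import paths and the attribute tweaks are taken from the hunks above.

    from mmengine.config import read_base

    from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

    with read_base():
        from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
        # the old `.datasets.mbpp.mbpp_gen_1e1056` path is deprecated by this commit
        from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

    # switch MBPP to the pass@k-capable dataset and evaluator, as in the docs diff
    mbpp_datasets[0]['type'] = MBPPDataset_V2
    mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator

    # repeat HumanEval 10 times for pass@10, as in the second docs hunk
    humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
    humaneval_datasets[0]['num_repeats'] = 10

    datasets = humaneval_datasets + mbpp_datasets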
opencompass/cli/main.py

@@ -56,6 +56,12 @@ def parse_args():
                         'to run',
                         action='store_true',
                         default=False)
+    parser.add_argument('--accelerator',
+                        help='Infer accelerator, support vllm and lmdeploy now.',
+                        choices=['vllm', 'lmdeploy', 'hg'],
+                        default='hg',
+                        type=str)
     parser.add_argument('-m',
                         '--mode',
                         help='Running mode. You can choose "infer" if you '
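The new `--accelerator` flag is consumed further down this commit by the `change_accelerator` helper added to `opencompass/utils/run.py`. A minimal sketch of the expected wiring, where `models` is assumed to be the list of HuggingFace-style model dicts built from the CLI arguments:

    # Illustrative only; the dispatch mirrors the run.py hunks later in this commit.
    args = parse_args()                      # now carries args.accelerator ('hg' by default)
    if args.accelerator in ['vllm', 'lmdeploy']:
        models = change_accelerator(models, args.accelerator)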
opencompass/datasets/apps.py

@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger

 from .base import BaseDataset

-logger = get_logger()
 TIMEOUT = 10

@@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')

 timeout = 4  # seconds
opencompass/datasets/mbpp.py

@@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
         multiple responses in special cases.
         """

+        def processing_test(example):
+            example['test_case'] = example['test_list']
+            example['test_list'] = '\n'.join(example['test_list'])
+            example['test_list_2'] = example['test_list']
+            example['test_column'] = dict(test_list_2=example['test_list'],
+                                          task_id=example['task_id'])
+            return example
+
         dataset = []
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
-                dataset.extend(
-                    [json.loads(line.strip()) for _ in range(num_repeats)])
+                example = json.loads(line.strip())
+                example = processing_test(example)
+                dataset.extend([example for _ in range(num_repeats)])
         return Dataset.from_list(dataset)

@@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
                     predictions)):
                 pred = self._process_answer(pred)
                 programs = self._process_test(refer, pred)
-                future = executor.submit(execution, programs, i, 3)
+                future = executor.submit(execution, programs, i, 10)
                 futures.append(future)
                 details[str(i)] = {}
                 details[str(i)]['origin'] = predictions[i]

@@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
         return {f'mbpp_plus_{k}': score[k] * 100 for k in score}

     def _process_answer(self, text):
-        try:
-            # for chatGLM related text
-            eval_text = eval(text)
-        except Exception:
-            pass
-        else:
-            if isinstance(eval_text, str):
-                text = eval_text
-        # deal with code block
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```xxx
-                    text = text[max(text.find('\n') + 1, 0):]
-        text = text.strip()
-        match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
-        if match:
-            text = text[:match.start()]
-        match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
-        if match:
-            text = text[match.end():]
-        text = text.strip()
-        if text.startswith("'"):
-            text = text[1:]
-        if text.endswith("'"):
-            text = text[:-1]
-        text = text.replace('\\', '')
-        match = re.search(r'```python(.*)```', text, re.DOTALL)
-        if match:
-            text = match.group(1).strip().split('```')[0].strip()
+        patterns = [
+            r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
+            r"BEGIN\s*'(.*)'\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)'\s*DONE",
+            r"BEGIN\s*'(.*)'\s*DONE",
+            r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
+            r"BEGIN\s*'(.*)\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)\s*DONE",
+            r"BEGIN\s*'(.*)\s*DONE",
+            r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
+            r'BEGIN\s*(.*)\s*\[DONE\]',
+            r'\[BEGIN\]\s*(.*)\s*DONE',
+            r'BEGIN\s*(.*)\s*DONE',
+            r'```python\s*(.*)\s*```',
+            r'```\s*(.*)\s*```',
+            r'(.*)\s*```.*',
+            r"\[BEGIN\]\s*'(.*)",
+            r'\[BEGIN\](.*)',
+        ]
+        for p in patterns:
+            match = re.search(p, text, re.DOTALL)
+            if match:
+                text = match.group(1)
+                break
+        text = text.split('```')[0]
+        text = re.split(r"'?\s*\[?DONE\]?", text)[0]
+        text = text.replace('\\_', '_')
+        text = text.strip()
         return text

     def _process_test(self, test_case, pred):

@@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 3)
+                    future = executor.submit(execution, programs, task_id, 10)
                     futures.append(future)

             from tqdm import tqdm
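The rewritten `_process_answer` now tries a fixed list of extraction patterns before a generic cleanup pass. A self-contained illustration of that flow on an invented completion string; only two of the seventeen patterns from the diff are reproduced here for brevity:

    import re

    text = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"

    patterns = [
        r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",   # quoted body between the markers
        r'\[BEGIN\]\s*(.*)\s*\[DONE\]',     # unquoted fallback
    ]
    for p in patterns:
        match = re.search(p, text, re.DOTALL)
        if match:
            text = match.group(1)
            break
    text = text.split('```')[0]
    text = re.split(r"'?\s*\[?DONE\]?", text)[0]
    text = text.replace('\\_', '_').strip()
    print(text)   # -> the bare function body, ready to be run against the test list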
opencompass/datasets/taco.py

@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger

 from .base import BaseDataset

-logger = get_logger()
 TIMEOUT = 10

@@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')

 timeout = 4  # seconds
opencompass/models/openai_api.py

@@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
         self.top_logprobs = top_logprobs

         if isinstance(key, str):
-            self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
+            if key == 'ENV':
+                if 'OPENAI_API_KEY' not in os.environ:
+                    raise ValueError('OpenAI API key is not set.')
+                self.keys = os.getenv('OPENAI_API_KEY').split(',')
+            else:
+                self.keys = [key]
         else:
             self.keys = key

@@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
         self.url = openai_api_base
         self.path = path

-    def generate(
-        self,
-        inputs: List[PromptType],
-        max_out_len: int = 512,
-        temperature: float = 0.7,
-    ) -> List[str]:
+    def generate(self,
+                 inputs: List[PromptType],
+                 max_out_len: int = 512,
+                 temperature: float = 0.7,
+                 **kwargs) -> List[str]:
         """Generate results given a list of inputs.

         Args:

@@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
         }
         for _ in range(self.retry):
             self.wait()
-            raw_response = requests.post(self.url,
-                                         headers=self.headers,
-                                         data=json.dumps(data))
+            try:
+                raw_response = requests.post(self.url,
+                                             headers=self.headers,
+                                             data=json.dumps(data))
+            except requests.ConnectionError:
+                self.logger.error('Request error, got',
+                                  str(raw_response.content))
+                time.sleep(1)
+                continue
             try:
                 response = raw_response.json()
             except requests.JSONDecodeError:
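With this change, `OpenAI(key='ENV')` accepts a comma-separated list of keys in the environment variable instead of a single key, and raises early when the variable is missing. A small standalone sketch of the new splitting behaviour (the key values are invented):

    import os

    # Hypothetical keys, set only for the sake of the example.
    os.environ['OPENAI_API_KEY'] = 'sk-key-one,sk-key-two'

    # Mirrors the new branch in OpenAI.__init__ for key == 'ENV'.
    if 'OPENAI_API_KEY' not in os.environ:
        raise ValueError('OpenAI API key is not set.')
    keys = os.getenv('OPENAI_API_KEY').split(',')
    print(keys)   # -> ['sk-key-one', 'sk-key-two']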
opencompass/models/qwen_api.py

@@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
                 time.sleep(1)
                 continue
             if response.status_code == 429:
-                print('Rate limited')
+                print(response)
                 time.sleep(2)
                 continue
             if response.status_code == 400:
opencompass/runners/dlc.py

@@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
         pod_create_time = None
         pri_time = None
         initial_time = datetime.datetime.now()
+
+        url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com'  # noqa: E501
+        logger = get_logger()
+        logger.debug('')
+        logger.debug('*' * 168)
+        logger.debug(
+            f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail'  # noqa: E501
+        )
+        logger.debug('*' * 168)
+
         while True:
             # 1. Avoid to request dlc too frequently.
             # 2. DLC job may not be ready immediately after creation.
opencompass/runners/slurm_sequential.py

@@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
             tmpl += f' --gres=gpu:{num_gpus}'
         for extra_cmd in self.extra_command:
             tmpl += f' {extra_cmd}'
+        tmpl += ' -x HOST-10-140-60-7'
         tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
         get_cmd = partial(task.get_command,
                           cfg_path=param_file,
opencompass/summarizers/needlebench.py

@@ -72,7 +72,7 @@ dataset_mapping_dict = {}
 needle_counts = ['2', '3', '4', '5']
 languages = ['en', 'zh']
-sizes = ['4k', '8k', '32k', '200k', '1000k']
+sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
 types = ['origin', 'parallel']
 for needle_count in needle_counts:

@@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
     numbers = [2, 3, 4, 5]
     languages = ['en', 'zh']
     size_exists = []
-    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']
+    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']
     for size in sizes_origin:
         if size in content:

@@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
                      markersize=8,
                      label='Average Depth Score')
+            for x_value, y_value in zip(x_data, y_data):
+                ax2.text(x_value, y_value, f'{y_value:.2f}',
+                         ha='center', va='top')
             ax2.set_ylim(0, 100)
             ax2.set_yticklabels([])

@@ -353,7 +356,7 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
         new_save_path = os.path.join(directory_path, new_filename)
         plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
-        print(f'Saved : {new_save_path}')
+        print(f'Saved: {new_save_path}')
         plt.close()
opencompass/summarizers/subjective/compass_arena.py

-# flake8: noqa: E501
-import ast
+# flake8: noqa
+# yapf: disable
 import csv
 import os
 import os.path as osp
 import re

@@ -10,7 +9,7 @@ from itertools import product
 import mmengine
 from mmengine import ConfigDict
-from prettytable import from_csv
+from tabulate import tabulate

 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

@@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 from .utils import get_judgeanswer_and_reference, get_outdir


+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_compass_arena(s):
     if result := re.findall('(?:选择:|Choice: )([ABC])', s):
         return result[0]

@@ -68,17 +73,90 @@ class CompassArenaSummarizer:
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_models = self.cfg.get('judge_models', None)
-        self.meta_judge_model = self.cfg.eval.partitioner.get(
-            'meta_judge_model', None)
+        self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_type = judge_type
         assert self.judge_type in ['general']
-        self.judge_map = {
-            'general': post_process_compass_arena,
-        }
+        self.judge_map = {'general': post_process_compass_arena}
         self.judge_function = self.judge_map[self.judge_type]
         self.check_pos_bias = check_pos_bias
         self.summary_type = summary_type

+    def get_score(self, time_str):
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+
+        if self.meta_judge_model is not None:
+            self.judge_models.append(self.meta_judge_model)
+
+        scores = {}
+
+        for idx, judge_model_cfg in enumerate(self.judge_models):
+            judge_model = model_abbr_from_cfg(judge_model_cfg)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                for model_pair in unique_combinations:
+                    model1 = model_pair[0]['abbr']
+                    model2 = model_pair[1]['abbr']
+                    if idx == len(self.judge_models):
+                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+                    else:
+                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if not os.path.isdir(subdir_path):
+                        print(subdir_path + ' is not exist! please check!')
+                        continue
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, self.judge_function)
+                    if self.check_pos_bias:
+                        bias_num = check_position_bias(judged_answers, references)
+                    else:
+                        bias_num = 0
+                    win_model1 = defaultdict(float)
+                    win_model2 = defaultdict(float)
+                    categories = defaultdict(float)
+                    model1 = references[0]['answer1']
+                    model2 = references[0]['answer2']
+                    for prediction, reference in zip(judged_answers, references):
+                        categories[dataset_abbr] += 1
+                        categories[reference['capability']] += 1
+                        if prediction == 'A':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 1, 0
+                            else:
+                                score_1, score_2 = 0, 1
+                        elif prediction == 'B':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 0, 1
+                            else:
+                                score_1, score_2 = 1, 0
+                        elif prediction == 'C':
+                            if self.summary_type == 'half_add':
+                                score_1, score_2 = 0.5, 0.5
+                            else:
+                                score_1, score_2 = 0, 0
+                        win_model1[reference['capability']] += score_1
+                        win_model1[dataset_abbr] += score_1
+                        win_model2[reference['capability']] += score_2
+                        win_model2[dataset_abbr] += score_2
+                    for capability in categories:
+                        win_model1[capability] = win_model1[capability] / categories[capability] * 100
+                        win_model1[capability] = round(win_model1[capability], 2)
+                        win_model2[capability] = win_model2[capability] / categories[capability] * 100
+                        win_model2[capability] = round(win_model2[capability], 2)
+                    win_model1['position_bias'] = bias_num
+                    win_model2['position_bias'] = bias_num
+                    if judge_model not in scores:
+                        scores[judge_model] = {}
+                    if dataset_abbr not in scores[judge_model]:
+                        scores[judge_model][dataset_abbr] = {}
+                    scores[judge_model][dataset_abbr][model2] = win_model2
+
+        return scores
+
     def summarize(
             self,
             time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),

@@ -91,143 +169,72 @@ class CompassArenaSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
-        dataset_cfgs = self.cfg['datasets']
+        scores = self.get_score(time_str)
+        # scores['win_' + model1] = win_model1
         output_dir, results_folder = get_outdir(self.cfg, time_str)
-        model_combinations = list(
-            product(self.base_models, self.compare_models))
-        unique_combinations = remove_duplicate_pairs(
-            [combo for combo in model_combinations if combo[0] != combo[1]])
-        fout_list = []
-        pre_len = len(self.judge_models)
-        if self.meta_judge_model is not None:
-            self.judge_models.append(self.meta_judge_model)
-            meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
-        else:
-            meta_judge_model_abbr = None
         for idx, judge_model in enumerate(self.judge_models):
-            judge_model = model_abbr_from_cfg(judge_model)
-            for dataset in dataset_cfgs:
+            judge_abbr = model_abbr_from_cfg(judge_model)
+            for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
-                if idx == pre_len:
-                    fout = osp.join(
-                        output_dir, 'summarized-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                else:
-                    fout = osp.join(
-                        output_dir, 'judged-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                fout_list.append(fout)
-                for model_pair in unique_combinations:
-                    model1, model2, = model_pair[0]['abbr'], model_pair[1]['abbr'],
-                    if idx == pre_len:
-                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
-                    else:
-                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
-                    subdir_path = os.path.join(results_folder, subdir)
-                    if os.path.isdir(subdir_path):
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset,
-                            subdir_path,
-                            self.judge_function,
-                        )
-                        if self.check_pos_bias:
-                            bias_num = check_position_bias(judged_answers, references)
-                        else:
-                            bias_num = 0
-                        win_model1, win_model2, categories = defaultdict(float), defaultdict(float), defaultdict(float)
-                        model1, model2 = references[0]['answer1'], references[0]['answer2']
-                        for prediction, reference in zip(judged_answers, references):
-                            if self.summary_type == 'single':
-                                if prediction == 'A':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                            elif self.summary_type == 'half_add':
-                                categories['total'] += 1
-                                categories[reference['capability']] += 1
-                                if prediction == 'A':
-                                    if reference['answer1'] == model1:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    if reference['answer1'] == model1:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                elif prediction == 'C':
-                                    win_model1[reference['capability']] += 0.5
-                                    win_model1['total'] += 0.5
-                                    win_model2[reference['capability']] += 0.5
-                                    win_model2['total'] += 0.5
-                        for capability in categories:
-                            if capability not in win_model1:
-                                win_model1[capability] = 0.0
-                            else:
-                                win_model1[capability] = round(
-                                    (win_model1[capability] / categories[capability]) * 100, 2)
-                            if capability not in win_model2:
-                                win_model2[capability] = 0.0
-                            else:
-                                win_model2[capability] = round(
-                                    (win_model2[capability] / categories[capability]) * 100, 2)
-                        win_model1['position_bias'] = bias_num
-                        win_model2['position_bias'] = bias_num
-                        scores = {
-                            'win_' + model1: win_model1,
-                            'win_' + model2: win_model2
-                        }
-                        rows = list(scores.keys())
-                        columns = list(scores[rows[0]].keys())
-                        columns.insert(0, columns.pop(columns.index('total')))
-                        columns.insert(1, columns.pop(columns.index('position_bias')))
-                        with open(fout, 'a+', newline='') as csvfile:
-                            writer = csv.writer(csvfile)
-                            writer.writerow([model1 + '_vs_' + model2] + columns)
-                            for row in rows:
-                                writer.writerow([row] + [scores[row][column] for column in columns])
-                    else:
-                        print(subdir_path + ' is not exist! please check!')
-        for fout in fout_list:
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(fout)
-            print(x)
+                summarizer_model_abbrs = [
+                    model_abbr_from_cfg_used_in_summarizer(i)
+                    for i in self.compare_models
+                ]
+                one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+                row_headers = [
+                    i for i in one_column.keys()
+                    if i not in [dataset_abbr, 'position_bias']
+                ]
+                row_headers = [dataset_abbr, 'position_bias'] + row_headers
+                headers = [''] + summarizer_model_abbrs
+                table = []
+                for row_header in row_headers:
+                    row = [row_header]
+                    for model_cfg in self.compare_models:
+                        model_abbr = model_abbr_from_cfg(model_cfg)
+                        s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
+                        if isinstance(s, float):
+                            s = f'{s:.2f}'
+                        if isinstance(s, int):
+                            s = str(s)
+                        row.append(s)
+                    table.append(row)
+                txt = tabulate(table, headers=headers)
+                print(txt)
+                if idx == len(self.judge_models):
+                    output_filename = osp.join(
+                        output_dir, 'summarized-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+                else:
+                    output_filename = osp.join(
+                        output_dir, 'judged-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+                with open(output_filename, 'w') as f:
+                    f.write(','.join(headers) + '\n')
+                    for line in table:
+                        f.write(','.join(line) + '\n')
+                print(output_filename)
+
+            table = []
+            summarizer_model_abbrs = [
+                model_abbr_from_cfg_used_in_summarizer(i)
+                for i in self.compare_models
+            ]
+            headers = [''] + summarizer_model_abbrs
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                row = [dataset_abbr]
+                for model_cfg in self.compare_models:
+                    model_abbr = model_abbr_from_cfg(model_cfg)
+                    s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
+                    if isinstance(s, float):
+                        s = f'{s:.2f}'
+                    if isinstance(s, int):
+                        s = str(s)
+                    row.append(s)
+                table.append(row)
+            txt = tabulate(table, headers=headers)
+            print(txt)
+            if idx == len(self.judge_models):
+                output_filename = osp.join(
+                    output_dir,
+                    'summarized-by--' + judge_abbr + '-overall-report.csv')
+            else:
+                output_filename = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_abbr + '-overall-report.csv')
+            with open(output_filename, 'w') as f:
+                f.write(','.join(headers) + '\n')
+                for line in table:
+                    f.write(','.join(line) + '\n')
+            print(output_filename)
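For reference, the judge post-processing used by this summarizer (shown as context above) extracts an A/B/C verdict from the judge output. A small illustration with invented judge strings; the `None` fallback when no verdict is found is an assumption, since the diff truncates the function body:

    import re

    def post_process_compass_arena(s):
        # Matches either the Chinese '选择:' or the English 'Choice: ' prefix.
        if result := re.findall('(?:选择:|Choice: )([ABC])', s):
            return result[0]
        return None  # assumed fallback; not shown in the diff

    print(post_process_compass_arena('Choice: A, model 1 is more helpful.'))  # -> 'A'
    print(post_process_compass_arena('选择:C'))                                # -> 'C'
    print(post_process_compass_arena('no explicit verdict'))                  # -> None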
opencompass/summarizers/subjective/mtbench.py

-# flake8: noqa: E501
+# flake8: noqa
+# yapf: disable
 import csv
 import os
 import os.path as osp

@@ -8,11 +9,7 @@ from datetime import datetime
 import numpy as np
 from mmengine import ConfigDict
-
-try:
-    from prettytable import from_csv
-except ImportError:
-    from_csv = None
+from tabulate import tabulate

 from opencompass.utils import model_abbr_from_cfg

@@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
 from .utils import get_judgeanswer_and_reference, get_outdir


+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_mtbench_pair(judgement: str):
     """Input a string like below:

@@ -52,7 +55,7 @@ def get_capability_results(
         references,
         fout,
         fout_flag,
-        model,
+        model_abbr,
 ):
     capability_ratings = defaultdict(int)
     capability_counts = defaultdict(int)

@@ -70,12 +73,12 @@ def get_capability_results(
         capability_avg_ratings[capability] = s
     columns = list(capability_avg_ratings.keys())
     columns.insert(0, columns.pop(columns.index('total')))
     with open(fout, 'a+', newline='') as csvfile:
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['model'] + columns)
-        writer.writerow([model] + [capability_avg_ratings[column] for column in columns])
+        writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])


 class MTBenchSummarizer(CompassArenaSummarizer):

@@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         self.cfg = config
         if self.judge_type == 'single':
             self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
             self.eval_model_abbrs = [
                 model_abbr_from_cfg(model) for model in self.eval_model_cfgs
             ]
         elif self.judge_type == 'pair':
             self.base_models = self.cfg['eval']['partitioner']['base_models']
-            self.compare_models = self.cfg['eval']['partitioner'][
-                'compare_models']
+            self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_map = {
             'single': post_process_mtbench_single,

@@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         }
         self.judge_function = self.judge_map[self.judge_type]

-    def summarize(self,
-                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+    def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.

         Args:

@@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         Returns:
             pd.DataFrame: The summary results.
         """
-        if self.judge_type == 'single':
-            dataset_cfgs = self.cfg['datasets']
-            output_dir, results_folder = get_outdir(self.cfg, time_str)
-            fout_flag = 0
-            for eval_model_abbr in self.eval_model_abbrs:
-                subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
-                subdir_path = os.path.join(results_folder, subdir)
-                if os.path.isdir(subdir_path):
-                    model, judge_model = eval_model_abbr, self.judge_abbr
-                    fout = osp.join(
-                        output_dir,
-                        'judged-by--' + judge_model + '-capability.csv')
-                    overall_judged_answers, overall_references = [], []
-                    for dataset in dataset_cfgs:
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset, subdir_path, self.judge_function)
-                        overall_judged_answers += judged_answers
-                        overall_references += references
-                    get_capability_results(overall_judged_answers,
-                                           overall_references, fout, fout_flag,
-                                           model)
-                    fout_flag += 1
-                else:
-                    print(subdir_path + ' is not exist! please check!')
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(x)
-            print(fout)
-        elif self.judge_type == 'pair':
-            super().summarize()
+        if self.judge_type == 'pair':
+            return super().summarize()
+
+        # self.judge_type == 'single'
+        dataset_cfgs = self.cfg['datasets']
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        fout_flag = 0
+        for eval_model_cfg in self.eval_model_cfgs:
+            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
+            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
+            subdir_path = os.path.join(results_folder,
+                                       eval_model_abbr + '_judged-by--' + self.judge_abbr)
+            if os.path.isdir(subdir_path):
+                fout = osp.join(output_dir,
+                                'judged-by--' + self.judge_abbr + '-capability.csv')
+                overall_judged_answers, overall_references = [], []
+                for dataset in dataset_cfgs:
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, self.judge_function)
+                    overall_judged_answers += judged_answers
+                    overall_references += references
+                get_capability_results(overall_judged_answers,
+                                       overall_references, fout, fout_flag,
+                                       show_model_abbr)
+                fout_flag += 1
+            else:
+                print(subdir_path + ' is not exist! please check!')
+        with open(fout, 'r') as f:
+            csv_reader = csv.reader(f)
+            header = next(csv_reader)
+            table = [line for line in csv_reader]
+
+        new_header = [''] + [line[0] for line in table]
+        new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
+        new_table = [[h] + [line[i] for line in table]
+                     for i, h in enumerate(header[1:], start=1)]
+        t = tabulate(new_table, headers=new_header)
+        with open(fout, 'w') as f:
+            f.write(','.join(new_header) + '\n')
+            for line in new_table:
+                f.write(','.join(map(str, line)) + '\n')
+        print(t)
+        print(fout)
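The new single-judge path re-reads the capability CSV and transposes it so that models become columns and capabilities become rows. A tiny worked example of that transpose, with invented header names and scores:

    # header/table as they would come out of csv.reader on the capability file
    header = ['model', 'total', 'reasoning', 'writing']
    table = [['modelA', '7.1', '6.8', '7.5'],
             ['modelB', '8.0', '7.9', '8.2']]

    new_header = [''] + [line[0] for line in table]          # ['', 'modelA', 'modelB']
    new_table = [[h] + [line[i] for line in table]
                 for i, h in enumerate(header[1:], start=1)]
    # -> [['total', '7.1', '8.0'], ['reasoning', '6.8', '7.9'], ['writing', '7.5', '8.2']]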
opencompass/tasks/openicl_eval.py

@@ -3,6 +3,7 @@ import copy
 import fnmatch
 import math
 import os.path as osp
+import re
 import statistics
 import time
 from collections import Counter

@@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
     start = 0
     end = len(s)

-    if begin_str:
+    if begin_str and re.match(r'\s*', begin_str) is None:
         begin_idx = s.find(begin_str)
         if begin_idx != -1:
             start = begin_idx + len(begin_str)

-    if end_str:
+    if end_str and re.match(r'\s*', end_str) is None:
         # TODO: Support calling tokenizer for the accurate eos token
         # and avoid such hardcode
         end_idx = s.find(end_str, start)
opencompass/tasks/outer_eval/alpacaeval.py

 # flake8: noqa: E501
 import copy
 import json
+import os
 import os.path as osp

 import mmengine

@@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
         command = ''
         if api_key is not None:
             command += f'export OPENAI_API_KEY={api_key}; '
+        else:
+            api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
+            if api_key:
+                command += f'export OPENAI_API_KEY={api_key}; '
         command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
         return template.format(task_cmd=command)
opencompass/utils/run.py

@@ -5,6 +5,7 @@ import tabulate
 from mmengine.config import Config

 from opencompass.datasets.custom import make_custom_dataset_config
+from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

@@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
     if args.config:
         config = Config.fromfile(args.config, format_python_code=False)
         config = try_fill_in_custom_cfgs(config)
+        # set infer accelerator if needed
+        if args.accelerator in ['vllm', 'lmdeploy']:
+            config['models'] = change_accelerator(config['models'],
+                                                  args.accelerator)
         return config
     # parse dataset args
     if not args.datasets and not args.custom_dataset_path:

@@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
                      pad_token_id=args.pad_token_id,
                      run_cfg=dict(num_gpus=args.num_gpus))
         models.append(model)
+    # set infer accelerator if needed
+    if args.accelerator in ['vllm', 'lmdeploy']:
+        models = change_accelerator(models, args.accelerator)

     # parse summarizer args
     summarizer_arg = args.summarizer if args.summarizer is not None \
         else 'example'

@@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
                          format_python_code=False)


+def change_accelerator(models, accelerator):
+    models = models.copy()
+    model_accels = []
+    for model in models:
+        get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+        # change HuggingFace model to VLLM or TurboMindModel
+        if model['type'] is HuggingFaceCausalLM:
+            gen_args = dict()
+            if model.get('generation_kwargs') is not None:
+                generation_kwargs = model['generation_kwargs'].copy()
+                gen_args['temperature'] = 0.001 if generation_kwargs.get('temperature') is None else generation_kwargs['temperature']
+                gen_args['top_k'] = 1 if generation_kwargs.get('top_k') is None else generation_kwargs['top_k']
+                gen_args['top_p'] = 0.9 if generation_kwargs.get('top_p') is None else generation_kwargs['top_p']
+                gen_args['stop_token_ids'] = None if generation_kwargs.get('eos_token_id') is None else generation_kwargs['eos_token_id']
+                generation_kwargs['stop_token_ids'] = None if generation_kwargs.get('eos_token_id') is None else generation_kwargs['eos_token_id']
+                generation_kwargs.pop('eos_token_id')
+            else:
+                # if generation_kwargs is not provided, set default values
+                generation_kwargs = dict()
+                gen_args['temperature'] = 0.0
+                gen_args['top_k'] = 1
+                gen_args['top_p'] = 0.9
+                gen_args['stop_token_ids'] = None
+
+            if accelerator == 'lmdeploy':
+                get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=  # noqa E251
+                    f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
+                    abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+                    path=model['path'],
+                    engine_config=dict(session_len=model['max_seq_len'],
+                                       max_batch_size=model['batch_size'],
+                                       tp=model['run_cfg']['num_gpus']),
+                    gen_config=dict(top_k=gen_args['top_k'],
+                                    temperature=gen_args['temperature'],
+                                    top_p=gen_args['top_p'],
+                                    max_new_tokens=model['max_out_len'],
+                                    stop_words=gen_args['stop_token_ids']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    concurrency=model['batch_size'],
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+            elif accelerator == 'vllm':
+                get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=f'{VLLM.__module__}.{VLLM.__name__}',
+                    abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
+                    path=model['path'],
+                    model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    generation_kwargs=generation_kwargs,
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template', 'end_str']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+                generation_kwargs.update(dict(temperature=gen_args['temperature']))
+            else:
+                raise ValueError(f'Unsupported accelerator {accelerator}')
+        model_accels.append(model)
+    return model_accels
+
+
 def exec_mm_infer_runner(tasks, args, cfg):
     """execute multimodal infer runner according to args."""
     if args.slurm:
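To make the intent of `change_accelerator` concrete, here is a hypothetical input/output sketch. The model dict below is an invented HuggingFaceCausalLM-style config, the import location is assumed from the file the function is added to, and the expected output fields follow the function body shown above rather than any documented API:

    from opencompass.models import HuggingFaceCausalLM
    from opencompass.utils.run import change_accelerator   # assumed import location

    hf_model = dict(
        type=HuggingFaceCausalLM,
        abbr='internlm2-7b-hf',            # the 'hf' marker is rewritten to 'vllm'
        path='internlm/internlm2-7b',
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )

    vllm_models = change_accelerator([hf_model], 'vllm')
    # Based on the diff above, the converted entry should look roughly like:
    #   abbr='internlm2-7b-vllm', type='<VLLM module>.VLLM',
    #   model_kwargs=dict(tensor_parallel_size=1),
    #   generation_kwargs with temperature=0.0 (the default when none are given)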