Project: OpenDAS / opencompass
Commit 8c85edd1 (unverified), parent c1724013

[Sync] deprecate old mbpps (#1064)

Authored Apr 19, 2024 by Fengzhe Zhou; committed via GitHub on Apr 19, 2024.

Showing 15 changed files with 394 additions and 251 deletions (+394, -251).
Changed files:

    docs/zh_cn/advanced_guides/code_eval.md              +3    -3
    opencompass/cli/main.py                              +6    -0
    opencompass/datasets/apps.py                         +1    -3
    opencompass/datasets/mbpp.py                         +40   -36
    opencompass/datasets/taco.py                         +1    -3
    opencompass/models/openai_api.py                     +20   -10
    opencompass/models/qwen_api.py                       +1    -1
    opencompass/runners/dlc.py                           +10   -0
    opencompass/runners/slurm_sequential.py              +1    -0
    opencompass/summarizers/needlebench.py               +6    -3
    opencompass/summarizers/subjective/compass_arena.py  +151  -144
    opencompass/summarizers/subjective/mtbench.py        +51   -46
    opencompass/tasks/openicl_eval.py                    +3    -2
    opencompass/tasks/outer_eval/alpacaeval.py           +5    -0
    opencompass/utils/run.py                             +95   -0
docs/zh_cn/advanced_guides/code_eval.md

```diff
@@ -4,7 +4,7 @@
 ## pass@1
 
-If you only need to generate a single response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), and refer to the general [quick start tutorial](../get_started/quick_start.md).
+If you only need to generate a single response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), and refer to the general [quick start tutorial](../get_started/quick_start.md).
 
 For multilingual evaluation, refer to the [multilingual code evaluation tutorial](./code_eval_service.md).

@@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
 
 mbpp_datasets[0]['type'] = MBPPDataset_V2
 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator

@@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
 
 humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
 humaneval_datasets[0]['num_repeats'] = 10
```
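The `num_repeats = 10` setting in the last hunk exists so that pass@k can be estimated from repeated samples per problem. As a minimal sketch of the standard unbiased pass@k estimator (Chen et al., 2021); this is an illustration, not OpenCompass's internal scoring code:

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: n samples drawn, c of them correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# 10 samples per problem, 3 correct: pass@10 is estimated 1.0, pass@1 is 0.3
print(pass_at_k(10, 3, 10), pass_at_k(10, 3, 1))
```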
opencompass/cli/main.py

```diff
@@ -56,6 +56,12 @@ def parse_args():
                         'to run',
                         action='store_true',
                         default=False)
+    parser.add_argument(
+        '--accelerator',
+        help='Infer accelerator, support vllm and lmdeploy now.',
+        choices=['vllm', 'lmdeploy', 'hg'],
+        default='hg',
+        type=str)
     parser.add_argument('-m',
                         '--mode',
                         help='Running mode. You can choose "infer" if you '
```
opencompass/datasets/apps.py

```diff
@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger
 from .base import BaseDataset
 
-logger = get_logger()
 TIMEOUT = 10

@@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
```
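For context, the guard edited here exists because `signal.SIGALRM` is POSIX-only. A minimal standalone sketch of the pattern (the handler body is illustrative, modeled on this file):

```python
import signal


def timeout_handler(signum, frame):
    raise TimeoutError('code execution timed out')


try:
    signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
    # signal.SIGALRM does not exist on Windows; fall back gracefully
    print('signal.SIGALRM is not available on this platform')
```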
opencompass/datasets/mbpp.py

```diff
@@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
         multiple responses in special cases.
         """
 
+        def processing_test(example):
+            example['test_case'] = example['test_list']
+            example['test_list'] = '\n'.join(example['test_list'])
+            example['test_list_2'] = example['test_list']
+            example['test_column'] = dict(test_list_2=example['test_list'],
+                                          task_id=example['task_id'])
+            return example
+
         dataset = []
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
-                dataset.extend(
-                    [json.loads(line.strip()) for _ in range(num_repeats)])
+                example = json.loads(line.strip())
+                example = processing_test(example)
+                dataset.extend([example for _ in range(num_repeats)])
         return Dataset.from_list(dataset)

@@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
                 predictions)):
             pred = self._process_answer(pred)
             programs = self._process_test(refer, pred)
-            future = executor.submit(execution, programs, i, 3)
+            future = executor.submit(execution, programs, i, 10)
             futures.append(future)
             details[str(i)] = {}
             details[str(i)]['origin'] = predictions[i]

@@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
         return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
 
     def _process_answer(self, text):
-        try:
-            # for chatGLM related text
-            eval_text = eval(text)
-        except Exception:
-            pass
-        else:
-            if isinstance(eval_text, str):
-                text = eval_text
-        # deal with code block
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```xxx
-                    text = text[max(text.find('\n') + 1, 0):]
-        text = text.strip()
-        match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
-        if match:
-            text = text[:match.start()]
-        match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
-        if match:
-            text = text[match.end():]
-        text = text.strip()
-        if text.startswith("'"):
-            text = text[1:]
-        if text.endswith("'"):
-            text = text[:-1]
-        text = text.replace('\\', '')
-        match = re.search(r'```python(.*)```', text, re.DOTALL)
-        if match:
-            text = match.group(1).strip().split('```')[0].strip()
+        patterns = [
+            r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
+            r"BEGIN\s*'(.*)'\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)'\s*DONE",
+            r"BEGIN\s*'(.*)'\s*DONE",
+            r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
+            r"BEGIN\s*'(.*)\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)\s*DONE",
+            r"BEGIN\s*'(.*)\s*DONE",
+            r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
+            r'BEGIN\s*(.*)\s*\[DONE\]',
+            r'\[BEGIN\]\s*(.*)\s*DONE',
+            r'BEGIN\s*(.*)\s*DONE',
+            r'```python\s*(.*)\s*```',
+            r'```\s*(.*)\s*```',
+            r'(.*)\s*```.*',
+            r"\[BEGIN\]\s*'(.*)",
+            r'\[BEGIN\](.*)',
+        ]
+        for p in patterns:
+            match = re.search(p, text, re.DOTALL)
+            if match:
+                text = match.group(1)
+                break
+        text = text.split('```')[0]
+        text = re.split(r"'?\s*\[?DONE\]?", text)[0]
+        text = text.replace('\\_', '_')
+        text = text.strip()
         return text
 
     def _process_test(self, test_case, pred):

@@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
             for pred in preds:
                 pred = self._process_answer(pred)
                 programs = self._process_test(test_case, pred)
-                future = executor.submit(execution, programs, task_id, 3)
+                future = executor.submit(execution, programs, task_id, 10)
                 futures.append(future)
 
         from tqdm import tqdm
```
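To see what the rewritten `_process_answer` does, here is a small illustration (not part of the commit) of the first pattern in the list stripping an MBPP-style `[BEGIN] ... [DONE]` wrapper from a model response:

```python
import re

sample = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"
match = re.search(r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]", sample, re.DOTALL)
if match:
    print(match.group(1))
# def add(a, b):
#     return a + b
```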
opencompass/datasets/taco.py

```diff
@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger
 from .base import BaseDataset
 
-logger = get_logger()
 TIMEOUT = 10

@@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
```
opencompass/models/openai_api.py

```diff
@@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
         self.top_logprobs = top_logprobs
 
         if isinstance(key, str):
-            self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
+            if key == 'ENV':
+                if 'OPENAI_API_KEY' not in os.environ:
+                    raise ValueError('OpenAI API key is not set.')
+                self.keys = os.getenv('OPENAI_API_KEY').split(',')
+            else:
+                self.keys = [key]
         else:
             self.keys = key

@@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
         self.url = openai_api_base
         self.path = path
 
     def generate(
             self,
             inputs: List[PromptType],
             max_out_len: int = 512,
             temperature: float = 0.7,
-    ) -> List[str]:
+            **kwargs) -> List[str]:
         """Generate results given a list of inputs.
 
         Args:

@@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
         }
         for _ in range(self.retry):
             self.wait()
-            raw_response = requests.post(self.url,
-                                         headers=self.headers,
-                                         data=json.dumps(data))
+            try:
+                raw_response = requests.post(self.url,
+                                             headers=self.headers,
+                                             data=json.dumps(data))
+            except requests.ConnectionError:
+                self.logger.error('Request error, got',
+                                  str(raw_response.content))
+                time.sleep(1)
+                continue
             try:
                 response = raw_response.json()
             except requests.JSONDecodeError:
```
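The first hunk means `key='ENV'` now accepts several comma-separated keys in `OPENAI_API_KEY` instead of a single one. A minimal sketch of the resulting behaviour (key values are made up):

```python
import os

os.environ['OPENAI_API_KEY'] = 'sk-first-key,sk-second-key'
keys = os.getenv('OPENAI_API_KEY').split(',')
print(keys)  # ['sk-first-key', 'sk-second-key'], usable for key rotation
```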
opencompass/models/qwen_api.py

```diff
@@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
                 time.sleep(1)
                 continue
             if response.status_code == 429:
-                print('Rate limited')
+                print(response)
                 time.sleep(2)
                 continue
             if response.status_code == 400:
```
opencompass/runners/dlc.py

```diff
@@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
         pod_create_time = None
         pri_time = None
         initial_time = datetime.datetime.now()
+
+        url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com'  # noqa: E501
+        logger = get_logger()
+        logger.debug('')
+        logger.debug('*' * 168)
+        logger.debug(
+            f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail'  # noqa: E501
+        )
+        logger.debug('*' * 168)
+
         while True:
             # 1. Avoid to request dlc too frequently.
             # 2. DLC job may not be ready immediately after creation.
```
opencompass/runners/slurm_sequential.py

```diff
@@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
             tmpl += f' --gres=gpu:{num_gpus}'
         for extra_cmd in self.extra_command:
             tmpl += f' {extra_cmd}'
+        tmpl += ' -x HOST-10-140-60-7'
         tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
         get_cmd = partial(task.get_command,
                           cfg_path=param_file,
```
opencompass/summarizers/needlebench.py

```diff
@@ -72,7 +72,7 @@ dataset_mapping_dict = {}
 needle_counts = ['2', '3', '4', '5']
 languages = ['en', 'zh']
-sizes = ['4k', '8k', '32k', '200k', '1000k']
+sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
 types = ['origin', 'parallel']
 for needle_count in needle_counts:

@@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
     numbers = [2, 3, 4, 5]
     languages = ['en', 'zh']
     size_exists = []
-    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']
+    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']
     for size in sizes_origin:
         if size in content:

@@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
              markersize=8,
              label='Average Depth Score')
+    for x_value, y_value in zip(x_data, y_data):
+        ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
+
     ax2.set_ylim(0, 100)
     ax2.set_yticklabels([])

@@ -353,7 +356,7 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
     new_save_path = os.path.join(directory_path, new_filename)
     plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
-    print(f'Saved : {new_save_path}')
+    print(f'Saved: {new_save_path}')
     plt.close()
```
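The three added lines annotate each plotted point with its value. A standalone matplotlib sketch of the same idiom, with made-up data:

```python
import matplotlib.pyplot as plt

x_data, y_data = [4, 8, 32], [72.5, 65.0, 58.3]
fig, ax2 = plt.subplots()
ax2.plot(x_data, y_data, marker='o', markersize=8,
         label='Average Depth Score')
# label each point with its value, placed just below the marker
for x_value, y_value in zip(x_data, y_data):
    ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
ax2.set_ylim(0, 100)
plt.savefig('needlebench_demo.png', bbox_inches='tight', pad_inches=0)
```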
opencompass/summarizers/subjective/compass_arena.py

```diff
-# flake8: noqa: E501
-import ast
-import csv
+# flake8: noqa
+# yapf: disable
 import os
 import os.path as osp
 import re

@@ -10,7 +9,7 @@ from itertools import product
 import mmengine
 from mmengine import ConfigDict
-from prettytable import from_csv
+from tabulate import tabulate
 
 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

@@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_compass_arena(s):
     if result := re.findall('(?:选择:|Choice: )([ABC])', s):
         return result[0]

@@ -68,17 +73,90 @@ class CompassArenaSummarizer:
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_models = self.cfg.get('judge_models', None)
         self.meta_judge_model = self.cfg.eval.partitioner.get(
             'meta_judge_model', None)
         self.judge_type = judge_type
         assert self.judge_type in ['general']
         self.judge_map = {
-            'general': post_process_compass_arena}
+            'general': post_process_compass_arena,
+        }
         self.judge_function = self.judge_map[self.judge_type]
         self.check_pos_bias = check_pos_bias
         self.summary_type = summary_type
 
+    def get_score(self, time_str):
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+
+        if self.meta_judge_model is not None:
+            self.judge_models.append(self.meta_judge_model)
+
+        scores = {}
+
+        for idx, judge_model_cfg in enumerate(self.judge_models):
+            judge_model = model_abbr_from_cfg(judge_model_cfg)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                for model_pair in unique_combinations:
+                    model1 = model_pair[0]['abbr']
+                    model2 = model_pair[1]['abbr']
+                    if idx == len(self.judge_models):
+                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+                    else:
+                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if not os.path.isdir(subdir_path):
+                        print(subdir_path + ' is not exist! please check!')
+                        continue
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, self.judge_function)
+                    if self.check_pos_bias:
+                        bias_num = check_position_bias(judged_answers,
+                                                       references)
+                    else:
+                        bias_num = 0
+                    win_model1 = defaultdict(float)
+                    win_model2 = defaultdict(float)
+                    categories = defaultdict(float)
+                    model1 = references[0]['answer1']
+                    model2 = references[0]['answer2']
+                    for prediction, reference in zip(judged_answers,
+                                                     references):
+                        categories[dataset_abbr] += 1
+                        categories[reference['capability']] += 1
+                        if prediction == 'A':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 1, 0
+                            else:
+                                score_1, score_2 = 0, 1
+                        elif prediction == 'B':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 0, 1
+                            else:
+                                score_1, score_2 = 1, 0
+                        elif prediction == 'C':
+                            if self.summary_type == 'half_add':
+                                score_1, score_2 = 0.5, 0.5
+                            else:
+                                score_1, score_2 = 0, 0
+                        win_model1[reference['capability']] += score_1
+                        win_model1[dataset_abbr] += score_1
+                        win_model2[reference['capability']] += score_2
+                        win_model2[dataset_abbr] += score_2
+                    for capability in categories:
+                        win_model1[capability] = win_model1[
+                            capability] / categories[capability] * 100
+                        win_model1[capability] = round(
+                            win_model1[capability], 2)
+                        win_model2[capability] = win_model2[
+                            capability] / categories[capability] * 100
+                        win_model2[capability] = round(
+                            win_model2[capability], 2)
+
+                    win_model1['position_bias'] = bias_num
+                    win_model2['position_bias'] = bias_num
+
+                    if judge_model not in scores:
+                        scores[judge_model] = {}
+                    if dataset_abbr not in scores[judge_model]:
+                        scores[judge_model][dataset_abbr] = {}
+                    scores[judge_model][dataset_abbr][model2] = win_model2
+
+        return scores
+
     def summarize(
             self,
             time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),

@@ -91,143 +169,72 @@ class CompassArenaSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
-        dataset_cfgs = self.cfg['datasets']
+        scores = self.get_score(time_str)
+        # scores['win_' + model1] = win_model1
         output_dir, results_folder = get_outdir(self.cfg, time_str)
-        model_combinations = list(
-            product(self.base_models, self.compare_models))
-        unique_combinations = remove_duplicate_pairs(
-            [combo for combo in model_combinations if combo[0] != combo[1]])
-        fout_list = []
-        pre_len = len(self.judge_models)
-        if self.meta_judge_model is not None:
-            self.judge_models.append(self.meta_judge_model)
-            meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
-        else:
-            meta_judge_model_abbr = None
         for idx, judge_model in enumerate(self.judge_models):
-            judge_model = model_abbr_from_cfg(judge_model)
-            for dataset in dataset_cfgs:
+            judge_abbr = model_abbr_from_cfg(judge_model)
+            for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
-                if idx == pre_len:
-                    fout = osp.join(
-                        output_dir, 'summarized-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                else:
-                    fout = osp.join(
-                        output_dir, 'judged-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                fout_list.append(fout)
-                for model_pair in unique_combinations:
-                    model1, model2, = model_pair[0]['abbr'], model_pair[1][
-                        'abbr'],
-                    if idx == pre_len:
-                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
-                    else:
-                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
-                    subdir_path = os.path.join(results_folder, subdir)
-                    if os.path.isdir(subdir_path):
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset,
-                            subdir_path,
-                            self.judge_function,
-                        )
-                        if self.check_pos_bias:
-                            bias_num = check_position_bias(
-                                judged_answers, references)
-                        else:
-                            bias_num = 0
-                        win_model1, win_model2, categories = defaultdict(
-                            float), defaultdict(float), defaultdict(float)
-                        model1, model2 = references[0]['answer1'], references[
-                            0]['answer2']
-                        for prediction, reference in zip(
-                                judged_answers, references):
-                            if self.summary_type == 'single':
-                                if prediction == 'A':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                            elif self.summary_type == 'half_add':
-                                categories['total'] += 1
-                                categories[reference['capability']] += 1
-                                if prediction == 'A':
-                                    if reference['answer1'] == model1:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    if reference['answer1'] == model1:
-                                        win_model2[reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                elif prediction == 'C':
-                                    win_model1[reference['capability']] += 0.5
-                                    win_model1['total'] += 0.5
-                                    win_model2[reference['capability']] += 0.5
-                                    win_model2['total'] += 0.5
-                        for capability in categories:
-                            if capability not in win_model1:
-                                win_model1[capability] = 0.0
-                            else:
-                                win_model1[capability] = round(
-                                    (win_model1[capability] /
-                                     categories[capability]) * 100, 2)
-                            if capability not in win_model2:
-                                win_model2[capability] = 0.0
-                            else:
-                                win_model2[capability] = round(
-                                    (win_model2[capability] /
-                                     categories[capability]) * 100, 2)
-                        win_model1['position_bias'] = bias_num
-                        win_model2['position_bias'] = bias_num
-                        scores = {
-                            'win_' + model1: win_model1,
-                            'win_' + model2: win_model2
-                        }
-                        rows = list(scores.keys())
-                        columns = list(scores[rows[0]].keys())
-                        columns.insert(0, columns.pop(columns.index('total')))
-                        columns.insert(
-                            1, columns.pop(columns.index('position_bias')))
-                        with open(fout, 'a+', newline='') as csvfile:
-                            writer = csv.writer(csvfile)
-                            writer.writerow([model1 + '_vs_' + model2] +
-                                            columns)
-                            for row in rows:
-                                writer.writerow(
-                                    [row] +
-                                    [scores[row][column] for column in columns])
-                    else:
-                        print(subdir_path + ' is not exist! please check!')
-        for fout in fout_list:
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(fout)
-            print(x)
+                summarizer_model_abbrs = [
+                    model_abbr_from_cfg_used_in_summarizer(i)
+                    for i in self.compare_models
+                ]
+                one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+                row_headers = [
+                    i for i in one_column.keys()
+                    if i not in [dataset_abbr, 'position_bias']
+                ]
+                row_headers = [dataset_abbr, 'position_bias'] + row_headers
+                headers = [''] + summarizer_model_abbrs
+                table = []
+                for row_header in row_headers:
+                    row = [row_header]
+                    for model_cfg in self.compare_models:
+                        model_abbr = model_abbr_from_cfg(model_cfg)
+                        s = scores[judge_abbr][dataset_abbr][model_abbr].get(
+                            row_header, '')
+                        if isinstance(s, float):
+                            s = f'{s:.2f}'
+                        if isinstance(s, int):
+                            s = str(s)
+                        row.append(s)
+                    table.append(row)
+                txt = tabulate(table, headers=headers)
+                print(txt)
+                if idx == len(self.judge_models):
+                    output_filename = osp.join(
+                        output_dir, 'summarized-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+                else:
+                    output_filename = osp.join(
+                        output_dir, 'judged-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+                with open(output_filename, 'w') as f:
+                    f.write(','.join(headers) + '\n')
+                    for line in table:
+                        f.write(','.join(line) + '\n')
+                print(output_filename)
+
+            table = []
+            summarizer_model_abbrs = [
+                model_abbr_from_cfg_used_in_summarizer(i)
+                for i in self.compare_models
+            ]
+            headers = [''] + summarizer_model_abbrs
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                row = [dataset_abbr]
+                for model_cfg in self.compare_models:
+                    model_abbr = model_abbr_from_cfg(model_cfg)
+                    s = scores[judge_abbr][dataset_abbr][model_abbr].get(
+                        dataset_abbr, '')
+                    if isinstance(s, float):
+                        s = f'{s:.2f}'
+                    if isinstance(s, int):
+                        s = str(s)
+                    row.append(s)
+                table.append(row)
+            txt = tabulate(table, headers=headers)
+            print(txt)
+            if idx == len(self.judge_models):
+                output_filename = osp.join(
+                    output_dir,
+                    'summarized-by--' + judge_abbr + '-overall-report.csv')
+            else:
+                output_filename = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_abbr + '-overall-report.csv')
+            with open(output_filename, 'w') as f:
+                f.write(','.join(headers) + '\n')
+                for line in table:
+                    f.write(','.join(line) + '\n')
+            print(output_filename)
```
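Reading the new code, `get_score` returns a nested mapping keyed by judge model, dataset, and compare model. Roughly the following shape, with every name and number invented purely for illustration:

```python
scores = {
    'judge-a': {                      # judge model abbr
        'compassarena-language': {    # dataset abbr
            'model-x': {              # compare model abbr
                'compassarena-language': 55.17,  # overall win rate (%)
                'position_bias': 0,
                'creation': 60.0,     # per-capability win rate (%)
            },
        },
    },
}
```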
opencompass/summarizers/subjective/mtbench.py

```diff
-# flake8: noqa: E501
+# flake8: noqa
+# yapf: disable
 import csv
 import os
 import os.path as osp

@@ -8,11 +9,7 @@ from datetime import datetime
 import numpy as np
 from mmengine import ConfigDict
-
-try:
-    from prettytable import from_csv
-except ImportError:
-    from_csv = None
+from tabulate import tabulate
 
 from opencompass.utils import model_abbr_from_cfg

@@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_mtbench_pair(judgement: str):
     """Input a string like below:

@@ -52,7 +55,7 @@ def get_capability_results(
         references,
         fout,
         fout_flag,
-        model,
+        model_abbr,
 ):
     capability_ratings = defaultdict(int)
     capability_counts = defaultdict(int)

@@ -70,12 +73,12 @@ def get_capability_results(
         capability_avg_ratings[capability] = s
 
     columns = list(capability_avg_ratings.keys())
     columns.insert(0, columns.pop(columns.index('total')))
     with open(fout, 'a+', newline='') as csvfile:
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['model'] + columns)
-        writer.writerow([model] +
+        writer.writerow([model_abbr] +
                         [capability_avg_ratings[column] for column in columns])
 
 
 class MTBenchSummarizer(CompassArenaSummarizer):

@@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         self.cfg = config
         if self.judge_type == 'single':
             self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
-            self.eval_model_abbrs = [
-                model_abbr_from_cfg(model)
-                for model in self.eval_model_cfgs
-            ]
         elif self.judge_type == 'pair':
             self.base_models = self.cfg['eval']['partitioner']['base_models']
             self.compare_models = self.cfg['eval']['partitioner'][
                 'compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_map = {
             'single': post_process_mtbench_single,

@@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         }
         self.judge_function = self.judge_map[self.judge_type]
 
-
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.

@@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         Returns:
             pd.DataFrame: The summary results.
         """
-        if self.judge_type == 'single':
-            dataset_cfgs = self.cfg['datasets']
-            output_dir, results_folder = get_outdir(self.cfg, time_str)
-            fout_flag = 0
-            for eval_model_abbr in self.eval_model_abbrs:
-                subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
-                subdir_path = os.path.join(results_folder, subdir)
-                if os.path.isdir(subdir_path):
-                    model, judge_model = eval_model_abbr, self.judge_abbr
-                    fout = osp.join(
-                        output_dir,
-                        'judged-by--' + judge_model + '-capability.csv')
-                    overall_judged_answers, overall_references = [], []
-                    for dataset in dataset_cfgs:
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset, subdir_path, self.judge_function)
-                        overall_judged_answers += judged_answers
-                        overall_references += references
-                    get_capability_results(overall_judged_answers,
-                                           overall_references, fout, fout_flag,
-                                           model)
-                    fout_flag += 1
-                else:
-                    print(subdir_path + ' is not exist! please check!')
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(x)
-            print(fout)
-        elif self.judge_type == 'pair':
-            super().summarize()
+        if self.judge_type == 'pair':
+            return super().summarize()
+
+        # self.judge_type == 'single'
+        dataset_cfgs = self.cfg['datasets']
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        fout_flag = 0
+        for eval_model_cfg in self.eval_model_cfgs:
+            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
+            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(
+                eval_model_cfg)
+            subdir_path = os.path.join(
+                results_folder,
+                eval_model_abbr + '_judged-by--' + self.judge_abbr)
+            if os.path.isdir(subdir_path):
+                fout = osp.join(
+                    output_dir,
+                    'judged-by--' + self.judge_abbr + '-capability.csv')
+                overall_judged_answers, overall_references = [], []
+                for dataset in dataset_cfgs:
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, self.judge_function)
+                    overall_judged_answers += judged_answers
+                    overall_references += references
+                get_capability_results(overall_judged_answers,
+                                       overall_references, fout, fout_flag,
+                                       show_model_abbr)
+                fout_flag += 1
+            else:
+                print(subdir_path + ' is not exist! please check!')
+        with open(fout, 'r') as f:
+            csv_reader = csv.reader(f)
+            header = next(csv_reader)
+            table = [line for line in csv_reader]
+
+        new_header = [''] + [line[0] for line in table]
+        new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
+        new_table = [[h] + [line[i] for line in table]
+                     for i, h in enumerate(header[1:], start=1)]
+        t = tabulate(new_table, headers=new_header)
+        with open(fout, 'w') as f:
+            f.write(','.join(new_header) + '\n')
+            for line in new_table:
+                f.write(','.join(map(str, line)) + '\n')
+        print(t)
+        print(fout)
```
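The rewritten tail of `summarize` transposes the capability CSV so that models become columns before tabulating. The core list-comprehension trick, extracted into a standalone example with made-up data:

```python
header = ['model', 'total', 'reasoning']
table = [['model-a', '7.1', '6.8'],
         ['model-b', '8.0', '7.9']]

new_header = [''] + [line[0] for line in table]
new_table = [[h] + [line[i] for line in table]
             for i, h in enumerate(header[1:], start=1)]

print(new_header)  # ['', 'model-a', 'model-b']
print(new_table)   # [['total', '7.1', '8.0'], ['reasoning', '6.8', '7.9']]
```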
opencompass/tasks/openicl_eval.py

```diff
@@ -3,6 +3,7 @@ import copy
 import fnmatch
 import math
 import os.path as osp
+import re
 import statistics
 import time
 from collections import Counter

@@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
     start = 0
     end = len(s)
 
-    if begin_str:
+    if begin_str and re.match(r'\s*', begin_str) is None:
         begin_idx = s.find(begin_str)
         if begin_idx != -1:
             start = begin_idx + len(begin_str)
 
-    if end_str:
+    if end_str and re.match(r'\s*', end_str) is None:
         # TODO: Support calling tokenizer for the accurate eos token
         # and avoid such hardcode
         end_idx = s.find(end_str, start)
```
opencompass/tasks/outer_eval/alpacaeval.py

```diff
 # flake8: noqa: E501
 import copy
 import json
+import os
 import os.path as osp
 
 import mmengine

@@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
             command = ''
             if api_key is not None:
                 command += f'export OPENAI_API_KEY={api_key}; '
+            else:
+                api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
+                if api_key:
+                    command += f'export OPENAI_API_KEY={api_key}; '
             command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
             return template.format(task_cmd=command)
```
opencompass/utils/run.py

```diff
@@ -5,6 +5,7 @@ import tabulate
 from mmengine.config import Config
 
 from opencompass.datasets.custom import make_custom_dataset_config
+from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

@@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
     if args.config:
         config = Config.fromfile(args.config, format_python_code=False)
         config = try_fill_in_custom_cfgs(config)
+        # set infer accelerator if needed
+        if args.accelerator in ['vllm', 'lmdeploy']:
+            config['models'] = change_accelerator(config['models'],
+                                                  args.accelerator)
         return config
     # parse dataset args
     if not args.datasets and not args.custom_dataset_path:

@@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
                      pad_token_id=args.pad_token_id,
                      run_cfg=dict(num_gpus=args.num_gpus))
         models.append(model)
+    # set infer accelerator if needed
+    if args.accelerator in ['vllm', 'lmdeploy']:
+        models = change_accelerator(models, args.accelerator)
 
     # parse summarizer args
     summarizer_arg = args.summarizer if args.summarizer is not None \
         else 'example'

@@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
                          format_python_code=False)
 
 
+def change_accelerator(models, accelerator):
+    models = models.copy()
+    model_accels = []
+    for model in models:
+        get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+        # change HuggingFace model to VLLM or TurboMindModel
+        if model['type'] is HuggingFaceCausalLM:
+            gen_args = dict()
+            if model.get('generation_kwargs') is not None:
+                generation_kwargs = model['generation_kwargs'].copy()
+                gen_args['temperature'] = 0.001 if generation_kwargs.get(
+                    'temperature') is None else generation_kwargs['temperature']
+                gen_args['top_k'] = 1 if generation_kwargs.get(
+                    'top_k') is None else generation_kwargs['top_k']
+                gen_args['top_p'] = 0.9 if generation_kwargs.get(
+                    'top_p') is None else generation_kwargs['top_p']
+                gen_args['stop_token_ids'] = None if generation_kwargs.get(
+                    'eos_token_id') is None else generation_kwargs['eos_token_id']
+                generation_kwargs['stop_token_ids'] = None if generation_kwargs.get(
+                    'eos_token_id') is None else generation_kwargs['eos_token_id']
+                generation_kwargs.pop('eos_token_id')
+            else:
+                # if generation_kwargs is not provided, set default values
+                generation_kwargs = dict()
+                gen_args['temperature'] = 0.0
+                gen_args['top_k'] = 1
+                gen_args['top_p'] = 0.9
+                gen_args['stop_token_ids'] = None
+
+            if accelerator == 'lmdeploy':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=  # noqa E251
+                    f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
+                    abbr=model['abbr'].replace('hf', 'lmdeploy')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+                    path=model['path'],
+                    engine_config=dict(session_len=model['max_seq_len'],
+                                       max_batch_size=model['batch_size'],
+                                       tp=model['run_cfg']['num_gpus']),
+                    gen_config=dict(top_k=gen_args['top_k'],
+                                    temperature=gen_args['temperature'],
+                                    top_p=gen_args['top_p'],
+                                    max_new_tokens=model['max_out_len'],
+                                    stop_words=gen_args['stop_token_ids']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    concurrency=model['batch_size'],
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+            elif accelerator == 'vllm':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=f'{VLLM.__module__}.{VLLM.__name__}',
+                    abbr=model['abbr'].replace('hf', 'vllm')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
+                    path=model['path'],
+                    model_kwargs=dict(
+                        tensor_parallel_size=model['run_cfg']['num_gpus']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    generation_kwargs=generation_kwargs,
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template', 'end_str']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+                generation_kwargs.update(
+                    dict(temperature=gen_args['temperature']))
+            else:
+                raise ValueError(f'Unsupported accelerator {accelerator}')
+        model_accels.append(model)
+    return model_accels
+
+
 def exec_mm_infer_runner(tasks, args, cfg):
     """execute multimodal infer runner according to args."""
     if args.slurm:
```
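As a rough before/after sketch of what the new `change_accelerator` produces for a vLLM target (field values invented; in a real config `type` is the model class itself, shown here as a string for brevity):

```python
hf_model = dict(
    type='HuggingFaceCausalLM',
    abbr='internlm2-7b-hf',
    path='internlm/internlm2-7b',
    max_seq_len=2048, max_out_len=100, batch_size=8,
    run_cfg=dict(num_gpus=1),
)

# change_accelerator([hf_model], 'vllm') would yield roughly:
vllm_model = dict(
    type='opencompass.models.VLLM',
    abbr='internlm2-7b-vllm',  # '-hf' suffix swapped for '-vllm'
    path='internlm/internlm2-7b',
    model_kwargs=dict(tensor_parallel_size=1),
    max_seq_len=2048, max_out_len=100, batch_size=8,
    generation_kwargs=dict(temperature=0.0),
    run_cfg=dict(num_gpus=1),
)
```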