OpenDAS / opencompass · Commit d34ba111
Unverified commit d34ba111, authored Feb 05, 2024 by Fengzhe Zhou, committed by GitHub on Feb 05, 2024

[Sync] Merge branch 'dev' into zfz/update-keyset-demo (#876)
parent 32b5948f

Changes: 97 in total; showing 17 changed files with 320 additions and 26 deletions (+320, -26).
opencompass/datasets/teval/utils/__init__.py          +0    -0
opencompass/datasets/teval/utils/convert_results.py   +35   -0
opencompass/datasets/teval/utils/format_load.py       +44   -0
opencompass/datasets/teval/utils/meta_template.py     +11   -0
opencompass/datasets/teval/utils/template.py          +76   -0
opencompass/datasets/triviaqa.py                      +4    -3
opencompass/models/baichuan_api.py                    +1    -1
opencompass/models/huggingface.py                     +0    -2
opencompass/models/openai_api.py                      +3    -0
opencompass/models/sensetime_api.py                   +1    -0
opencompass/models/zhipuai_v2_api.py                  +2    -0
opencompass/partitioners/num_worker.py                +22   -7
opencompass/summarizers/default.py                    +1    -1
opencompass/summarizers/subjective/alignmentbench.py  +104  -1
opencompass/tasks/openicl_eval.py                     +3    -1
opencompass/utils/text_postprocessors.py              +1    -1
tools/prompt_viewer.py                                +12   -9
opencompass/datasets/teval/utils/__init__.py  (new file, mode 100644, empty)
opencompass/datasets/teval/utils/convert_results.py  (new file, mode 100644)

import mmengine
import os
import argparse
import numpy as np
# np.set_printoptions(precision=1)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path', type=str)
    args = parser.parse_args()
    return args


def convert_results(result_path):
    result = mmengine.load(result_path)
    instruct_list = [(result['instruct_json']['json_format_metric'] +
                      result['instruct_json']['json_args_em_metric']) / 2,
                     (result['instruct_json']['string_format_metric'] +
                      result['instruct_json']['string_args_em_metric']) / 2]
    plan_list = [result['plan_str']['f1_score'],
                 result['plan_json']['f1_score']]
    reason_list = [result['reason_str']['thought'],
                   result['rru_json']['thought']]
    retrieve_list = [result['retrieve_str']['name'],
                     result['rru_json']['name']]
    understand_list = [result['understand_str']['args'],
                       result['rru_json']['args']]
    review_list = [result['review_str']['review_quality'],
                   result['review_str']['review_quality']]

    final_score = [np.mean(instruct_list), np.mean(plan_list),
                   np.mean(reason_list), np.mean(retrieve_list),
                   np.mean(understand_list), np.mean(review_list)]
    overall = np.mean(final_score)
    final_score.insert(0, overall)
    name_list = ['Overall', 'Instruct', 'Plan', 'Reason', 'Retrieve',
                 'Understand', 'Review']
    print("Cut Paste Results: ", np.array(final_score) * 100)
    for i in range(len(name_list)):
        print("%s: %.1f" % (name_list[i], final_score[i] * 100), end='\t')


if __name__ == '__main__':
    args = parse_args()
    convert_results(args.result_path)
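A minimal usage sketch (the file name is hypothetical; the JSON must contain the metric keys read above, such as 'instruct_json', 'plan_str', and 'rru_json'):

# Hypothetical example: call convert_results() directly on a result file.
from opencompass.datasets.teval.utils.convert_results import convert_results

convert_results('my_teval_results.json')  # prints the per-ability summary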
opencompass/datasets/teval/utils/format_load.py  (new file, mode 100644)

import ast
import json


def format_load(raw_data: str, start_character: str = '', end_character: str = ''):
    """Format the raw data into the format that can be evaluated.

    Args:
        raw_data (str): The raw data.
        start_character (str, optional): The start character. Defaults to '';
            if using it, the string will be sliced from the first
            start_character.
        end_character (str, optional): The end character. Defaults to '';
            if using it, the string will be sliced to the last end_character.

    Returns:
        str: The formatted data.
    """
    if type(raw_data) != str:
        # the data has been evaluated
        return raw_data
    if "```json" in raw_data:
        raw_data = raw_data[raw_data.find("```json") + len("```json"):]
        raw_data = raw_data.strip("`")
    if start_character != '':
        raw_data = raw_data[raw_data.find(start_character):]
    if end_character != '':
        raw_data = raw_data[:raw_data.rfind(end_character) + len(end_character)]
    successful_parse = False
    try:
        data = ast.literal_eval(raw_data)
        successful_parse = True
    except Exception as e:
        pass
    try:
        if not successful_parse:
            data = json.loads(raw_data)
            successful_parse = True
    except Exception as e:
        pass
    try:
        if not successful_parse:
            data = json.loads(raw_data.replace("\'", "\""))
            successful_parse = True
    except Exception as e:
        pass
    if not successful_parse:
        raise Exception("Cannot parse raw data")
    return data
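A small usage sketch (the model-output strings are invented for illustration):

# Parse a model response that wraps JSON in a code fence.
raw = "Here is the plan:\n```json\n{\"steps\": [1, 2, 3]}\n```"
data = format_load(raw)  # -> {'steps': [1, 2, 3]}

# Slicing with start/end characters isolates the dict in noisy output.
noisy = "score below {'quality': 8} thanks"
data = format_load(noisy, start_character='{', end_character='}')
# -> {'quality': 8}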
opencompass/datasets/teval/utils/meta_template.py  (new file, mode 100644)

meta_template_dict = dict(
    internlm=[
        dict(role='system', begin='<|System|>:', end='\n'),
        dict(role='user', begin='<|User|>:', end='\n'),
        dict(role='assistant',
             begin='<|Bot|>:',
             end='<eoa>\n',
             generate=True),
    ], )
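For illustration, each role entry frames one conversation turn with its begin/end markers; a hand-rolled rendering sketch (render_turn is not part of the file above, just a demonstration of the fields):

def render_turn(role_cfg, content):
    # Wrap a message in the role's begin/end markers.
    return role_cfg['begin'] + content + role_cfg['end']

system, user, bot = meta_template_dict['internlm']
print(render_turn(user, 'Hi'))    # "<|User|>:Hi\n"
print(render_turn(bot, 'Hello'))  # "<|Bot|>:Hello<eoa>\n"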
opencompass/datasets/teval/utils/template.py  (new file, mode 100644)

import re
from string import Formatter


def format_string(template: str, input_data: dict) -> str:
    """Return string with input content according input format template.

    Args:
        template (str): Format string with keyword-only argument. For
            example '{who} like {what}'
        input_data (dict): Input data to fill in the input template.

    Returns:
        str: Return string.
    """
    return template.format(**input_data)


def parse_string(template: str, input_string: str, allow_newline: bool = False) -> dict:
    """Return a dictionary whose keys are from input template and value is
    responding content from input_string.

    Args:
        template (str): Format template with keyword-only argument. For
            example '{who} like {what}'
        input_string (str): Input string will be parsed.
        allow_newline (bool): Whether to allow '\n' in {} during RE match,
            defaults to False.

    Returns:
        dict: Parsed data from input string according to format string. If
            input string doesn't match template, it will return None.

    Examples:
        >>> template = '{who} like {what}'
        >>> input_string = 'monkey like banana'
        >>> data = parse_string(template, input_string)
        >>> data
        >>> {'who': 'monkey', 'what': 'banana'}
        >>> input_string = 'monkey likes banana'
        >>> data = parse_string(template, input_string)
        >>> data
        >>> None
        >>> template = '{what} like {what}'
        >>> input_string = 'monkey like banana'
        >>> data = parse_string(template, input_string)
        >>> data
        >>> {'what': ['monkey', 'banana']}
    """
    formatter = Formatter()
    context = []
    keys = []
    for v in formatter.parse(template):
        # v is a (literal_text, field_name, format_spec, conversion) tuple
        if v[1] is not None:
            keys.append(v[1])
        context.append(v[0])
    pattern = template
    for k in keys:
        pattern = pattern.replace('{' + f'{k}' + '}', '(.*)')
    # pattern = re.compile(rf'{pattern}')
    values = re.findall(pattern, input_string, re.S if allow_newline else 0)
    if len(values) < 1:
        return None
    data = dict()
    for k, v in zip(keys, values[0]):
        if k in data:
            tmp = data[k]
            if isinstance(tmp, list):
                data[k].append(v)
            else:
                data[k] = [tmp, v]
        else:
            data[k] = v
    return data
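The docstring examples run as written; a quick round trip through both helpers:

template = '{who} like {what}'
filled = format_string(template, {'who': 'monkey', 'what': 'banana'})
# 'monkey like banana'
parsed = parse_string(template, filled)
# {'who': 'monkey', 'what': 'banana'}
assert parsed == {'who': 'monkey', 'what': 'banana'}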
opencompass/datasets/triviaqa.py

...
@@ -85,9 +85,10 @@ class TriviaQAEvaluator(BaseEvaluator):
         cnt = 0
         for pred, cand_ans in zip(processed_predictions, processed_answers):
             detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
-            cnt += int(any([cand == pred for cand in cand_ans]))
-            if int(any([cand == pred for cand in cand_ans])):
-                detail['correct'] = True
+            # is_correct = any([cand == pred for cand in cand_ans])
+            is_correct = any([cand in pred for cand in cand_ans])
+            cnt += int(is_correct)
+            detail['correct'] = is_correct
             details.append(detail)
         score = cnt / len(predictions) * 100
...
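The rewritten check relaxes exact equality to substring containment, so a prediction that merely contains a candidate answer now counts as correct; a toy comparison (values invented):

pred = 'the answer is paris'
cand_ans = ['paris', 'paris, france']
print(any([cand == pred for cand in cand_ans]))  # False: no exact match
print(any([cand in pred for cand in cand_ans]))  # True: 'paris' is a substring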
opencompass/models/baichuan_api.py

...
@@ -150,7 +150,7 @@ class BaiChuan(BaseAPIModel):
                 return msg
             if raw_response.status_code != 200:
-                print(raw_response)
+                print(raw_response.json())
                 time.sleep(1)
                 continue
             print(response)
...
opencompass/models/huggingface.py

...
@@ -109,10 +109,8 @@ class HuggingFace(BaseModel):
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
                          meta_template=meta_template)
-        from opencompass.utils.fileio import patch_hf_auto_model
         if hf_cache_dir is None:
             hf_cache_dir = os.getenv('HF_MODEL_HUB', None)
-        patch_hf_auto_model(hf_cache_dir)
         self.logger = get_logger()
         self.pad_token_id = pad_token_id
         assert mode in ['none', 'mid']
...
opencompass/models/openai_api.py

...
@@ -405,6 +405,7 @@ class OpenAIAllesAPIN(OpenAI):
             except requests.JSONDecodeError:
                 self.logger.error('JsonDecode error, got',
                                   str(raw_response.content))
+                time.sleep(1)
                 continue
             if raw_response.status_code == 200 and response[
                     'msgCode'] == '10000':
...
@@ -415,6 +416,8 @@ class OpenAIAllesAPIN(OpenAI):
             else:
                 return choices[0]['message']['content'].strip()
             self.logger.error(response['msg'])
+            self.logger.error(response)
+            time.sleep(1)
         raise RuntimeError('API call failed.')
...
opencompass/models/sensetime_api.py

...
@@ -193,6 +193,7 @@ class SenseTime(BaseAPIModel):
                     time.sleep(1)
                     continue
+                return ''
         raise RuntimeError(
             f'request id: '
             f'{raw_response.headers.get("X-Request-Id")}, {raw_response.text}')
opencompass/models/zhipuai_v2_api.py

...
@@ -119,6 +119,8 @@ class ZhiPuV2AI(BaseAPIModel):
         while max_num_retries < self.retry:
             self.acquire()
+            response = None
             try:
                 response = self.client.chat.completions.create(**data)
             except APIStatusError as err:
...
opencompass/partitioners/num_worker.py

...
@@ -30,12 +30,17 @@ class NumWorkerPartitioner(BasePartitioner):
                  out_dir: str,
                  num_worker: int = 8,
                  min_task_size: int = 16,
+                 strategy: str = 'heuristic',
                  dataset_size_path: str = '.cache/dataset_size.json',
                  keep_keys: Optional[List[str]] = None):
         super().__init__(out_dir=out_dir, keep_keys=keep_keys)
         self.num_worker = num_worker
         self.min_task_size = min_task_size
         self.dataset_size_path = dataset_size_path
+        assert strategy in ('heuristic', 'split'), \
+            f'Unsupported partition strategy: {strategy}. ' \
+            'Supported strategies are: `heuristic`, `split` .'
+        self.strategy = strategy

     def partition(self,
                   model_dataset_combinations: List[Dict[str, List]],
...
@@ -64,6 +69,7 @@ class NumWorkerPartitioner(BasePartitioner):
             else:
                 chunks.append(dataset)

+        if self.strategy == 'heuristic':
             buckets = [[] for _ in range(self.num_worker)]
             for i, chunk in enumerate(chunks):
                 buckets[i % self.num_worker].append(chunk)
...
@@ -77,6 +83,15 @@ class NumWorkerPartitioner(BasePartitioner):
                         'work_dir': work_dir,
                         **add_cfg
                     }))
+        elif self.strategy == 'split':
+            for dataset in chunks:
+                tasks.append(
+                    Config({
+                        'models': [model],
+                        'datasets': [[dataset]],
+                        'work_dir': work_dir,
+                        **add_cfg
+                    }))

         return tasks

     @property
...
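With the new strategy option, 'heuristic' keeps the round-robin bucketing over num_worker buckets while 'split' emits one task per chunk; a toy sketch of the bucketing (chunk names invented):

num_worker = 3
chunks = ['d0', 'd1', 'd2', 'd3', 'd4']
buckets = [[] for _ in range(num_worker)]
for i, chunk in enumerate(chunks):
    buckets[i % num_worker].append(chunk)  # round-robin assignment
# buckets == [['d0', 'd3'], ['d1', 'd4'], ['d2']]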
opencompass/summarizers/default.py

...
@@ -16,7 +16,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                                model_abbr_from_cfg)
 from opencompass.utils.prompt import get_prompt_hash

-METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
+METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match']
 METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']

 def model_abbr_from_cfg_used_in_summarizer(model):
...
opencompass/summarizers/subjective/alignmentbench.py

...
@@ -29,6 +29,62 @@ All_Dimensions = [
     '公平与可负责程度', '丰富度', '综合得分'
 ]

+MAPPING = {
+    '事实与解释型回答': ['事实正确性', '满足用户需求', '清晰度', '完备性'],
+    '逻辑推理型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '完备性'],
+    '生成型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '创造性', '丰富度'],
+    '建议型回答': ['事实正确性', '满足用户需求', '公平与可负责程度', '创造性']
+}
+
+
+def detect_mapping(text):
+    if '清晰度' in text and '完备性' in text:
+        return '事实与解释型回答'
+    elif '完备性' in text and '逻辑连贯性' in text:
+        return '逻辑推理型回答'
+    elif '创造性' in text and '丰富度' in text:
+        return '生成型回答'
+    elif '创造性' in text and '公平与可负责程度' in text:
+        return '建议型回答'
+    else:
+        return None
+
+
+def extract_missing_rating(text, search_type):
+    searching_keys = MAPPING[search_type]
+    result_dict = {}
+    for k in searching_keys:
+        matches = re.findall(rf'{k}.*?\n', text)
+        result_dict[k] = None
+        for match in reversed(matches):
+            if re.findall(r'\d{1,2}', match):
+                result_dict[k] = int(re.findall(r'\d{1,2}', match)[-1])
+                break
+    overall_number = re.findall('\d{1,2}', text)
+    try:
+        result_dict['综合得分'] = int(overall_number[-1])
+    except:
+        return {}
+    return result_dict
+
+
+def extract_rating_plus(text):
+    pattern = r'{(.*?)}(?![^{]*{)'  # match last brackets
+    match = re.search(pattern, text)
+    if match:
+        dictionary_str = match.group(1)
+        kv_pattern = r"'(.*?)': (\d+)"
+        matches = re.findall(kv_pattern, dictionary_str)
+        result_dict = {key: int(value) for key, value in matches}
+        return result_dict
+    else:
+        match_type = detect_mapping(text=text)
+        if match_type is not None:
+            return extract_missing_rating(text=text, search_type=match_type)
+        else:
+            return None
+
+
 def extract_rating(text):
     pattern = r'{(.*?)}(?![^{]*{)'  # match last brackets
...
@@ -56,6 +112,50 @@ def check_rating(rating, all_dimensions):
     return rating

+def post_process_alignbench_plus(judgement: str,
+                                 all_dimensions=All_Dimensions,
+                                 possible_keys=['综合得分']):
+    """Input a string like below:
+
+    xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}xxx,
+    and extract each score
+    """
+
+    def extract_score(text):
+        keys_pattern = '|'.join(map(re.escape, possible_keys))
+        pattern = rf"({'|'.join(possible_keys)}): (\d+(\.\d{{1,2}})?)"
+        match = re.search(pattern, text)
+        if match:
+            try:
+                return float(match.group(1))
+            except ValueError:
+                return -1
+        return -1
+
+    # judgement = judgement.replace('\n', '')
+    rating = extract_rating_plus(judgement)
+
+    if rating is not None:
+        score = -1
+        for key in possible_keys:
+            score = rating.get(key, -1)
+            if score != -1:
+                break
+        if score == -1:
+            score = extract_score(judgement)
+        if score >= 0 and score <= 10:
+            pass
+        else:
+            score = -1
+        rating = check_rating(rating, all_dimensions)
+    else:
+        score = -1
+    if rating == None or score == -1:
+        return None
+    else:
+        return {'rating': rating, 'score': score}
+
+
 def post_process_alignbench(judgement: str,
                             all_dimensions=All_Dimensions,
                             possible_keys=['综合得分']):
...
@@ -211,9 +311,12 @@ class AlignmentBenchSummarizer:
         ]
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
         self.judge_type = judge_type
-        assert self.judge_type in ['general', 'autoj', 'judgelm']
+        assert self.judge_type in ['general', 'autoj', 'judgelm', 'general_plus']
         self.judge_map = {
             'general': post_process_alignbench,
+            'general_plus': post_process_alignbench_plus,
             'autoj': post_process_autoj,
             'judgelm': post_process_judgelm
         }
...
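A quick sketch of the new 'general_plus' post-processing on the docstring's example judgement (output shape per the code above, assuming check_rating accepts these dimensions):

judgement = ("xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, "
             "'完备性': 1, '综合得分': 1}xxx")
result = post_process_alignbench_plus(judgement)
# extract_rating_plus parses the last {...} block into int scores and
# '综合得分' supplies the overall score, so result should be
# {'rating': {...all five keys...}, 'score': 1}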
opencompass/tasks/openicl_eval.py

...
@@ -67,8 +67,10 @@ class OpenICLEvalTask(BaseTask):
     def __init__(self, cfg: ConfigDict):
         super().__init__(cfg)
-        self.num_gpus = 0
         self.logger = get_logger()
+        self.num_gpus = max(
+            c.get('eval_cfg', {}).get('num_gpus', 0)
+            for c in sum(self.dataset_cfgs, []))
         self.dump_details = cfg.get('eval', {}).get('runner', {}).get(
             'task', {}).get('dump_details', False)
...
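The replacement num_gpus line flattens the nested dataset config lists with sum(..., []) and then takes the per-dataset maximum; a toy illustration (configs invented):

dataset_cfgs = [[{'eval_cfg': {'num_gpus': 1}}], [{'eval_cfg': {}}, {}]]
flat = sum(dataset_cfgs, [])  # concatenates the inner lists
num_gpus = max(c.get('eval_cfg', {}).get('num_gpus', 0) for c in flat)
# num_gpus == 1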
opencompass/utils/text_postprocessors.py

...
@@ -83,7 +83,6 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
         f'([{options}])\s?是正确答案',
         f'选项\s?([{options}])\s?正确',
         f'所以答\s?([{options}])',
-        f'1.\s?([{options}])[.。$]?$',
         f'所以\s?([{options}][.。$]?$)',
         f'所有\s?([{options}][.。$]?$)',
         f'[\s,::,]([{options}])[。,,\.]?$',
...
@@ -105,6 +104,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
         f'(\s|^)[{options}][\s。,,::\.$]',
         f'(\s|^)[{options}](\s|$)',
         f'1.\s?(.*?)$',
+        f'1.\s?([{options}])[.。$]?$',
     ]
     cushion_patterns = [
         f'([{options}]):',
...
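Relocating the pattern to the end of the list lowers its priority, since first_option_postprocess tries patterns in order; a toy illustration of that ordering (text and option set invented; the loop below is a simplification of the real matching logic):

import re

options = 'ABCD'
# Patterns are tried in list order, so earlier entries take priority.
patterns = [rf'所以答\s?([{options}])', rf'1.\s?([{options}])[.。$]?$']
text = '所以答 A'
for pattern in patterns:
    match = re.search(pattern, text)
    if match:
        print(match.group(1))  # 'A', matched by the first pattern
        break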
tools/prompt_viewer.py

...
@@ -4,8 +4,9 @@ from typing import Dict
 from mmengine.config import Config, ConfigDict
-from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer,
-                                                PPLInferencer,
-                                                PPLOnlyInferencer)
+from opencompass.openicl.icl_inferencer import (AgentInferencer,
+                                                ChatInferencer, CLPInferencer,
+                                                GenInferencer, PPLInferencer,
+                                                PPLOnlyInferencer)
 from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
 from opencompass.utils import (Menu, build_dataset_from_cfg,
...
@@ -78,12 +79,16 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
     ice_idx_list = retriever.retrieve()
-    assert infer_cfg.inferencer.type in [
-        PPLInferencer, GenInferencer, CLPInferencer, PPLOnlyInferencer], \
-        'Only PPLInferencer and GenInferencer are supported'
+    supported_inferencer = [
+        AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
+        PPLOnlyInferencer, ChatInferencer
+    ]
+    if infer_cfg.inferencer.type not in supported_inferencer:
+        print(f'Only {supported_inferencer} are supported')
+        return
     for idx in range(min(count, len(ice_idx_list))):
-        if infer_cfg.inferencer.type == PPLInferencer:
+        if issubclass(infer_cfg.inferencer.type, PPLInferencer):
             labels = retriever.get_labels(ice_template=ice_template,
                                           prompt_template=prompt_template)
             ice = retriever.generate_ice(ice_idx_list[idx],
...
@@ -129,9 +134,7 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
             print('-' * 100)
             print(prompt)
             print('-' * 100)
-        elif infer_cfg.inferencer.type in [
-                GenInferencer, CLPInferencer, PPLOnlyInferencer]:
+        else:
             ice_idx = ice_idx_list[idx]
             ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
             prompt = retriever.generate_prompt_for_generate_task(
...
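The switch from an equality check to issubclass means subclasses of PPLInferencer now take the PPL branch as well; a minimal stand-alone illustration (stub classes, not the real inferencers):

class PPLInferencer:  # stub standing in for the real class
    pass

class CustomPPLInferencer(PPLInferencer):  # hypothetical subclass
    pass

inferencer_type = CustomPPLInferencer
print(inferencer_type == PPLInferencer)            # False
print(issubclass(inferencer_type, PPLInferencer))  # True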