OpenDAS / opencompass · Commits · 32f40a8f

Unverified commit 32f40a8f, authored Jan 08, 2024 by Fengzhe Zhou, committed by GitHub on Jan 08, 2024

[Sync] Sync with internal codes 2023.01.08 (#777)

parent 8194199d

Changes: 118 · Showing 20 changed files with 1256 additions and 247 deletions (+1256, -247)
Changed files shown in this view:

  configs/summarizers/longeval_v2.py                        +61    -0
  opencompass/datasets/__init__.py                           +3    -0
  opencompass/datasets/cibench.py                          +139   -20
  opencompass/datasets/circular.py                          +10    -5
  opencompass/datasets/custom.py                           +288   -34
  opencompass/datasets/ds1000.py                             +1    -1
  opencompass/datasets/humanevalx.py                        +24    -5
  opencompass/datasets/hungarian_math.py                    +20    -0
  opencompass/datasets/jsonl.py                             +20    -0
  opencompass/datasets/math.py                             +255  -103
  opencompass/datasets/math401.py                           +30    -0
  opencompass/datasets/natural_question.py                  +22    -1
  opencompass/datasets/reasonbench/ReasonBenchDataset.py     +7    -7
  opencompass/datasets/triviaqa.py                          +37    -7
  opencompass/lagent/actions/ipython_interpreter.py          +8    -1
  opencompass/models/__init__.py                             +1    -0
  opencompass/models/base.py                                +72   -15
  opencompass/models/huggingface.py                         +94   -47
  opencompass/models/llama2.py                              +40    -1
  opencompass/models/vllm.py                               +124    -0
configs/summarizers/longeval_v2.py  (new file, 0 → 100644)

_longeval_2k = ['classification_en_2k', 'lines_2k', 'qa_en_2k', 'qa_zh_2k', 'stackselect_2k', 'summarization_en_2k', 'textsort_2k']
_longeval_4k = ['classification_en_4k', 'lines_4k', 'qa_en_4k', 'qa_zh_4k', 'stackselect_4k', 'summarization_en_4k', 'textsort_4k']
_longeval_8k = ['classification_en_8k', 'lines_8k', 'qa_en_8k', 'qa_zh_8k', 'stackselect_8k', 'summarization_en_8k', 'textsort_8k']
_longeval_15k = ['classification_en_15k', 'lines_15k', 'qa_en_15k', 'qa_zh_15k', 'stackselect_15k', 'summarization_en_15k', 'textsort_15k']
_longeval_30k = ['classification_en_30k', 'lines_30k', 'qa_en_30k', 'qa_zh_30k', 'stackselect_30k', 'summarization_en_30k', 'textsort_30k']

longeval_summary_groups = [
    {'name': 'longeval_v2_2k', 'subsets': _longeval_2k},
    {'name': 'longeval_v2_4k', 'subsets': _longeval_4k},
    {'name': 'longeval_v2_8k', 'subsets': _longeval_8k},
    {'name': 'longeval_v2_15k', 'subsets': _longeval_15k},
    {'name': 'longeval_v2_30k', 'subsets': _longeval_30k},
    {'name': 'longeval_v2',
     'subsets': _longeval_2k + _longeval_4k + _longeval_8k + _longeval_15k + _longeval_30k},
]

summarizer = dict(
    dataset_abbrs=[
        'longeval_v2',
        'longeval_v2_2k',
        'longeval_v2_4k',
        'longeval_v2_8k',
        'longeval_v2_15k',
        'longeval_v2_30k',
        'classification_en_2k',
        'classification_en_4k',
        'classification_en_8k',
        'classification_en_15k',
        'classification_en_30k',
        'lines_2k',
        'lines_4k',
        'lines_8k',
        'lines_15k',
        'lines_30k',
        'qa_en_2k',
        'qa_en_4k',
        'qa_en_8k',
        'qa_en_15k',
        'qa_en_30k',
        'qa_zh_2k',
        'qa_zh_4k',
        'qa_zh_8k',
        'qa_zh_15k',
        'qa_zh_30k',
        'stackselect_2k',
        'stackselect_4k',
        'stackselect_8k',
        'stackselect_15k',
        'stackselect_30k',
        'summarization_en_2k',
        'summarization_en_4k',
        'summarization_en_8k',
        'summarization_en_15k',
        'summarization_en_30k',
        'textsort_2k',
        'textsort_4k',
        'textsort_8k',
        'textsort_15k',
        'textsort_30k',
    ],
    summary_groups=longeval_summary_groups,
)
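As a usage note (not part of the commit): a summarizer config like this is normally pulled into an OpenCompass run config through mmengine's read_base mechanism. The sketch below assumes the run config lives under the top-level configs/ directory; the relative import path is an assumption.

# Hypothetical run config snippet; the relative import path is assumed.
from mmengine.config import read_base

with read_base():
    from .summarizers.longeval_v2 import summarizer  # noqa: F401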
opencompass/datasets/__init__.py

@@ -46,9 +46,11 @@ from .hellaswag import *  # noqa: F401, F403
from .huggingface import *  # noqa: F401, F403
from .humaneval import *  # noqa: F401, F403
from .humanevalx import *  # noqa: F401, F403
+from .hungarian_math import *  # noqa: F401, F403
from .infinitebench import *  # noqa: F401, F403
from .iwslt2017 import *  # noqa: F401, F403
from .jigsawmultilingual import *  # noqa: F401, F403
+from .jsonl import JsonlDataset  # noqa: F401, F403
from .kaoshi import KaoshiDataset, KaoshiEvaluator  # noqa: F401, F403
from .lambada import *  # noqa: F401, F403
from .lawbench import *  # noqa: F401, F403

@@ -57,6 +59,7 @@ from .leval import *  # noqa: F401, F403
from .longbench import *  # noqa: F401, F403
from .mastermath2024v1 import *  # noqa: F401, F403
from .math import *  # noqa: F401, F403
+from .math401 import *  # noqa: F401, F403
from .mathbench import *  # noqa: F401, F403
from .mbpp import *  # noqa: F401, F403
from .medbench import *  # noqa: F401, F403
opencompass/datasets/cibench.py

@@ -69,13 +69,105 @@ def load_experiment(file: str) -> dict:
    )


def load_experiment_template(file: str) -> dict:
    """Load single experiment file with solutions for template experiment."""
    with open(file, 'r') as f:
        notebook = json.load(f)
        example = notebook['cells']
        metadata = notebook['metadata']
        modules = metadata.get('modules', [])
        if modules:
            # these two annotations should be the same
            assert len(modules) == len(metadata.get('step_types'))
            # reformat annotations
            modules = [[_m.strip() for _m in _modules.split('&')]
                       for _modules in modules]
    questions = []
    source_codes = []
    outputs = []
    tags = []
    for cell in example:
        if cell['cell_type'] == 'markdown':
            text = ''.join(cell['source']).strip()
            if modules:
                _modules = modules.pop(0)
                if 'chinese' not in file:
                    text += f"Please use {' and '.join(_modules)} modules."
                else:
                    text += f"请用{' 和 '.join(_modules)}模块."
            text = text.strip() + '\n'
            # append the formatted text
            questions.append(text)
        elif cell['cell_type'] == 'code':
            source_codes.append(''.join(cell['source']))
            output_flag = False
            if cell['outputs']:
                for _output in cell['outputs']:
                    if _output['output_type'] == 'display_data':
                        assert not output_flag
                        output_flag = True
                        tags.append('vis')
                        outputs.append(_output['data']['image/png'])
                for _output in cell['outputs']:
                    if output_flag:
                        break
                    if _output['output_type'] == 'stream' and _output[
                            'name'] == 'stdout':
                        assert not output_flag
                        output_flag = True
                        tags.append('general')
                        outputs.append(''.join(_output['text']))
                    elif _output['output_type'] == 'execute_result':
                        assert not output_flag
                        output_flag = True
                        tags.append('general')
                        outputs.append(''.join(_output['data']['text/plain']))
            if not output_flag:
                # no output fallback to exec
                tags.append('exec')
                outputs.append(None)
    return dict(
        experiment=file,
        questions=sum(([
            dict(role='user', content=question),
            dict(role='assistant', content=source_code)
        ] for question, source_code in zip(questions, source_codes)), []),
        references=dict(outputs=outputs,
                        tags=tags,
                        metadata=metadata,
                        experiment=file),
    )


def check_internet():
    """A tricky way to check internet."""
    import socket

    import nltk
    socket.setdefaulttimeout(10)
    ret = nltk.download('stopwords', quiet=True)
    socket.setdefaulttimeout(None)
    if not ret:
        raise ConnectionError('CIBench needs internet to get response. Please'
                              'check your internet and proxy.')


@LOAD_DATASET.register_module()
class CIBenchDataset(BaseDataset):
    """Code Interpreter dataset."""

    @staticmethod
-    def load(path: str):
-        """Load whole dataset."""
+    def load(path: str, internet_check: bool = False):
+        """Load whole dataset.
+
+        Args:
+            path(str): Path of cibench dataset.
+            internet_check(bool): Whether to check internet.
+                Defaults to False.
+        """
+        if internet_check:
+            check_internet()
        assert os.path.exists(path), f'Path {path} does not exist.'
        data_list = []
        for cwd, dirs, files in os.walk(path):

@@ -83,11 +175,36 @@ class CIBenchDataset(BaseDataset):
            files.sort()
            for f in files:
                if '.ipynb' in f:
-                    data = load_experiment(os.path.join(cwd, f))
+                    try:
+                        data = load_experiment(os.path.join(cwd, f))
+                    except Exception:
+                        print(f'Error with file {os.path.join(cwd, f)}')
+                        continue
                    data_list.append(data)

        dataset = Dataset.from_list(data_list)
        return dataset


@LOAD_DATASET.register_module()
class CIBenchTemplateDataset(BaseDataset):
    """Code Interpreter dataset for template dataset."""

    @staticmethod
    def load(path: str, internet_check: bool = False):
        """Load whole dataset.

        Args:
            path(str): Path of cibench dataset.
            internet_check(bool): Whether to check internet.
                Defaults to False.
        """
        if internet_check:
            check_internet()
        assert os.path.exists(path), f'Path {path} does not exist.'
        data_list = []
        for cwd, dirs, files in os.walk(path):
            dirs.sort()
            files.sort()
            for f in files:
                if '.ipynb' in f:
                    data = load_experiment_template(os.path.join(cwd, f))
                    data_list.append(data)

        dataset = Dataset.from_list(data_list)

@@ -138,7 +255,8 @@ class CIBenchEvaluator(BaseEvaluator):
    def check_user_data_dir(self, user_data_dir):
        if user_data_dir == 'ENV':
-            user_data_dir = os.environ.get('USER_DATA_DIR', '')
+            default_path = osp.abspath('./data/cibench_dataset/datasources')
+            user_data_dir = os.environ.get('USER_DATA_DIR', default_path)
        user_data_dir = user_data_dir.rstrip('/')
        basename = osp.basename(user_data_dir)
        if basename and basename != 'data':

@@ -172,10 +290,11 @@ class CIBenchEvaluator(BaseEvaluator):
        if action['result']:
            try:
                pred = action['result']['text']
-                match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
+                match = re.search('execute_result:\n\n```\n(.*?)\n```', pred,
+                                  re.DOTALL)
                if match:
                    out = match.group(1)
-                    return out == target or out in target
+                    return out.strip() == target.strip()
            except Exception:
                return False
        # Fall back to False

@@ -313,23 +432,23 @@ class CIBenchEvaluator(BaseEvaluator):
        # numeric_correct: numerical correct
        # text_score: text score
        # vis_sim: visual similarity
-        result = defaultdict(list)
-        for tag, step, output in zip(tags, steps, outputs):
-            # check whether this step is valid
-            result['executable'].append(self.valid_step(step))
-            if tag != 'exec':
-                key, func = self.TAG_MAPPING[tag]
-                result[key].append(func(step, output))
-
-        # add missing metric for better analyse if not exists
+        # create empty results
+        result = dict()
        if hard_tags:
            check_tags = ['exec', 'num', 'text', 'vis']
        else:
            check_tags = ['exec', 'general', 'vis']
        for tag in check_tags:
            key = self.TAG_MAPPING[tag][0]
-            if key not in result:
-                result[key] = []
+            result[key] = []

+        for tag, step, output in zip(tags, steps, outputs):
+            # check whether this step is valid
+            result['executable'].append(self.valid_step(step))
+            if tag != 'exec':
+                key, func = self.TAG_MAPPING[tag]
+                result[key].append(func(step, output))

        return result
opencompass/datasets/circular.py

@@ -183,8 +183,13 @@ class CircularDatasetMeta(type):
    def load(cls, circular_patterns='circular', *args, **kwargs):
        circular_splits = getattr(cls, 'default_circular_splits', None)
-        option_keys = cls.default_option_keys
+        option_keys = getattr(cls, 'default_option_keys', None)
+        if 'option_keys' in kwargs:
+            option_keys = kwargs.pop('option_keys')
+        assert option_keys is not None, 'option_keys cannot be None'
        answer_key = getattr(cls, 'default_answer_key', None)
+        if 'answer_key' in kwargs:
+            answer_key = kwargs.pop('answer_key')
        answer_key_switch_method = getattr(
            cls, 'default_answer_key_switch_method', None)
        dataset = cls.dataset_class.load(*args, **kwargs)

@@ -311,11 +316,11 @@ class CircularEvaluator(BaseEvaluator):
        tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
        tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
        # calculate the original accuracy
-        for pred, ref, origin_item in zip(predictions, references, test_set):
+        for pred, refr, origin_item in zip(predictions, references, test_set):
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
-                    tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0
+                    tmp_metrics[f'correct_{k}'] += 1 if pred == refr else 0
                    tmp_metrics[f'count_{k}'] += 1

        for k in circular_patterns:

@@ -324,13 +329,13 @@ class CircularEvaluator(BaseEvaluator):
        # calculate the circular accuracy
        _details = {k: {} for k in circular_patterns}
-        for pred, ref, origin_item in zip(predictions, references, test_set):
+        for pred, refr, origin_item in zip(predictions, references, test_set):
            index = origin_item['qid']
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
                    _details[k].setdefault(
-                        index, []).append(True if pred == ref else False)
+                        index, []).append(True if pred == refr else False)
        for k in _details:
            _details[k] = {
                index: sum(_details[k][index])
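For intuition, an illustrative sketch (not code from this commit): the perf_{k} metric computed by the circular evaluators counts a question as correct only when every rotated presentation of its options is answered correctly, while more_{j}_{k} relaxes this to at least j rotations. The data and names below are made up.

# Toy illustration of the "circular" criterion; inputs are invented.
answers_by_qid = {
    'q1': [True, True, True, True],   # correct under all four option rotations
    'q2': [True, False, True, True],  # misses one rotation
}
perf = sum(all(flags) for flags in answers_by_qid.values()) / len(answers_by_qid) * 100
print(perf)  # 50.0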
opencompass/datasets/custom.py

+import copy
import csv
import json
import os
from typing import List

from datasets import Dataset

-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets.circular import (CircularDatasetMeta,
+                                           CircularEvaluator)
+from opencompass.openicl.icl_evaluator import AccEvaluator, BaseEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.registry import LOAD_DATASET
from opencompass.utils.text_postprocessors import first_option_postprocess

from .base import BaseDataset


class OptionSimAccEvaluator(BaseEvaluator):

    def __init__(self, options) -> None:
        super().__init__()
        if not all((isinstance(i, str) and i.isupper() and len(i) == 1)
                   for i in options):
            raise ValueError(
                f'Each options should be single upper letter, got {options}')

        self.options = options

    def match_any_label(self, pred, test_item):
        from rapidfuzz.distance import Levenshtein as L

        from opencompass.utils.text_postprocessors import \
            first_option_postprocess

        pred = pred.strip()
        if any([pred == i for i in self.options]):
            parsed = pred
        else:
            parsed = ''
        if parsed == '':
            parsed = first_option_postprocess(pred,
                                              ''.join(self.options),
                                              cushion=False)
        if parsed == '':
            possible_options = []
            for opt in self.options:
                opt_str = test_item[opt]
                if opt_str is not None and opt_str.lower() in pred.lower():
                    possible_options.append(opt)
            if len(possible_options) == 1:
                parsed = possible_options[0]
        if parsed == '':
            dists = []
            for opt in self.options:
                opt_str = test_item[opt]
                if opt_str is None:
                    continue
                cands = [opt, opt_str, opt + '. ' + opt_str]
                d = min(L.distance(pred, cand) for cand in cands)
                dists.append((d, opt))
            if len(dists) > 0:
                parsed = min(dists)[1]
        return parsed

    def score(self, predictions: List, references: List, test_set) -> dict:
        assert len(predictions) == len(references)
        num_correct, num_total = 0, 0
        details = {}
        for index in range(len(predictions)):
            pred = predictions[index]
            refr = references[index]
            parsed = self.match_any_label(pred, test_set[index])
            num_correct += 1 if parsed == refr else 0
            num_total += 1
            details[str(index)] = {}
            details[str(index)]['pred'] = pred
            details[str(index)]['parsed'] = parsed
            details[str(index)]['refr'] = refr
            details[str(index)]['correct'] = parsed == refr
        return {'accuracy': num_correct / num_total * 100, 'details': details}


# TODO: DO NOT COPY YOURSELF!!!
class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):

    def __init__(self, options, circular_pattern='circular'):
        super().__init__(options)
        self.circular_pattern = circular_pattern

    def score(self, predictions, references, test_set):
        from opencompass.datasets.circular import (get_all_possible_patterns,
                                                   get_circular_patterns,
                                                   get_origin_patterns)

        circular_patterns = {}
        circular_patterns['origin'] = get_origin_patterns(
            test_set[0]['circular_pattern'])
        circular_patterns['circular'] = get_circular_patterns(
            test_set[0]['circular_pattern'])
        if self.circular_pattern == 'all_possible':
            circular_patterns['all_possible'] = get_all_possible_patterns(
                test_set[0]['circular_pattern'])

        metrics = {}
        tmp_metrics = {}
        tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
        tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
        # calculate the original accuracy
        for pred, refr, origin_item in zip(predictions, references, test_set):
            parsed = self.match_any_label(pred, origin_item)
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
                    tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0)
                    tmp_metrics[f'count_{k}'] += 1

        for k in circular_patterns:
            metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
                                   tmp_metrics[f'count_{k}'] * 100)

        # calculate the circular accuracy
        _details = {k: {} for k in circular_patterns}
        for pred, refr, origin_item in zip(predictions, references, test_set):
            index = origin_item['qid']
            parsed = self.match_any_label(pred, origin_item)
            circular_pattern = origin_item['circular_pattern']
            for k in circular_patterns:
                if tuple(circular_pattern) in circular_patterns[k]:
                    _details[k].setdefault(index, []).append(
                        True if parsed == refr else False)
        for k in _details:
            _details[k] = {
                index: sum(_details[k][index])
                for index in _details[k]
            }
        for k in _details:
            for j in range(1, len(circular_patterns[k]) + 1):
                count = sum([_details[k][index] >= j for index in _details[k]])
                total = len(_details[k])
                if j != len(circular_patterns[k]):
                    metrics[f'more_{j}_{k}'] = count / total * 100
                else:
                    metrics[f'perf_{k}'] = count / total * 100

        # make details
        details = {}
        for index in range(len(predictions)):
            parsed = self.match_any_label(predictions[index], test_set[index])
            details[str(index)] = {}
            if 'question' in test_set[index]:
                details[str(index)]['question'] = test_set[index]['question']
            details[str(index)]['pred'] = predictions[index]
            details[str(index)]['parsed'] = parsed
            details[str(index)]['refr'] = references[index]
            details[str(index)]['correct'] = parsed == references[index]
        metrics['details'] = details
        return metrics


@LOAD_DATASET.register_module()
class CustomDataset(BaseDataset):

    @staticmethod
    def load(path):
        if path.endswith('.jsonl'):
-            with open(path, 'r', encoding='utf-8') as f:
+            with open(path, 'r', encoding='utf-8-sig') as f:
                data = [json.loads(line) for line in f]
        elif path.endswith('.csv'):
-            with open(path, 'r', encoding='utf-8') as f:
+            with open(path, 'r', encoding='utf-8-sig') as f:
                reader = csv.reader(f)
                header = next(reader)
                data = [dict(zip(header, row)) for row in reader]

@@ -33,6 +179,10 @@ class CustomDataset(BaseDataset):
        return Dataset.from_list(data)


class CircularCustomDataset(CustomDataset, metaclass=CircularDatasetMeta):
    dataset_class = CustomDataset


def stringfy_types(obj):
    for k, v in obj.items():
        if k == 'type':

@@ -69,12 +219,12 @@ def make_mcq_gen_config(meta):
        inferencer=dict(type=GenInferencer),
    )

-    eval_cfg = dict(evaluator=dict(type=AccEvaluator),
-                    pred_role='BOT',
-                    pred_postprocessor=dict(
-                        type=first_option_postprocess,
-                        options=''.join(meta['options'])),
-                    )
+    eval_cfg = dict(evaluator=dict(type=meta.get('evaluator',
+                                                 OptionSimAccEvaluator),
+                                   **meta.get('evaluator_kwargs',
+                                              {'options': meta['options']})),
+                    pred_role='BOT',
+                    )

    dataset = dict(
        abbr=meta['abbr'],

@@ -87,6 +237,54 @@ def make_mcq_gen_config(meta):
    return dataset


def make_circular_mcq_gen_config(meta):
    if meta.get('template', None) is None:
        _human_prompt = 'Question: {question}' + ''.join(
            [f'\n{item}. {{{item}}}' for item in meta['options']])
        human_prompt = meta.get('human_prompt', _human_prompt)
        _bot_prompt = f'Answer: {{{meta["output_column"]}}}'
        bot_prompt = meta.get('bot_prompt', _bot_prompt)
        template = dict(round=[
            dict(role='HUMAN', prompt=human_prompt),
            dict(role='BOT', prompt=bot_prompt),
        ])
    else:
        template = meta['template']

    reader_cfg = dict(
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=template,
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    eval_cfg = dict(
        evaluator=dict(type=meta.get('evaluator',
                                     CircularOptionSimAccEvaluator),
                       **meta.get('evaluator_kwargs',
                                  {'options': meta['options']})),
        pred_role='BOT',
    )

    dataset = dict(
        abbr=meta['abbr'],
        type=CircularCustomDataset,
        option_keys=meta['options'],
        answer_key=meta['output_column'],
        path=meta['path'],
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    )
    return dataset


def make_qa_gen_config(meta):
    if meta.get('template', None) is None:
        human_prompt = meta.get('human_prompt', '{question}')

@@ -102,7 +300,6 @@ def make_qa_gen_config(meta):
        ])
    else:
        template = meta['template']

    reader_cfg = dict(
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],

@@ -117,7 +314,8 @@ def make_qa_gen_config(meta):
    )
    eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=meta.get('evaluator', AccEvaluator),
+                       **meta.get('evaluator_kwargs', {})),
        pred_role='BOT',
    )

@@ -164,7 +362,8 @@ def make_mcq_ppl_config(meta):
        inferencer=dict(type=PPLInferencer),
    )

-    eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+    eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator),
+                                   **meta.get('evaluator_kwargs', {})))

    dataset = dict(
        abbr=meta['abbr'],

@@ -177,17 +376,61 @@ def make_mcq_ppl_config(meta):
    return dataset


def make_circular_mcq_ppl_config(meta):
    if meta.get('template', None) is None:
        _human_prompt = 'Question: {question}' + ''.join(
            [f'\n{item}. {{{item}}}' for item in meta['options']])
        human_prompt = meta.get('human_prompt', _human_prompt)
        _bot_prompt = f'Answer: {{{meta["output_column"]}}}'
        bot_prompt = meta.get('bot_prompt', _bot_prompt)
        template = {
            answer: dict(round=[
                dict(role='HUMAN', prompt=human_prompt),
                dict(role='BOT',
                     prompt=bot_prompt.format(
                         **{meta['output_column']: answer})),
            ], )
            for answer in meta['options']
        }
    else:
        template = meta['template']

    reader_cfg = dict(
        input_columns=meta['input_columns'],
        output_column=meta['output_column'],
    )
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=template,
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=PPLInferencer),
    )
    eval_cfg = dict(evaluator=dict(type=meta.get('evaluator',
                                                 CircularEvaluator),
                                   **meta.get('evaluator_kwargs', {})))

    dataset = dict(
        abbr=meta['abbr'],
        type=CircularCustomDataset,
        option_keys=meta['options'],
        answer_key=meta['output_column'],
        path=meta['path'],
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    )
    return dataset


def parse_example_dataset(config):
-    # try to read meta json
+    # config -> .meta.jsonl -> parsed_results
    path = config['path']
-    meta_path = config.get('meta_path', path + '.meta.json')
-    if os.path.exists(meta_path):
-        with open(meta_path, 'r', encoding='utf-8') as f:
-            meta = json.load(f)
-    else:
-        meta = {}

-    # load sample
+    # load sample and get parsed_meta
+    parsed_meta = {}
    if path.endswith('.jsonl'):
        with open(path, 'r', encoding='utf-8') as f:
            data_item = json.loads(f.readline())

@@ -200,11 +443,11 @@ def parse_example_dataset(config):
    else:
        raise ValueError(f'Unsupported ext: {path}, .jsonl or .csv required')

-    meta['path'] = path
+    parsed_meta['path'] = path
    input_columns = [i for i in data_item.keys() if i != 'answer']
-    meta.setdefault('input_columns', input_columns)
+    parsed_meta['input_columns'] = input_columns
    output_column = 'answer' if 'answer' in data_item else None
-    meta.setdefault('output_column', output_column)
+    parsed_meta['output_column'] = output_column
    options = []
    for i in range(26):
        i = chr(ord('A') + i)

@@ -212,19 +455,28 @@ def parse_example_dataset(config):
            options.append(i)
        else:
            break
-    meta.setdefault('options', options)
+    parsed_meta['options'] = options
    abbr = os.path.basename(path).split('.')[0]
-    meta.setdefault('abbr', abbr)
-    if 'data_type' in config:
-        meta.setdefault('data_type', config['data_type'])
-    else:
-        data_type = 'mcq' if len(options) > 1 else 'qa'
-        meta.setdefault('data_type', data_type)
-    if 'infer_method' in config:
-        meta.setdefault('infer_method', config['infer_method'])
-    else:
-        meta.setdefault('infer_method', 'gen')
+    parsed_meta['abbr'] = abbr
+    parsed_meta['data_type'] = 'mcq' if len(options) > 1 else 'qa'
+    parsed_meta['infer_method'] = 'gen'
+
+    # try to read meta json
+    meta_path = config.get('meta_path', path + '.meta.json')
+    if os.path.exists(meta_path):
+        with open(meta_path, 'r', encoding='utf-8') as f:
+            read_from_file_meta = json.load(f)
+    else:
+        read_from_file_meta = {}
+
+    # get config meta
+    config_meta = copy.deepcopy(config)
+
+    # merge meta
+    meta = {}
+    meta.update(parsed_meta)
+    meta.update(read_from_file_meta)
+    meta.update(config_meta)

    return meta

@@ -236,6 +488,8 @@ def make_custom_dataset_config(config):
        ('mcq', 'gen'): make_mcq_gen_config,
        ('mcq', 'ppl'): make_mcq_ppl_config,
        ('qa', 'gen'): make_qa_gen_config,
+        ('circular-mcq', 'gen'): make_circular_mcq_gen_config,
+        ('circular-mcq', 'ppl'): make_circular_mcq_ppl_config,
    }.get((meta['data_type'], meta['infer_method']), None)
    if make_config_func is None:
        raise ValueError(f'Unsupported dataset data_type: {meta["data_type"]}'
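To make the custom-dataset flow above concrete, here is a minimal sketch of a config passed to make_custom_dataset_config, following the precedence that parse_example_dataset applies (parsed sample, then the optional .meta.json next to the data file, then the config itself). The file name, columns, and values below are illustrative assumptions, not part of the commit.

# Hypothetical config; keys mirror those consumed by parse_example_dataset.
config = {
    'path': 'data/my_mcq.jsonl',  # each line e.g. {"question": ..., "A": ..., "B": ..., "answer": ...}
    'data_type': 'circular-mcq',  # or 'mcq' / 'qa'; inferred from the sample if omitted
    'infer_method': 'gen',        # or 'ppl'; defaults to 'gen'
}
dataset_cfg = make_custom_dataset_config(config)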
opencompass/datasets/ds1000.py

@@ -365,7 +365,7 @@ class DS1000ServiceEvaluator(BaseEvaluator):
                 lib: str,
                 ip_address='localhost',
                 port=5000,
-                 timeout=180) -> None:
+                 timeout=600) -> None:
        assert lib in _LIBRARY_NAME_LIST, (
            f' lib must be in {_LIBRARY_NAME_LIST}')
        self.lib = lib
opencompass/datasets/humanevalx.py

@@ -5,6 +5,7 @@ import os.path as osp
import re
import subprocess
import tempfile
+import time
from shutil import copyfile
from typing import Dict, Iterable

@@ -73,7 +74,8 @@ class HumanevalXEvaluator(BaseEvaluator):
                 language,
                 ip_address='localhost',
                 port=5000,
-                 timeout=180) -> None:
+                 retry=2,
+                 timeout=600) -> None:
        assert language in _LANGUAGE_NAME_DICT.keys(), (
            f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
        if language == 'rust':

@@ -81,6 +83,7 @@ class HumanevalXEvaluator(BaseEvaluator):
        self.language = language
        self.ip_address = ip_address
        self.port = port
+        self.retry = retry
        self.timeout = timeout
        super().__init__()

@@ -96,7 +99,17 @@ class HumanevalXEvaluator(BaseEvaluator):
            for pred in predictions:
                f.write(json.dumps(pred) + '\n')

-        succeed, output = self._code_eval_service(file_path=tmp_out_path)
+        num_retry = 0
+        while num_retry < self.retry:
+            succeed, output = self._code_eval_service(
+                file_path=tmp_out_path)
+            if not succeed and '(56) Recv failure' in output:
+                # only retry when connection failed
+                num_retry += 1
+                # wait a min in case the service load is too high
+                time.sleep(60)
+            else:
+                break

        if succeed:
            if isinstance(output, str):

@@ -104,9 +117,15 @@ class HumanevalXEvaluator(BaseEvaluator):
            elif isinstance(output, dict):
                return output

-        ref_url = 'https://github.com/Ezra-Yu/code-evaluator'
-        result_file_path = os.path.join(
-            'outputs', f'humanevalx_{self.language}.json')
+        ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html'  # noqa
+        if hasattr(self, '_out_dir'):
+            result_file_path = re.sub('results', 'mid_results',
+                                      self._out_dir) + '.json'  # noqa
+            if not osp.exists(osp.dirname(result_file_path)):
+                os.makedirs(osp.dirname(result_file_path))
+        else:
+            result_file_path = os.path.join(
+                'outputs', f'humanevalx_{self.language}.json')
        copyfile(tmp_out_path, result_file_path)
        raise Exception(
            f'Call CodeEvalService Error in `HumanevalXEvaluator`, The '
opencompass/datasets/hungarian_math.py  (new file, 0 → 100644)

import pandas as pd
from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class HungarianExamMathDataset(BaseDataset):

    @staticmethod
    def load(path):
        df = pd.read_csv(path)
        df.columns = ['question']
        outputs = [{
            'question': question
        } for question in df['question'].tolist()]
        dataset = Dataset.from_list(outputs)
        return dataset
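Usage is a one-liner; the CSV path below is hypothetical, and the loader only assumes the file reduces to a single question column.

# Hypothetical path; the CSV is read as a single 'question' column.
ds = HungarianExamMathDataset.load('data/hungarian_exam/questions.csv')
print(len(ds), ds[0]['question'])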
opencompass/datasets/jsonl.py  (new file, 0 → 100644)

import json

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class JsonlDataset(BaseDataset):

    @staticmethod
    def load(path):
        data = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(json.loads(line))
        return Dataset.from_list(data)
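A quick usage sketch (the file path is hypothetical): each line of the .jsonl file becomes one dataset row, with columns taken from the JSON keys.

# Hypothetical .jsonl file: one JSON object per line, e.g. {"question": "...", "answer": "..."}
ds = JsonlDataset.load('data/examples.jsonl')
print(ds.column_names, len(ds))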
opencompass/datasets/math.py

import json
+import re

from datasets import Dataset, DatasetDict

@@ -9,48 +10,125 @@ from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
from .base import BaseDataset


def last_boxed_only_string(string):
    idx = string.rfind('\\boxed')
    if idx < 0:
        idx = string.rfind('\\fbox')
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == '{':
            num_left_braces_open += 1
        if string[i] == '}':
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval


def remove_boxed(s):
    left = '\\boxed{'
    try:
        assert s[:len(left)] == left
        assert s[-1] == '}'
        return s[len(left):-1]
    except Exception:
        return None


def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
    boxed_str = last_boxed_only_string(pred_str)
    if boxed_str is None:
        return None
    answer = remove_boxed(boxed_str)
    if answer is None:
        return None
    if strip_double_curly_brace:
        match = re.match('^\{(.*)\}$', answer)  # noqa: W605
        if match:
            answer = match.group(1)
    return answer


def normalize_final_answer(final_answer: str) -> str:
    """Normalize a final answer to a quantitative reasoning question."""
    # final_answer = final_answer.split('=')[-1]
    SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
                     (r'\ ', ''), (' ', ''), ('mbox', 'text'),
                     (',\\text{and}', ','), ('\\text{and}', ','),
                     ('\\text{m}', '\\text{}'), ('\\le', '<')]
    REMOVED_EXPRESSIONS = [
        'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
        'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes',
        'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals',
        'edges', 'students', 'childrentickets', 'multiples', '\\text{s}',
        '\\text{.}', '\\text{\ns}', '\\text{}^2', '\\text{}^3', '\\text{\n}',
        '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
        '{,}', '"', '\\dots', '\n', '\r', '\f'
    ]
    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, '')

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
    final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer)
    assert '\n' not in final_answer
    assert '\r' not in final_answer
    assert '\f' not in final_answer
    if len(re.findall(r'finalansweris(.*)', final_answer)) > 0:
        final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1]

    if len(re.findall(r'answer?is:?(.*)', final_answer)) > 0:
        final_answer = re.findall(r'answer?is:?(.*)', final_answer)[-1]

    if len(re.findall(r'oxed\{(.*?)\}', final_answer)) > 0:
        final_answer = re.findall(r'oxed\{(.*?)\}', final_answer)[-1]

    if len(re.findall(r'\$(.*?)\$', final_answer)) > 0:
        final_answer = re.findall(r'\$(.*?)\$', final_answer)[-1]
    final_answer = final_answer.strip()
    if 'rac' in final_answer and '\\frac' not in final_answer:
        final_answer = final_answer.replace('rac', '\\frac')

    # Normalize shorthand TeX:
    #  \fracab -> \frac{a}{b}
    #  \frac{abc}{bef} -> \frac{abc}{bef}
    #  \fracabc -> \frac{a}{b}c
    #  \sqrta -> \sqrt{a}
    #  \sqrtab -> sqrt{a}b
    final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}', final_answer)
    final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
    final_answer = final_answer.replace('$', '')

    # Normalize 100,000 -> 100000
    if final_answer.replace(',', '').isdigit():
        final_answer = final_answer.replace(',', '')

    return final_answer


@LOAD_DATASET.register_module()
class MATHDataset(BaseDataset):

    @staticmethod
    def load(path: str):
-        def remove_boxed(s):
-            left = '\\boxed{'
-            try:
-                assert s[:len(left)] == left
-                assert s[-1] == '}'
-                return s[len(left):-1]
-            except Exception:
-                return None
-
-        def last_boxed_only_string(string):
-            idx = string.rfind('\\boxed')
-            if idx < 0:
-                idx = string.rfind('\\fbox')
-                if idx < 0:
-                    return None
-
-            i = idx
-            right_brace_idx = None
-            num_left_braces_open = 0
-            while i < len(string):
-                if string[i] == '{':
-                    num_left_braces_open += 1
-                if string[i] == '}':
-                    num_left_braces_open -= 1
-                    if num_left_braces_open == 0:
-                        right_brace_idx = i
-                        break
-                i += 1
-
-            if right_brace_idx is None:
-                retval = None
-            else:
-                retval = string[idx:right_brace_idx + 1]
-
-            return retval
-
        dataset = DatasetDict()
        data = json.load(open(path))
        raw_data = []

@@ -59,7 +137,7 @@ class MATHDataset(BaseDataset):
                    'problem': data[i]['problem'],
                    'solution':
-                    remove_boxed(last_boxed_only_string(data[i]['solution']))
+                    extract_boxed_answer(data[i]['solution'])
                })
        dataset['test'] = Dataset.from_list(raw_data)
        dataset['train'] = Dataset.from_list(raw_data)

@@ -68,66 +146,6 @@ class MATHDataset(BaseDataset):
@TEXT_POSTPROCESSORS.register_module('math_postprocess')
def math_postprocess(text: str) -> str:

-    SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
-                     (r'\ ', ''), (' ', ''), ('mbox', 'text'),
-                     (',\\text{and}', ','), ('\\text{and}', ','),
-                     ('\\text{m}', '\\text{}'), ('\\le', '<')]
-    REMOVED_EXPRESSIONS = [
-        'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
-        'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes',
-        'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals',
-        'edges', 'students', 'childrentickets', 'multiples', '\\text{s}',
-        '\\text{.}', '\\text{\ns}', '\\text{}^2', '\\text{}^3', '\\text{\n}',
-        '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
-        '{,}', '"', '\\dots', '\n', '\r', '\f'
-    ]
-
-    import re
-
-    def normalize_final_answer(final_answer: str) -> str:
-        """Normalize a final answer to a quantitative reasoning question."""
-        # final_answer = final_answer.split('=')[-1]
-        for before, after in SUBSTITUTIONS:
-            final_answer = final_answer.replace(before, after)
-        for expr in REMOVED_EXPRESSIONS:
-            final_answer = final_answer.replace(expr, '')
-
-        # Extract answer that is in LaTeX math, is bold,
-        # is surrounded by a box, etc.
-        final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
-        final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
-        final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
-        final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer)
-        assert '\n' not in final_answer
-        assert '\r' not in final_answer
-        assert '\f' not in final_answer
-        if len(re.findall(r'finalansweris(.*)', final_answer)) > 0:
-            final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1]
-
-        if len(re.findall(r'oxed\{(.*?)\}', final_answer)) > 0:
-            final_answer = re.findall(r'oxed\{(.*?)\}', final_answer)[-1]
-
-        if len(re.findall(r'\$(.*?)\$', final_answer)) > 0:
-            final_answer = re.findall(r'\$(.*?)\$', final_answer)[-1]
-        final_answer = final_answer.strip()
-        if 'rac' in final_answer and '\\frac' not in final_answer:
-            final_answer = final_answer.replace('rac', '\\frac')
-
-        # Normalize shorthand TeX:
-        #  \fracab -> \frac{a}{b}
-        #  \frac{abc}{bef} -> \frac{abc}{bef}
-        #  \fracabc -> \frac{a}{b}c
-        #  \sqrta -> \sqrt{a}
-        #  \sqrtab -> sqrt{a}b
-        final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}',
-                              final_answer)
-        final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
-        final_answer = final_answer.replace('$', '')
-
-        # Normalize 100,000 -> 100000
-        if final_answer.replace(',', '').isdigit():
-            final_answer = final_answer.replace(',', '')
-
-        return final_answer
-
    for maybe_ans in text.split('.'):
        if 'final answer' in maybe_ans.lower():

@@ -137,9 +155,27 @@ def math_postprocess(text: str) -> str:
    #                     text.split('Final Answer: ', 1)[-1].split('\n\n')[0])


+@TEXT_POSTPROCESSORS.register_module('math_postprocess_v2')
+def math_postprocess_v2(text: str) -> str:
+
+    cand_ans = extract_boxed_answer(text, strip_double_curly_brace=True)
+    if cand_ans:
+        return cand_ans
+
+    for maybe_ans in text.split('.'):
+        # if 'final answer' in maybe_ans.lower():
+        if re.search('final answer|answer is', maybe_ans.lower()):
+            return normalize_final_answer(maybe_ans)
+    return normalize_final_answer(text.split('.')[0])
+
+
@ICL_EVALUATORS.register_module()
class MATHEvaluator(BaseEvaluator):

+    def __init__(self, version='v1'):
+        assert version in ['v1', 'v2']
+        self.version = version
+
    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {

@@ -166,7 +202,7 @@ class MATHEvaluator(BaseEvaluator):
            substrs = substrs[1:]
            for substr in substrs:
                new_str += '\\frac'
-                if substr[0] == '{':
+                if len(substr) > 0 and substr[0] == '{':
                    new_str += substr
                else:
                    try:

@@ -228,6 +264,10 @@ class MATHEvaluator(BaseEvaluator):
                new_string += new_substr
        return new_string

+    def _fix_sqrt_v2(self, string):
+        _string = re.sub(r'\\sqrt(\w+)', r'\\sqrt{\1}', string)
+        return _string
+
    def _strip_string(self, string):
        # linebreaks
        string = string.replace('\n', '')

@@ -295,6 +335,109 @@ class MATHEvaluator(BaseEvaluator):
        return string

    def _strip_string_v2(self, string):
        string = str(string).strip()
        # linebreaks
        string = string.replace('\n', '')

        # right "."
        string = string.rstrip('.')

        # remove inverse spaces
        string = string.replace('\\!', '')
        string = string.replace('\\ ', '')

        # replace \\ with \
        string = string.replace('\\\\', '\\')
        string = string.replace('\\\\', '\\')

        # replace tfrac and dfrac with frac
        string = string.replace('tfrac', 'frac')
        string = string.replace('dfrac', 'frac')

        # remove \left and \right
        string = string.replace('\\left', '')
        string = string.replace('\\right', '')

        # Remove unit: miles, dollars if after is not none
        _string = re.sub(r'\\text{.*?}$', '', string).strip()
        if _string != '' and _string != string:
            string = _string

        # Remove circ (degrees)
        string = string.replace('^{\\circ}', '')
        string = string.replace('^\\circ', '')

        # remove dollar signs
        string = string.replace('\\$', '')
        string = string.replace('$', '')

        string = string.replace('\\text', '')
        string = string.replace('x\\in', '')

        # remove percentage
        string = string.replace('\\%', '')
        string = string.replace('\%', '')  # noqa: W605
        string = string.replace('%', '')

        # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively,
        # add "0" if "." is the start of the string
        string = string.replace(' .', ' 0.')
        string = string.replace('{.', '{0.')

        # cdot
        string = string.replace('\\cdot', '')

        # inf
        string = string.replace('infinity', '\\infty')
        if '\\infty' not in string:
            string = string.replace('inf', '\\infty')
        string = string.replace('+\\inity', '\\infty')

        # and
        string = string.replace('and', '')
        string = string.replace('\\mathbf', '')

        # use regex to remove \mbox{...}
        string = re.sub(r'\\mbox{.*?}', '', string)

        # quote
        string.replace("'", '')
        string.replace('"', '')

        # i, j
        if 'j' in string and 'i' not in string:
            string = string.replace('j', 'i')

        # replace a.000b where b is not number or b is end, with ab, use regex
        string = re.sub(r'(\d+)\.0+([^\d])', r'\1\2', string)
        string = re.sub(r'(\d+)\.0+$', r'\1', string)

        # if empty, return empty string
        if len(string) == 0:
            return string
        if string[0] == '.':
            string = '0' + string

        # to consider: get rid of e.g. "k = " or "q = " at beginning
        if len(string.split('=')) == 2:
            if len(string.split('=')[0]) <= 2:
                string = string.split('=')[1]

        string = self._fix_sqrt_v2(string)
        string = string.replace(' ', '')

        # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
        # Even works with \frac1{72} (but not \frac{72}1).
        # Also does a/b --> \\frac{a}{b}
        string = self._fix_fracs(string)

        # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple
        # cases fix in case the model output is X/Y
        string = self._fix_a_slash_b(string)

        return string

    def is_equiv(self, str1, str2, verbose=False):
        if str1 is None and str2 is None:
            print('WARNING: Both None')

@@ -302,16 +445,24 @@ class MATHEvaluator(BaseEvaluator):
        if str1 is None or str2 is None:
            return False

+        if self.version == 'v1':
+            strip_string_func = self._strip_string
+        elif self.version == 'v2':
+            strip_string_func = self._strip_string_v2
+        else:
+            raise NotImplementedError
+
        try:
-            ss1 = self._strip_string(str1)
-            ss2 = self._strip_string(str2)
+            ss1 = strip_string_func(str1)
+            ss2 = strip_string_func(str2)
            if verbose:
                print(ss1, ss2)
            return ss1 == ss2
-        except:  # noqa
+        except Exception:
            return str1 == str2


@ICL_EVALUATORS.register_module()
class MATHAgentEvaluator(MATHEvaluator):
    """math agent evaluator for soft condition.

@@ -320,8 +471,9 @@ class MATHAgentEvaluator(MATHEvaluator):
            Defaults to `PythonInterpreter`.
    """

-    def __init__(self, action: str = 'PythonInterpreter'):
+    def __init__(self, action: str = 'PythonInterpreter', version='v1'):
        self.action = action
+        super().__init__(version=version)

    def soft_equal(self, pred, refer, step):
        try:
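To make the new post-processing concrete, here is a small sketch using the helpers added above; the input strings are invented, and the expected values follow directly from the code (boxed-answer extraction and the "100,000 -> 100000" normalization).

# Illustrative inputs; outputs follow from extract_boxed_answer / normalize_final_answer above.
pred = 'Thus the final answer is $\\boxed{\\frac{3}{4}}$.'
print(extract_boxed_answer(pred))         # \frac{3}{4}
print(normalize_final_answer('100,000'))  # 100000
print(math_postprocess_v2(pred))          # \frac{3}{4}  (returned via the boxed branch)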
opencompass/datasets/math401.py  (new file, 0 → 100644)

from opencompass.openicl import BaseEvaluator


def check(a, b):
    return abs(float(a) - float(b)) < 1e-3


class Math401Evaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        correct = 0
        count = 0
        details = []
        for i, j in zip(predictions, references):
            detail = {'pred': i, 'answer': j, 'correct': False}
            count += 1
            try:
                if check(i, j):
                    correct += 1
                    detail['correct'] = True
            except Exception:
                pass
            details.append(detail)
        result = {'accuracy': 100 * correct / count, 'details': details}
        return result
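A toy check of the evaluator's behaviour (inputs invented): check accepts answers within an absolute tolerance of 1e-3, and predictions that cannot be parsed as floats simply count as wrong.

# Invented predictions/references showing the tolerance and the error path.
evaluator = Math401Evaluator()
result = evaluator.score(predictions=['3.14159', 'not a number'],
                         references=['3.1416', '2'])
print(result['accuracy'])  # 50.0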
opencompass/datasets/natural_question.py

import csv
+import json
import os.path as osp

from datasets import Dataset, DatasetDict

@@ -18,7 +19,7 @@ class NaturalQuestionDataset(BaseDataset):
        dataset = DatasetDict()
        for split in ['dev', 'test']:
            filename = osp.join(path, f'nq-{split}.qa.csv')
-            with open(filename) as f:
+            with open(filename, 'r', encoding='utf-8') as f:
                reader = csv.reader(f, delimiter='\t')
                raw_data = []
                for row in reader:

@@ -33,6 +34,26 @@ class NaturalQuestionDataset(BaseDataset):
        return dataset


@LOAD_DATASET.register_module()
class NQOpenDataset(BaseDataset):

    @staticmethod
    def load(path: str):
        dataset = DatasetDict()
        for split in ['validation', 'train']:
            filename = osp.join(path, f'nq-open-{split}.jsonl')
            raw_data = []
            with open(filename, 'r', encoding='utf-8') as f:
                for doc in f:
                    doc = json.loads(doc)
                    if split == 'train':
                        doc['answer'] = doc['answer'][0]
                    raw_data.append(doc)
            dataset[split] = Dataset.from_list(raw_data)

        return dataset


@ICL_EVALUATORS.register_module()
class NQEvaluator(BaseEvaluator):
opencompass/datasets/reasonbench/ReasonBenchDataset.py

@@ -16,13 +16,13 @@ class ReasonBenchDataset(BaseDataset):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = json.loads(line)
-                prompt = line['prompt']
-                prompt_ppl = line['prompt_ppl']
-                label = line['label']
-                label_ppl = line['label_ppl']
-                choices = line['choices']
-                tag = line['tag']
-                source = line['source']
+                prompt = line.get('prompt', '')
+                prompt_ppl = line.get('prompt_ppl', '')
+                label = line.get('label', '')
+                label_ppl = line.get('label_ppl', '')
+                choices = line.get('choices', '')
+                tag = line.get('tag', '')
+                source = line.get('source', '')
                option_content = {choice: line[choice] for choice in choices}
                data = {
                    'prompt': prompt,
opencompass/datasets/triviaqa.py

import csv
+import json
import os.path as osp

from datasets import Dataset, DatasetDict

@@ -18,7 +19,7 @@ class TriviaQADataset(BaseDataset):
        dataset = DatasetDict()
        for split in ['dev', 'test']:
            filename = osp.join(path, f'trivia-{split}.qa.csv')
-            with open(filename) as f:
+            with open(filename, 'r', encoding='utf-8') as f:
                reader = csv.reader(f, delimiter='\t')
                raw_data = []
                for row in reader:

@@ -32,20 +33,49 @@ class TriviaQADataset(BaseDataset):
        return dataset


@LOAD_DATASET.register_module()
class TriviaQADataset_V2(BaseDataset):

    @staticmethod
    def load(path: str):
        dataset = DatasetDict()
        for split in ['validation', 'train']:
            filename = osp.join(path, f'triviaqa-{split}.jsonl')
            raw_data = []
            with open(filename, 'r', encoding='utf-8') as f:
                for doc in f:
                    doc = json.loads(doc)
                    raw_data.append(doc)
            dataset[split] = Dataset.from_list(raw_data)

        return dataset


@LOAD_DATASET.register_module()
class TriviaQADataset_V3(BaseDataset):

    @staticmethod
    def load(path: str):
        data_list = []
        with open(path, 'r', encoding='utf-8') as f:
            for doc in f:
                data_list.append(json.loads(doc))
        return Dataset.from_list(data_list)


@ICL_EVALUATORS.register_module()
class TriviaQAEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
-            return {
-                'error': 'predictions and references have different '
-                'length'
-            }
+            return {'error': 'preds and refrs have different length'}
        processed_predictions = []
        for prediction in predictions:
            prediction = prediction.strip().split('\n')[0].lower()
-            if 'answer is' in prediction:
-                prediction = prediction.split('answer is')[-1]
+            prediction = prediction.split('answer is')[-1]
+            prediction = prediction.split('a:')[-1]
+            prediction = prediction.split('answer:')[-1]
+            prediction = prediction.strip()
            prediction = general_postprocess(prediction)
            processed_predictions.append(prediction)
        processed_answers = [[general_postprocess(j).lower() for j in i]
opencompass/lagent/actions/ipython_interpreter.py

@@ -16,11 +16,14 @@ from jupyter_client import KernelManager
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode

-WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
+WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR',
+                     f"{os.path.abspath('./output_images')}")

DEFAULT_DESCRIPTION = """启动Jupter Kernel用于执行Python代码。"""

START_CODE = """
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
def input(*args, **kwargs):
    raise NotImplementedError('Python input() function is disabled.')

@@ -74,6 +77,10 @@ class IPythonInterpreter(BaseAction):
        if user_data_dir:
            # user_data_dir = os.path.dirname(user_data_dir)
+            # in case change of dirs
+            assert os.path.exists(user_data_dir), \
+                f'{user_data_dir} does not exist.'
+            user_data_dir = os.path.abspath(user_data_dir)
            user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
        self.user_data_dir = user_data_dir
        self._initialized = False
opencompass/models/__init__.py

@@ -24,5 +24,6 @@ from .qwen_api import Qwen  # noqa: F401
from .sensetime_api import SenseTime  # noqa: F401
from .turbomind import TurboMindModel  # noqa: F401
from .turbomind_tis import TurboMindTisModel  # noqa: F401
+from .vllm import VLLM  # noqa: F401
from .xunfei_api import XunFei  # noqa: F401
from .zhipuai_api import ZhiPuAI  # noqa: F401
opencompass/models/base.py

@@ -2,6 +2,9 @@ from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union

+import torch
+from mmengine import dist
+
from opencompass.utils.prompt import PromptList

PromptType = Union[PromptList, str]

@@ -21,6 +24,9 @@ class BaseModel:
            wrapping of any meta instructions.
        generation_kwargs (Dict, optional): The generation kwargs for the
            model. Defaults to dict().
+        sync_rank (bool): Whether to sync inputs between ranks. Do not use this
+            if you are not familiar with this behavior. Check `sync_inputs`
+            function for more details. Defaults to False.
    """

    is_api: bool = False

@@ -30,7 +36,8 @@ class BaseModel:
                 max_seq_len: int = 2048,
                 tokenizer_only: bool = False,
                 meta_template: Optional[Dict] = None,
-                 generation_kwargs: Optional[Dict] = dict()):
+                 generation_kwargs: Optional[Dict] = dict(),
+                 sync_rank: bool = False):
        self.path = path
        self.max_seq_len = max_seq_len
        self.tokenizer_only = tokenizer_only

@@ -40,6 +47,7 @@ class BaseModel:
        if meta_template and 'eos_token_id' in meta_template:
            self.eos_token_id = meta_template['eos_token_id']
        self.generation_kwargs = generation_kwargs
+        self.sync_rank = sync_rank

    @abstractmethod
    def generate(self, inputs: List[str], max_out_len: int) -> List[str]:

@@ -77,6 +85,34 @@ class BaseModel:
                          ' ppl-based evaluation yet, try gen-based '
                          'instead.')

    @abstractmethod
    def encode(self, prompt: str) -> torch.Tensor:
        """Encode prompt to tokens. Not necessary for most cases.

        Args:
            prompt (str): Input string.

        Returns:
            torch.Tensor: Encoded tokens.
        """
        raise NotImplementedError(
            f'{self.__class__.__name__} does not implement'
            '`encode` method.')

    @abstractmethod
    def decode(self, tokens: torch.Tensor) -> str:
        """Decode tokens to text. Not necessary for most cases.

        Args:
            tokens (torch.Tensor): Input tokens.

        Returns:
            str: Decoded text.
        """
        raise NotImplementedError(
            f'{self.__class__.__name__} does not implement'
            '`decode` method.')

    @abstractmethod
    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized strings.

@@ -115,20 +151,6 @@ class BaseModel:
        inputs = self.parse_template(templates, mode='ppl')
        return self.get_ppl(inputs, mask_length)

-    def get_loglikelihood_from_template(self,
-                                        templates: List[PromptType],
-                                        conts: List[str],
-                                        mask_length=None):
-        """Get perplexity given a list of templates.
-
-        Args:
-            templates (List[PromptType]): A list of templates.
-            mask_length (List[int]): A list of mask lengths. If provided, the
-                perplexity will be calculated only on the unmasked tokens.
-        """
-        inputs = self.parse_template(templates, mode='ppl')
-        return self.get_loglikelihood(inputs, conts, mask_length)
-
    def generate_from_template(self, templates: List[PromptType],
                               max_out_len: int, **kwargs):
        """Generate completion from a list of templates.

@@ -138,6 +160,8 @@ class BaseModel:
            max_out_len (int): The maximum length of the output.
        """
        inputs = self.parse_template(templates, mode='gen')
+        if hasattr(self, 'sync_rank') and self.sync_rank:
+            inputs = self.sync_inputs(inputs)
        return self.generate(inputs, max_out_len=max_out_len, **kwargs)

    def get_token_len_from_template(

@@ -165,6 +189,39 @@ class BaseModel:
        token_lens = [self.get_token_len(prompt) for prompt in prompts]
        return token_lens[0] if not is_batched else token_lens

    def sync_inputs(self, inputs: str) -> str:
        """For some case, when it involves multiprocessing with multiple gpus,
        there might be the chance that inputs are different among different
        gpus. Therefore, we need to sync inputs for rank0.

        Args:
            inputs (str): Inputs for each rank.
        """
        rank = dist.get_rank()

        if rank == 0:
            tokens = self.encode(inputs)
            length = self.get_token_len(inputs)
            if length > 2048:
                from opencompass.utils import get_logger
                get_logger().info(f'Large tokens nums: {length}')
            size = torch.tensor([tokens.shape], dtype=torch.long)
        else:
            tokens = None
            size = torch.empty(2, dtype=torch.long)

        # broadcast data size
        dist.broadcast(size, src=0)

        if rank != 0:
            tokens = torch.empty(size.tolist(), dtype=torch.long)

        # broadcast tokens
        dist.broadcast(tokens, src=0)
        # the final input might be different from original input
        # due to the max sequence limitation
        return self.decode(tokens)

    def to(self, device):
        self.model.to(device)
opencompass/models/huggingface.py
View file @
32f40a8f
...
...
@@ -251,8 +251,9 @@ class HuggingFace(BaseModel):
**
generation_kwargs
)
for
input_
in
inputs
),
[])
def
_batch_generate
(
self
,
inputs
:
List
[
str
],
max_out_len
:
int
,
def
_batch_generate
(
self
,
inputs
:
List
[
str
],
max_out_len
:
int
,
stopping_criteria
:
List
[
str
]
=
[],
**
kwargs
)
->
List
[
str
]:
"""Support for batch prompts inference.
...
...
@@ -295,7 +296,9 @@ class HuggingFace(BaseModel):
if
stopping_criteria
:
# Construct huggingface stopping criteria
if
self
.
tokenizer
.
eos_token
is
not
None
:
stopping_criteria
=
stopping_criteria
+
[
self
.
tokenizer
.
eos_token
]
stopping_criteria
=
stopping_criteria
+
[
self
.
tokenizer
.
eos_token
]
stopping_criteria
=
transformers
.
StoppingCriteriaList
([
*
[
MultiTokenEOSCriteria
(
sequence
,
self
.
tokenizer
,
...
...
@@ -372,11 +375,12 @@ class HuggingFace(BaseModel):
            max_length=self.max_seq_len - max_out_len)['input_ids']
        input_ids = torch.tensor(input_ids, device=self.model.device)
        if stopping_criteria:
            # Construct huggingface stopping criteria
-            stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
+            if self.tokenizer.eos_token is not None:
+                stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
            stopping_criteria = transformers.StoppingCriteriaList([*[
                MultiTokenEOSCriteria(sequence, self.tokenizer,
        ...
@@ -523,13 +527,14 @@ class HuggingFace(BaseModel):
"""
assert
mask_length
is
None
,
'Not support mask_length yet.'
if
self
.
batch_padding
and
len
(
inputs
)
>
1
:
raise
NotImplementedError
(
'Batch padding is not supported yet.'
)
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return
np
.
array
([
self
.
_get_loglikelihood
(
inputs
=
inputs
[
idx
],
conts
=
conts
[
idx
])
for
idx
in
range
(
len
(
inputs
))
])
assert
self
.
tokenizer
.
pad_token
return
self
.
_get_loglikelihood
(
inputs
,
conts
)
else
:
return
np
.
concatenate
([
self
.
_get_loglikelihood
(
inputs
=
[
inputs
[
idx
]],
conts
=
[
conts
[
idx
]])
for
idx
in
range
(
len
(
inputs
))
])
def
_get_loglikelihood
(
self
,
inputs
:
str
,
conts
:
str
)
->
float
:
"""Get loglikelihood scores given input string and continuation string.
...
...
@@ -540,32 +545,76 @@ class HuggingFace(BaseModel):
        Returns:
            float: loglikelihood scores.
        """
-        input_ids = self.tokenizer(inputs,
-                                   padding=False,
-                                   truncation=True,
-                                   max_length=self.max_seq_len)['input_ids']
-        input_ids = torch.tensor(input_ids, device=self.model.device)
-        context_ids = self.tokenizer(inputs.replace(conts, ''),
-                                     padding=False,
-                                     truncation=True,
-                                     max_length=self.max_seq_len)['input_ids']
-        cont_ids = input_ids[len(context_ids):]
-        output = self.model(input_ids.unsqueeze(0))
-        logits = output['logits'][:, :-1]
-        logits = torch.nn.functional.log_softmax(logits, dim=-1)
-        contlen = cont_ids.shape[0]
-        logits = logits[:, -contlen:, :]
-        # Reducing the dimension will lead to a wrong outcome
-        logits_gather = torch.gather(
-            logits, 2, cont_ids.unsqueeze(0).unsqueeze(-1))  # [1, seq]
-        # Answer: sum the likelihood of each token in continuation
-        answer = float(logits_gather.detach().cpu().sum())
+        input_tokenizer_out = self.tokenizer(inputs,
+                                             padding=True,
+                                             truncation=False,
+                                             return_length=True,
+                                             return_tensors='pt').to(
+                                                 self.model.device)
+        input_ids = input_tokenizer_out['input_ids'][:, :self.max_seq_len]
+        input_length = input_tokenizer_out['length']
+        attention_mask = input_tokenizer_out['attention_mask']
+        context_ids = [
+            self.tokenizer(inputs[i].replace(conts[i], ''),
+                           padding=False,
+                           truncation=True,
+                           max_length=self.max_seq_len)['input_ids']
+            for i in range(len(inputs))
+        ]
+        # forward
+        outputs = self.model(input_ids, attention_mask)['logits']
+        outputs = torch.nn.functional.log_softmax(outputs, dim=-1)
+        # calculate loglikelihood
+        answer = np.zeros(len(inputs))
+        for i in range(len(inputs)):
+            if self.tokenizer.padding_side == 'right':
+                cont_ids = input_ids[i, len(context_ids[i]):input_length[i]]
+                logits = outputs[i,
+                                 len(context_ids[i]) - 1:input_length[i] - 1,
+                                 :]  # noqa
+            else:
+                cont_ids = input_ids[i, len(context_ids[i]) - input_length[i]:]
+                logits = outputs[i,
+                                 len(context_ids[i]) - input_length[i] - 1:-1]
+            # Reducing the dimension will lead to a wrong outcome
+            logits_gather = torch.gather(
+                logits.unsqueeze(0), 2,
+                cont_ids.unsqueeze(0).unsqueeze(-1))  # [1, seq]
+            # Answer: sum the likelihood of each token in continuation
+            answer[i] = float(logits_gather.detach().cpu().sum())
        return answer
    def get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]:
        """https://swj0419.github.io/detect-pretrain.github.io/"""
        if self.batch_padding and len(inputs) > 1:
            assert self.tokenizer.pad_token
            return self._get_mink_percent(inputs, k=k)
        else:
            return np.concatenate([
                self._get_mink_percent(inputs=[text], k=k) for text in inputs
            ])

    def _get_mink_percent(self, inputs: List[str], k: int = 20) -> List[float]:
        outputs, inputs = self.get_logits(inputs)
        shift_logits = outputs[:, :-1, :].contiguous().float()
        shift_labels = inputs['tokens']['input_ids'][:, 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss(
            reduction='none', ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                        shift_labels.view(-1)).view(shift_labels.size())
        lens = (inputs['tokens']['input_ids'] !=
                self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
        mink_percent = []
        for nloss, nlen in zip(loss, lens):
            nlen = max(int(nlen) * k // 100, 1)
            nloss = torch.topk(loss, nlen, dim=-1)[0]
            nloss = -nloss.mean().cpu().detach().numpy()
            mink_percent.append(nloss)
        return np.array(mink_percent)
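For context, get_mink_percent implements the Min-K% Prob score from the page linked in its docstring: it keeps the k% highest-loss (least likely) tokens of each input and averages their negative loss, a quantity that can be thresholded for pretraining-data detection. A hypothetical usage sketch; the wrapper instance and threshold value below are assumptions, not part of the commit:

# `hf_model` is a HuggingFace wrapper instance as defined in this file.
scores = hf_model.get_mink_percent(['Some passage to test.'], k=20)
# Higher (less negative) scores mean even the least likely tokens were
# fairly probable, hinting the passage may have appeared in training data.
maybe_member = scores[0] > -2.0  # threshold is model/dataset specific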
    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized strings.
        ...
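The score produced by _get_loglikelihood above is simply the sum of token log-probabilities of the continuation given its context. A minimal single-sequence sketch of the same computation with the plain transformers API, ignoring tokenization effects at the context/continuation boundary (the model name and prompt are placeholders):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')          # placeholder model
model = AutoModelForCausalLM.from_pretrained('gpt2')

context, cont = 'Question: 1 + 1 = ?\nAnswer:', ' 2'
full_ids = tok(context + cont, return_tensors='pt')['input_ids']
ctx_len = tok(context, return_tensors='pt')['input_ids'].shape[1]

with torch.no_grad():
    logprobs = torch.log_softmax(model(full_ids).logits, dim=-1)

# The token at position t is predicted by the logits at position t - 1.
cont_ids = full_ids[0, ctx_len:]
gathered = logprobs[0, ctx_len - 1:-1, :].gather(-1, cont_ids.unsqueeze(-1))
print(float(gathered.sum()))  # continuation loglikelihood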
@@ -710,17 +759,15 @@ class HuggingFaceChatGLM3(HuggingFace):
                responses.append('')
                continue
-                try:
-                    response, history = self.model.chat(self.tokenizer,
-                                                        user_content,
-                                                        history=history,
-                                                        **generation_kwargs)
-                    # response will be dict sometime
-                    if isinstance(response, dict):
-                        response = response.get('content', '')
-                    responses.append(response)
-                except Exception:
-                    responses.append('')
+                response, history = self.model.chat(self.tokenizer,
+                                                    user_content,
+                                                    history=history,
+                                                    max_new_tokens=max_out_len,
+                                                    **generation_kwargs)
+                # response will be dict sometime
+                if isinstance(response, dict):
+                    response = response.get('content', '')
+                responses.append(response)
        return responses

    def get_token_len(self, prompt: str) -> int:
        ...
opencompass/models/llama2.py
...
@@ -100,6 +100,42 @@ class Llama2(BaseModel):
        ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
        return ce_loss

    def get_loglikelihood(
            self,
            inputs: List[str],
            conts: List[str],
            mask_length: Optional[List[int]] = None) -> List[float]:
        assert mask_length is None, 'mask_length is not supported'
        bsz = len(inputs)
        params = self.model.params
        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
        # tokenize
        input_tokens = [self.tokenizer.encode(x, True, False) for x in inputs]
        max_prompt_size = max([len(t) for t in input_tokens])
        total_len = min(params.max_seq_len, max_prompt_size)
        tokens = torch.zeros((bsz, total_len)).cuda().long()
        num_token_list = []
        cont_tokens = []
        for k, t in enumerate(input_tokens):
            num_token = min(total_len, len(t))
            num_token_list.append(num_token - 1)
            tokens[k, :num_token] = torch.tensor(t[-num_token:]).long()
            context_ids = self.tokenizer.encode(
                inputs[k].replace(conts[k], ''), True, False)
            cont_tokens.append(tokens[k, len(context_ids):num_token])
        # forward
        outputs = self.model.forward(tokens, 0)[:, :-1, :]
        outputs = torch.nn.functional.log_softmax(outputs, dim=-1)
        loglikelihood_sum = torch.zeros(bsz).cuda()
        for idx in range(bsz):
            logits = outputs[
                idx, num_token_list[idx] -
                len(cont_tokens[idx]):num_token_list[idx], :].unsqueeze(0)
            loglikelihood_sum[idx] = torch.gather(
                logits, 2, cont_tokens[idx].unsqueeze(0).unsqueeze(-1)).sum()
        loglikelihood_sum = loglikelihood_sum.cpu().detach().numpy()
        return loglikelihood_sum

    def get_token_len(self, prompt: str) -> int:
        return len(self.tokenizer.encode(prompt, True, True))
        ...
@@ -115,6 +151,7 @@ class Llama2Chat(BaseModel):
tokenizer_only (bool): whether to load tokenizer only
tokenizer_path (str): path to the tokenizer directory
meta_template (dict): meta template for the model
force_bf16 (bool): whether to force set model to `bfloat16`
"""
    def __init__(
        ...
@@ -125,6 +162,7 @@ class Llama2Chat(BaseModel):
                 tokenizer_only: bool = False,
                 tokenizer_path: Optional[str] = None,
                 meta_template: Optional[Dict] = None,
                 force_bf16: bool = False,
                 ):  # noqa
        if tokenizer_only:
            self._load_tokenizer(tokenizer_path=tokenizer_path)
        ...
@@ -132,7 +170,8 @@ class Llama2Chat(BaseModel):
            self._load_model(path=path,
                             max_seq_len=max_seq_len,
                             max_batch_size=max_batch_size,
-                             tokenizer_path=tokenizer_path)
+                             tokenizer_path=tokenizer_path,
+                             force_bf16=force_bf16)
        self.max_seq_len = max_seq_len
        self.template_parser = APITemplateParser(meta_template)
        self.logger = get_logger()
        ...
opencompass/models/vllm.py
0 → 100644
from typing import Dict, List, Optional

from opencompass.models.base import BaseModel
from opencompass.utils import get_logger

try:
    from vllm import LLM, SamplingParams
except ImportError:
    LLM, SamplingParams = None, None

DEFAULT_MODEL_KWARGS = dict(trust_remote_code=True)


class VLLM(BaseModel):
    """Model Wrapper for VLLM."""

    def __init__(
        self,
        path: str,
        max_seq_len: int = 2048,
        model_kwargs: dict = None,
        generation_kwargs: dict = dict(),
        meta_template: Optional[Dict] = None,
        mode: str = 'none',
        use_fastchat_template: bool = False,
        end_str: Optional[str] = None,
    ):
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template)

        assert LLM, ('Please install VLLM with `pip install vllm`. '
                     'note: torch==2.1.2 is required.')
        self.logger = get_logger()
        self._load_model(path, model_kwargs)
        self.tokenizer = self.model.get_tokenizer()
        self.generation_kwargs = generation_kwargs
        self.generation_kwargs.pop('do_sample', None)

        assert mode in ['none', 'mid']
        self.mode = mode
        self.use_fastchat_template = use_fastchat_template
        self.end_str = end_str

    def _load_model(self,
                    path: str,
                    add_model_kwargs: dict = None,
                    num_retry: int = 3):
        model_kwargs = DEFAULT_MODEL_KWARGS.copy()
        if add_model_kwargs is not None:
            model_kwargs.update(add_model_kwargs)
        self.model = LLM(path, **model_kwargs)

    def generate(self, inputs: List[str], max_out_len: int,
                 **kwargs) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[str]): A list of strings.
            max_out_len (int): The maximum length of the output.

        Returns:
            List[str]: A list of generated strings.
        """
        if self.mode == 'mid':
            input_ids = self.tokenizer(inputs, truncation=False)['input_ids']
            inputs = []
            for input_id in input_ids:
                if len(input_id) > self.max_seq_len - max_out_len:
                    half = int((self.max_seq_len - max_out_len) / 2)
                    inputs.append(
                        self.tokenizer.decode(input_id[:half],
                                              skip_special_tokens=True) +
                        self.tokenizer.decode(input_id[-half:],
                                              skip_special_tokens=True))
                else:
                    inputs.append(
                        self.tokenizer.decode(input_id,
                                              skip_special_tokens=True))

        generation_kwargs = kwargs.copy()
        generation_kwargs.update(self.generation_kwargs)
        generation_kwargs.update({'max_tokens': max_out_len})
        sampling_kwargs = SamplingParams(**generation_kwargs)
        outputs = self.model.generate(inputs, sampling_kwargs)

        prompt_list, output_strs = [], []
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text

            if self.end_str:
                generated_text = generated_text.split(self.end_str)[0]
            prompt_list.append(prompt)
            output_strs.append(generated_text)

        return output_strs

    def prompts_preproccess(self, inputs: List[str]):
        if self.use_fastchat_template:
            try:
                from fastchat.model import get_conversation_template
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    'Fastchat is not implemented. You can use '
                    "'pip install \"fschat[model_worker,webui]\"' "
                    'to implement fastchat.')
            conv = get_conversation_template('vicuna')
            conv.append_message(conv.roles[0], inputs[0])
            conv.append_message(conv.roles[1], None)
            inputs = [conv.get_prompt()]
        return inputs

    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized strings.

        Args:
            prompt (str): Input string.

        Returns:
            int: Length of the input tokens
        """
        return len(self.model.get_tokenizer().encode(prompt))
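A minimal usage sketch of this wrapper; the model path and the sampling settings below are placeholders, not values taken from the commit:

model = VLLM(
    path='meta-llama/Llama-2-7b-hf',            # placeholder model path
    max_seq_len=2048,
    model_kwargs=dict(tensor_parallel_size=1),  # forwarded to vllm.LLM
    generation_kwargs=dict(temperature=0.0),    # forwarded to SamplingParams
)
outputs = model.generate(['The capital of France is'], max_out_len=32)
print(outputs[0])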