OpenDAS / opencompass / Commits / e78857ac

Unverified commit e78857ac, authored on Dec 11, 2023 by Hubert and committed by GitHub on Dec 11, 2023.

[Sync] minor test (#683)
Parent: dd4318f6
Changes: 57
Showing 20 changed files with 489 additions and 108 deletions (+489, -108).
configs/models/hf_internlm/hf_internlm_chat_20b.py    +1   -0
configs/models/hf_internlm/hf_internlm_chat_7b.py     +1   -0
configs/models/hf_internlm/hf_internlm_chat_7b_8k.py  +1   -0
configs/models/qwen/hf_qwen_14b_chat.py               +3   -1
configs/models/qwen/hf_qwen_7b_chat.py                +3   -1
configs/summarizers/groups/cibench.py                 +4   -0
configs/summarizers/groups/mathbench.py               +75  -0
configs/summarizers/math_agent.py                     +28  -0
opencompass/datasets/cibench.py                       +154 -67
opencompass/datasets/cmnli.py                         +2   -0
opencompass/datasets/ds1000.py                        +11  -0
opencompass/datasets/gsm8k.py                         +1   -1
opencompass/datasets/wikibench.py                     +1   -1
opencompass/datasets/winogrande.py                    +14  -16
opencompass/lagent/actions/ipython_interpreter.py     +26  -7
opencompass/models/base.py                            +14  -0
opencompass/models/base_api.py                        +17  -2
opencompass/models/huggingface.py                     +125 -7
opencompass/models/lagent.py                          +3   -2
opencompass/models/llama2.py                          +5   -3
configs/models/hf_internlm/hf_internlm_chat_20b.py (+1, -0)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=2, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/hf_internlm/hf_internlm_chat_7b.py (+1, -0)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/hf_internlm/hf_internlm_chat_7b_8k.py (+1, -0)

@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
configs/models/qwen/hf_qwen_14b_chat.py (+3, -1)

@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
+        pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
         end_str='<|im_end|>',
     )
 ]
configs/models/qwen/hf_qwen_7b_chat.py (+3, -1)

@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
+        pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
         end_str='<|im_end|>',
     )
 ]
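For context, the two Qwen hunks above only touch a fragment of the model entry. Below is a minimal sketch of how such an entry typically fits together in an OpenCompass config; the HuggingFaceCausalLM type, abbr/path values and the _meta_template contents are assumptions for illustration, not part of this diff.

from opencompass.models import HuggingFaceCausalLM

# Assumed chat template for a Qwen-style model; not taken from this commit.
_meta_template = dict(round=[
    dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
    dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>',
         generate=True),
])

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',          # placeholder abbreviation
        path='Qwen/Qwen-7B-Chat',        # placeholder model path
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        pad_token_id=151643,             # added by this commit
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',            # generation is cut at this string
    )
]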
configs/summarizers/groups/cibench.py (new file, mode 100644, +4)

_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
configs/summarizers/groups/mathbench.py (new file, mode 100644, +75)

mathbench_summary_groups = [
    {
        'name': 'mathbench-college',
        'subsets': [
            ['mathbench-college-single_choice_cn', 'acc_1'],
            ['mathbench-college-cloze_en', 'accuracy'],
        ]
    },
    {
        'name': 'mathbench-high',
        'subsets': [
            ['mathbench-high-single_choice_cn', 'acc_1'],
            ['mathbench-high-single_choice_en', 'acc_1'],
        ]
    },
    {
        'name': 'mathbench-middle',
        'subsets': [
            ['mathbench-middle-single_choice_cn', 'acc_1'],
        ]
    },
    {
        'name': 'mathbench-primary',
        'subsets': [
            ['mathbench-primary-cloze_cn', 'accuracy'],
        ]
    },
    {
        'name': 'mathbench',
        'subsets': [
            'mathbench-college',
            'mathbench-high',
            'mathbench-middle',
            'mathbench-primary',
        ],
    },
    {
        'name': 'mathbench-college-circular',
        'subsets': [
            ['mathbench-college-single_choice_cn', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-high-circular',
        'subsets': [
            ['mathbench-high-single_choice_cn', 'perf_4'],
            ['mathbench-high-single_choice_en', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-middle-circular',
        'subsets': [
            ['mathbench-middle-single_choice_cn', 'perf_4'],
        ]
    },
    {
        'name': 'mathbench-circular',
        'subsets': [
            'mathbench-college-circular',
            'mathbench-high-circular',
            'mathbench-middle-circular',
        ],
    },
    {
        'name': 'mathbench-circular-and-cloze',
        'subsets': [
            'mathbench-high-circular',
            'mathbench-middle-circular',
            'mathbench-circular',
            'mathbench-college-cloze_en',
            'mathbench-primary-cloze_cn',
        ],
    }
]
configs/summarizers/math_agent.py (new file, mode 100644, +28)

summarizer = dict(
    dataset_abbrs=[
        '######## GSM8K-Agent Accuracy ########',  # category
        ['gsm8k-agent', 'follow_acc'],
        ['gsm8k-agent', 'reasoning_acc'],
        ['gsm8k-agent', 'code_acc'],
        ['gsm8k-agent', 'action_pct'],
        '######## MATH-Agent Accuracy ########',  # category
        ['math-agent', 'follow_acc'],
        ['math-agent', 'reasoning_acc'],
        ['math-agent', 'code_acc'],
        ['math-agent', 'action_pct'],
        '######## MathBench-Agent Accuracy ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'acc_1'],
        ['mathbench-college-cloze_en-agent', 'accuracy'],
        ['mathbench-high-single_choice_cn-agent', 'acc_1'],
        ['mathbench-high-single_choice_en-agent', 'acc_1'],
        ['mathbench-middle-single_choice_cn-agent', 'acc_1'],
        ['mathbench-primary-cloze_cn-agent', 'accuracy'],
        '######## MathBench-Agent CircularEval ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_en-agent', 'perf_4'],
        ['mathbench-middle-single_choice_cn-agent', 'perf_4'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
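The summary_groups=sum(...) line collects every *_summary_groups list visible in the config's namespace (in practice they arrive via read_base() imports of group files such as the two added above) and concatenates them into one flat list. A minimal, self-contained sketch of that idiom, with invented group names:

# Toy groups standing in for the imported *_summary_groups lists.
demo_summary_groups = [{'name': 'demo', 'subsets': ['demo-a', 'demo-b']}]
other_summary_groups = [{'name': 'other', 'subsets': ['other-a']}]

# Same expression as in math_agent.py: pick every name ending in
# '_summary_groups' and flatten the lists with sum(..., []).
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print(summary_groups)
# [{'name': 'demo', ...}, {'name': 'other', ...}]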
opencompass/datasets/cibench.py (+154, -67)

@@ -2,13 +2,15 @@ import json
 import os
 import os.path as osp
+import re
 import subprocess
+from collections import defaultdict
 from typing import List, Optional

 import numpy as np
 from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

 from .base import BaseDataset

@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
     with open(file, 'r') as f:
         notebook = json.load(f)
         example = notebook['cells']
+        metadata = notebook['metadata']
+        modules = metadata.get('modules', [])
+        if modules:
+            # these two annotations should be the same
+            assert len(modules) == len(metadata.get('step_types'))
+            # reformat annotations
+            modules = [[_m.strip() for _m in _modules.split('&')]
+                       for _modules in modules]
     questions = []
+    source_codes = []
     outputs = []
     tags = []
     for cell in example:
         if cell['cell_type'] == 'markdown':
-            text = ''.join(cell['source'])
+            text = ''.join(cell['source']).strip()
+            if modules:
+                _modules = modules.pop(0)
+                text += f"Please use {' and '.join(_modules)} modules."
             text = text.strip() + '\n'
             # append the formatted text
             questions.append(text)
         elif cell['cell_type'] == 'code':
+            source_codes.append(''.join(cell['source']))
             if cell['outputs'] and 'data' in cell['outputs'][-1]:
                 if 'image/png' in cell['outputs'][-1]['data']:
                     # skip vis temporarily due to lack of evaluation

@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
                 outputs.append(''.join(
                     cell['outputs'][-1]['data']['text/plain']))
             else:
-                tags.append('executable')
+                tags.append('exec')
                 outputs.append(None)
     return dict(
         experiment=file,
         questions=sum(([
             dict(role='user', content=question),
-            dict(role='assistant', content=output)
-        ] for question, output in zip(questions, outputs)), []),
-        references=dict(outputs=outputs, tags=tags, experiment=file),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs,
+                        tags=tags,
+                        metadata=metadata,
+                        experiment=file),
     )

@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str):
         """Load whole dataset."""
+        assert os.path.exists(path), f'Path {path} does not exist.'
         data_list = []
         for cwd, dirs, files in os.walk(path):
             dirs.sort()

@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
    """Evaluator for CI dataset.

    Args:
        text_evaluator (optional, dict): The text evaluator for text result
            comparison[]. Defaults to None, which use Rouge as defaults.
            Please notice that a extra key for `metric_name` should be set
            to get the exact metric result, such as `rouge1`.
        output_dir (optional, str): The directory to save experiment
            files in a markdown or notebook format.
        with_ipynb (bool): Generate ipynb correspondingly.
            Defaults to False.
        user_data_dir (str): The directory to load local files.
            Defaults to 'ENV', which means use environment variable
            `USER_DATA_DIR` to get the data dir.
    """

    def __init__(self,
                 text_evaluator: Optional[dict] = None,
                 output_dir: Optional[str] = None,
                 with_ipynb: bool = False,
                 user_data_dir: str = 'ENV') -> None:
        if text_evaluator is None:
            from opencompass.openicl.icl_evaluator import RougeEvaluator
            self.text_evaluator = ICL_EVALUATORS.build(
                dict(type=RougeEvaluator))
            self.text_eval_metric = 'rouge1'
        else:
            self.text_eval_metric = text_evaluator.pop('metric_name')
            self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
        # TODO: should use work dir for this task.
        self.output_dir = output_dir
        self.user_data_dir = self.check_user_data_dir(user_data_dir)
        self.with_ipynb = with_ipynb
        self.TAG_MAPPING = {
            'exec': ('executable', self.valid_step),
            'general': ('general_correct', self.correct_step),
            'num': ('numeric_correct', self.correct_step),
            'text': ('text_score', self.text_step),
            'vis': ('vis_sim', self.vis_similarity_step),
        }

    def check_user_data_dir(self, user_data_dir):
        if user_data_dir == 'ENV':
            user_data_dir = os.environ.get('USER_DATA_DIR', '')
        user_data_dir = user_data_dir.rstrip('/')
        basename = osp.basename(user_data_dir)
        if basename and basename != 'data':
            user_data_dir = osp.join(user_data_dir, 'data')
            assert osp.exists(user_data_dir), \
                f'a subfolder named `data` should exist under {user_data_dir}.'
        elif basename:
            assert osp.exists(user_data_dir), \
                f'{user_data_dir} does not exist.'
        return user_data_dir

    @staticmethod
    def valid_step(step):

@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
            # Fall back to False
            return False

    def text_step(self, step, target):
        """Whether the step output is correct."""
        # Found the latest code interpreter to determine correct
        for action in step[::-1]:
            if action['type'] == 'IPythonInterpreter':
                if action['result']:
                    try:
                        pred = action['result']['text']
                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
                        if match:
                            out = match.group(1)
                            score = self.text_evaluator.score([out], [target])
                            return score[self.text_eval_metric] / 100
                    except Exception:
                        return False
        # Fall back to False
        return False

    @staticmethod
    def vis_similarity_step(step, target):
        """Whether the step output image has the same structure similarity with

@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
                'the conversion processes.')

        check_jupytext()
        p_list = []
        from opencompass.lagent.actions.ipython_interpreter import extract_code
        for idx, (example_origin_prompt,
                  example_steps) in enumerate(zip(origin_prompt, steps)):

@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
                    f.writelines(markdown_lines)

                # TODO: be careful for this
                # The result might be different with infer process
                # please check carefully
                # convert markdown to ipynb and exectue with error tolerance
                # subprocess.Popen(
                #     "jupytext --to ipynb --pipe-fmt ipynb "
                #     "--pipe 'jupyter nbconvert --to ipynb --execute "
                #     f"--allow-errors --stdin --stdout' {md_file}",
                #     shell=True)
                if self.with_ipynb:
                    p = subprocess.Popen(
                        'jupytext --to ipynb --pipe-fmt ipynb '
                        "--pipe 'jupyter nbconvert --to ipynb --execute "
                        f"--allow-errors --stdin --stdout' {md_file}",
                        shell=True)
                    p_list.append(p)
        # TODO: async wait
        for p in p_list:
            p.wait()

    def set_data_dir(self, work_dir):
        """Set work directory and link data files for save notebook results."""
        if self.user_data_dir:
            if self.user_data_dir.endswith('/'):
                basename = osp.basename(osp.split(self.user_data_dir)[0])
            else:
                basename = osp.basename(self.user_data_dir)
            if not osp.exists(osp.join(self.output_dir, basename)):
                os.symlink(self.user_data_dir,
                           osp.join(self.output_dir, basename))

@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
        """Change work directory and keep the symlink."""
        os.chdir(work_dir)

    def single_exp(self, gold, steps):
        tags = gold['tags']
        outputs = gold['outputs']
        metadata = gold['metadata']
        hard_tags = metadata.get('step_types', [])
        if hard_tags:
            tags = hard_tags

        # executable: exec succeed
        # general_correct: general correct
        # numeric_correct: numerical correct
        # text_score: text score
        # vis_sim: visual similarity
        result = defaultdict(list)
        for tag, step, output in zip(tags, steps, outputs):
            # check whether this step is valid
            result['executable'].append(self.valid_step(step))
            if tag != 'exec':
                key, func = self.TAG_MAPPING[tag]
                result[key].append(func(step, output))

        # add missing metric for better analyse if not exists
        if hard_tags:
            check_tags = ['exec', 'num', 'text', 'vis']
        else:
            check_tags = ['exec', 'general', 'vis']
        for tag in check_tags:
            key = self.TAG_MAPPING[tag][0]
            if key not in result:
                result[key] = []

        return result

    def get_output_dir(self):
        """Get output dir from eval task.

        Notice: output dir should be in format xxx/data.
        All the needed files should be
        """
        # hard hack for get output dir from eval task
        if hasattr(self, '_out_dir') and self.output_dir is None:
            self.output_dir = self._out_dir

    def score(self, predictions: List, references: List, steps: List,
              origin_prompt: List):
        """Calculate accuracy."""
        cwd = os.getcwd()
        self.get_output_dir()
        if self.output_dir:
            if not osp.exists(self.output_dir):
                os.makedirs(self.output_dir)

@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
             self.save_results(origin_prompt, steps)
         self.unset_data_dir(cwd)

-        num_cells_list = []
-        num_general_list = []
-        passed_list = []
-        correct_list = []
-        vis_list = []
+        total_results = defaultdict(float)
+        total_scores = defaultdict(float)
+        total_nums = defaultdict(int)
         for gold, single_steps in zip(references, steps):
-            tags = gold['tags']
-            outputs = gold['outputs']
-            num_cells = len(tags)
-            num_general = sum([tag == 'general' for tag in tags])
-            passed = sum([self.valid_step(step) for step in single_steps])
-            correct = 0
-            vis_sim = []
-            for tag, step, output in zip(tags, single_steps, outputs):
-                if tag == 'general':
-                    correct += self.correct_step(step, output)
-                elif tag == 'vis':
-                    vis_sim.append(self.vis_similarity_step(step, output))
-            num_cells_list.append(num_cells)
-            num_general_list.append(num_general)
-            passed_list.append(passed)
-            correct_list.append(correct)
-            if vis_sim:
-                vis_list.append(sum(vis_sim) / len(vis_sim))
-            else:
-                vis_list.append(-1)
-
-        if len([v for v in vis_list if v >= 0]) > 0:
-            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
-                [v for v in vis_list if v >= 0])
-        else:
-            # not valid
-            visualize_similarity = -1
-
-        if sum(num_general_list) > 0:
-            general_accuracy = sum(correct_list) / sum(num_general_list)
-        else:
-            # not valid
-            general_accuracy = -1
-
-        result = dict(
-            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
-            general_accuracy=general_accuracy * 100,
-            visualize_similarity=visualize_similarity * 100,
-            num_cells_list=num_cells_list,
-            num_general_list=num_general_list,
-            passed_list=passed_list,
-            correct_list=correct_list,
-            vis_list=vis_list,
-        )
-        return result
+            result = self.single_exp(gold, single_steps)
+            for k, v in result.items():
+                total_scores[k] += sum(v)
+                total_nums[k] += len(v)
+
+        for k, v in total_scores.items():
+            if total_nums[k] > 0:
+                total_results[k] = total_scores[k] / total_nums[k] * 100
+            else:
+                total_results[k] = -1
+
+        return total_results
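A self-contained sketch of the aggregation shape introduced by single_exp and the rewritten score: per-step results are bucketed under metric keys (the second element of each TAG_MAPPING entry), then every metric is reported as score_sum / count * 100, falling back to -1 when a metric never occurs. The tags and values below are invented, not real CIBench results.

from collections import defaultdict

def aggregate(per_exp_results):
    # per_exp_results: one dict per experiment, metric key -> list of scores
    total_scores = defaultdict(float)
    total_nums = defaultdict(int)
    for result in per_exp_results:
        for key, values in result.items():
            total_scores[key] += sum(values)
            total_nums[key] += len(values)
    totals = {}
    for key in total_scores:
        totals[key] = (total_scores[key] / total_nums[key] * 100
                       if total_nums[key] > 0 else -1)
    return totals

print(aggregate([
    {'executable': [True, True, False], 'numeric_correct': [1.0]},
    {'executable': [True], 'vis_sim': []},
]))
# {'executable': 75.0, 'numeric_correct': 100.0, 'vis_sim': -1}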
opencompass/datasets/cmnli.py (+2, -0)

@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
+                if line['label'] == '-':
+                    continue
                 data.append(line)
         return Dataset.from_list(data)
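The added guard skips CMNLI rows whose label is '-' (rows without a usable gold label) before they are collected. A minimal illustration with invented rows:

import json

raw = ['{"sentence1": "a", "sentence2": "b", "label": "entailment"}',
       '{"sentence1": "c", "sentence2": "d", "label": "-"}']
data = []
for line in raw:
    line = json.loads(line)
    if line['label'] == '-':   # unlabeled row, drop it
        continue
    data.append(line)
print(len(data))  # 1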
opencompass/datasets/ds1000.py (+11, -0)

@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
     return text


+@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
+def ds1000_completion_postprocess(text: str) -> str:
+    text += '</code>'
+
+    match = re.search('(.*?)</code>', text, re.DOTALL)
+    if match:
+        text = match.group(1)
+
+    return text
+
+
 @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
 def ds1000_matplotlib_postprocess(text: str) -> str:
     text = ds1000_postprocess(text)
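Behaviour sketch for the new ds1000_completion post-processor: appending '</code>' and keeping only the text before the first closing tag strips any trailing chatter after the generated code. The function body is copied from the hunk above; the sample completion is invented.

import re

def ds1000_completion_postprocess(text: str) -> str:
    text += '</code>'
    match = re.search('(.*?)</code>', text, re.DOTALL)
    if match:
        text = match.group(1)
    return text

raw = "result = df.groupby('a').sum()\n</code>\nEND SOLUTION\nsome explanation"
print(ds1000_completion_postprocess(raw))
# result = df.groupby('a').sum()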
opencompass/datasets/gsm8k.py (+1, -1)

@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
             reasoning_acc=100 *
             (reasoning_scope + final_scope + row_reasoning_scope) / total,
             code_acc=100 * (code_scope + final_scope) / total,
-            action_acc=100 * (action_scope + final_scope) / total,
+            action_pct=100 * (action_scope + final_scope) / total,
         )
         return result
opencompass/datasets/wikibench.py (+1, -1)

@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
         circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']

         data = []
-        with open(path, 'r') as infile:
+        with open(path, 'r', encoding='utf-8') as infile:
             for id, line in enumerate(infile):
                 entry = json.loads(line)
                 if 'cloze' in name:
opencompass/datasets/winogrande.py (+14, -16)

@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
             for line in f:
                 line = json.loads(line)
                 prompt = line['sentence']
-                dataset_list.append({
-                    'opt1': prompt.replace('_', line['option1']),
-                    'opt2': prompt.replace('_', line['option2']),
-                    'answer': line['answer']
-                })
+                continue_prompt = prompt.split('_')
+                data_item = {
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': line['answer'],
+                    'cont': continue_prompt[1]
+                }
+                dataset_list.append(data_item)
         dataset_list = Dataset.from_list(dataset_list)
         return dataset_list

@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
                 prompt = line['sentence']
                 answer = line['answer']
                 answer = ' AB'[int(answer)] if answer != '' else 'NULL'
-                dataset_list.append({
-                    'opt1': prompt.replace('_', line['option1']),
-                    'opt2': prompt.replace('_', line['option2']),
-                    'answer': answer
-                })
+                data_item = {
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': answer,
+                }
+                dataset_list.append(data_item)
         dataset_list = Dataset.from_list(dataset_list)
         return dataset_list
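Illustration of the new 'cont' field in winograndeDataset: the sentence is split on the blank ('_') and the text after the blank is stored alongside the two filled-in options. The sample line below is invented, not taken from the dataset.

line = {'sentence': 'The trophy would not fit in the suitcase because _ was too big.',
        'option1': 'the trophy', 'option2': 'the suitcase', 'answer': '1'}

prompt = line['sentence']
continue_prompt = prompt.split('_')
data_item = {
    'opt1': prompt.replace('_', line['option1']),
    'opt2': prompt.replace('_', line['option2']),
    'answer': line['answer'],
    'cont': continue_prompt[1],
}
print(data_item['cont'])  # ' was too big.'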
opencompass/lagent/actions/ipython_interpreter.py (+26, -7)

@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
             it is disabled. Defaults to None.
         timeout (int): Upper bound of waiting time for Python script execution.
             Defaults to 20.
+        trim_output (int, optional): Max characters restriction of ipython
+            outputs. If None, do not perform any trim.
+            TODO: Notice that, this is not token len. Anf trim strategies
+            might be added later. Defaults to 1024.
         user_data_dir (str): Specified the user data directory for files
             loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
             Defaults to `ENV`.

@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
                  enable: bool = True,
                  disable_description: Optional[str] = None,
                  timeout: int = 20,
+                 trim_output: Optional[int] = 1024,
                  user_data_dir: str = 'ENV') -> None:
         super().__init__(description, name, enable, disable_description)

@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
             user_data_dir = os.environ.get('USER_DATA_DIR', '')

         if user_data_dir:
-            user_data_dir = os.path.dirname(user_data_dir)
+            # user_data_dir = os.path.dirname(user_data_dir)
             user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
         self.user_data_dir = user_data_dir
         self._initialized = False
+        self.trim_output = trim_output
         if not os.path.exists(WORK_DIR):
             os.mkdir(WORK_DIR)

@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
             if image:
                 result += f'\n\n{image}'
             if finished:
+                # in case output text too long
+                # might need better design later
+                if self.trim_output and len(result) > self.trim_output:
+                    ellip = '......'
+                    half_len = int((self.trim_output - len(ellip)) / 2)
+                    result = result[:half_len] + ellip + result[-half_len:]
                 return succeed, result

         try:

@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
                  command: str,
                  timeout: Optional[int] = None) -> ActionReturn:
         tool_return = ActionReturn(url=None, args=None, type=self.name)
-        tool_return.args = dict(text=command)
-        succeed, result = self._call(command, timeout)
-        if succeed:
-            tool_return.result = dict(text=result)
-            tool_return.state = ActionStatusCode.SUCCESS
+        extracted_command = extract_code(command)
+        tool_return.args = dict(text=command, extract_code=extracted_command)
+        if extracted_command:
+            succeed, result = self._call(extracted_command, timeout)
+            if succeed:
+                if not result:
+                    result = 'The code is succeed without any outputs.'
+                tool_return.result = dict(text=result)
+                tool_return.state = ActionStatusCode.SUCCESS
+            else:
+                tool_return.errmsg = repr(result)
+                tool_return.state = ActionStatusCode.API_ERROR
         else:
-            tool_return.errmsg = repr(result)
+            tool_return.errmsg = 'The input code is empty. Please follow the format.'  # noqa
             tool_return.state = ActionStatusCode.API_ERROR
         return tool_return
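Stand-alone sketch of the trimming rule added above: when the captured output exceeds trim_output characters, the middle is replaced with '......' and roughly half of the budget is kept from each end.

def trim(result: str, trim_output: int = 1024) -> str:
    # Mirrors the logic added to IPythonInterpreter for long outputs.
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    return result

long_output = 'x' * 5000
trimmed = trim(long_output)
print(len(trimmed))  # 1024: 509 chars + '......' + 509 chars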
opencompass/models/base.py (+14, -0)

@@ -115,6 +115,20 @@ class BaseModel:
         inputs = self.parse_template(templates, mode='ppl')
         return self.get_ppl(inputs, mask_length)

+    def get_loglikelihood_from_template(self,
+                                        templates: List[PromptType],
+                                        conts: List[str],
+                                        mask_length=None):
+        """Get perplexity given a list of templates.
+
+        Args:
+            templates (List[PromptType]): A list of templates.
+            mask_length (List[int]): A list of mask lengths. If provided, the
+                perplexity will be calculated only on the unmasked tokens.
+        """
+        inputs = self.parse_template(templates, mode='ppl')
+        return self.get_loglikelihood(inputs, conts, mask_length)
+
     def generate_from_template(self, templates: List[PromptType],
                                max_out_len: int, **kwargs):
         """Generate completion from a list of templates.
opencompass/models/base_api.py (+17, -2)

 import re
 import sys
 import threading
 import time
 import warnings
 from abc import abstractmethod
 from copy import deepcopy
 from queue import Queue
 from time import sleep
 from typing import Dict, List, Optional, Tuple, Union

@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
     def __init__(self,
                  path: str,
                  query_per_second: int = 1,
+                 rpm_verbose: bool = False,
                  retry: int = 2,
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,

@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
         self.meta_template = meta_template
         self.retry = retry
         self.query_per_second = query_per_second
-        self.token_bucket = TokenBucket(query_per_second)
+        self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
         self.template_parser = APITemplateParser(meta_template)
         self.logger = get_logger()
         self.generation_kwargs = generation_kwargs

@@ -422,10 +425,13 @@ class TokenBucket:
         query_per_second (float): The rate of the token bucket.
     """

-    def __init__(self, rate):
+    def __init__(self, rate, verbose=False):
         self._rate = rate
         self._tokens = threading.Semaphore(0)
         self.started = False
+        self._request_queue = Queue()
+        self.logger = get_logger()
+        self.verbose = verbose

     def _add_tokens(self):
         """Add tokens to the bucket."""

@@ -440,3 +446,12 @@ class TokenBucket:
             self.started = True
             threading.Thread(target=self._add_tokens, daemon=True).start()
         self._tokens.acquire()
+        if self.verbose:
+            cur_time = time.time()
+            while not self._request_queue.empty():
+                if cur_time - self._request_queue.queue[0] > 60:
+                    self._request_queue.get()
+                else:
+                    break
+            self._request_queue.put(cur_time)
+            self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
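Self-contained sketch of the RPM bookkeeping added to TokenBucket.get_token: timestamps older than 60 seconds are evicted from a queue, the current timestamp is pushed, and the queue size is the requests-per-minute figure logged when rpm_verbose is enabled.

import time
from queue import Queue

def record_rpm(request_queue: Queue) -> int:
    cur_time = time.time()
    # Drop timestamps older than one minute (oldest sits at the front).
    while not request_queue.empty():
        if cur_time - request_queue.queue[0] > 60:
            request_queue.get()
        else:
            break
    request_queue.put(cur_time)
    return request_queue.qsize()   # requests seen in the last 60 s

q = Queue()
for _ in range(3):
    print(f'Current RPM {record_rpm(q)}.')  # 1, 2, 3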
opencompass/models/huggingface.py (+125, -7)

@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union

 import numpy as np
 import torch
+import transformers

 from opencompass.models.base import BaseModel
 from opencompass.models.base_api import APITemplateParser

@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]


class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence."""

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        batch_size: int,
    ):
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence,
                                             add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # compare the last len(stop) tokens
        lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if done:
                continue
            self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
        return False not in self.done_tracker


@MODELS.register_module()
class HuggingFace(BaseModel):
    """Model wrapper around HuggingFace models.

@@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
             self.model.config.eos_token_id = 2
         self.model.config.pad_token_id = self.tokenizer.pad_token_id

-    def generate(self, inputs: List[str], max_out_len: int,
-                 **kwargs) -> List[str]:
+    def generate(self,
+                 inputs: List[str],
+                 max_out_len: int,
+                 stopping_criteria: List[str] = [],
+                 **kwargs) -> List[str]:
         """Generate results given a list of inputs.

@@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
                                         max_out_len=max_out_len,
                                         **generation_kwargs)
         else:
-            return sum((self._single_generate(inputs=[input_],
-                                              max_out_len=max_out_len,
-                                              **generation_kwargs)
-                        for input_ in inputs), [])
+            return sum((self._single_generate(
+                inputs=[input_],
+                max_out_len=max_out_len,
+                stopping_criteria=stopping_criteria,
+                **generation_kwargs) for input_ in inputs), [])

     def _batch_generate(self, inputs: List[str], max_out_len: int,
                         **kwargs) -> List[str]:

@@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
             decodeds = [token.split(self.end_str)[0] for token in decodeds]
         return decodeds

-    def _single_generate(self, inputs: List[str], max_out_len: int,
-                         **kwargs) -> List[str]:
+    def _single_generate(self,
+                         inputs: List[str],
+                         max_out_len: int,
+                         stopping_criteria: List[str] = [],
+                         **kwargs) -> List[str]:
         """Support for single prompt inference.

@@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
                 max_length=self.max_seq_len - max_out_len)['input_ids']
             input_ids = torch.tensor(input_ids, device=self.model.device)

+        if stopping_criteria:
+            # Construct huggingface stopping criteria
+            stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
+            stopping_criteria = transformers.StoppingCriteriaList([
+                *[
+                    MultiTokenEOSCriteria(sequence, self.tokenizer,
+                                          input_ids.shape[0])
+                    for sequence in stopping_criteria
+                ],
+            ])
+            kwargs['stopping_criteria'] = stopping_criteria

         # To accommodate the PeftModel, parameters should be passed in
         # key-value format for generate.
         outputs = self.model.generate(input_ids=input_ids,

@@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
        ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
        return ce_loss

    def get_loglikelihood(
            self,
            inputs: List[str],
            conts: List[str],
            mask_length: Optional[List[int]] = None) -> List[float]:
        """Get loglikelihood scores given a list of inputs.

        Args:
            inputs (List[str]): A list of strings.
            conts (List[str]): A list of strings: slices after the space.
            NOT SUPPORT mask_length YET!
            mask_length (Optional[List[int]]): A list of mask lengths. If
                provided, the perplexity scores will be calculated with the
                first mask_length[i] tokens masked out. It's okay to skip
                its implementation if advanced features in PPLInfernecer is
                not needed.

        Returns:
            List[float]: A list of loglikelihood scores.
        """
        assert mask_length is None, 'Not support mask_length yet.'
        if self.batch_padding and len(inputs) > 1:
            raise NotImplementedError('Batch padding is not supported yet.')
            # assert self.tokenizer.pad_token
            # return self._get_loglikelihood(inputs, mask_length=mask_length)
        return np.array([
            self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
            for idx in range(len(inputs))
        ])

    def _get_loglikelihood(self, inputs: str, conts: str) -> float:
        """Get loglikelihood scores given input string and continuation string.

        Args:
            inputs (str): string.
            conts (str): strings: slices after the space.
        Returns:
            float: loglikelihood scores.
        """
        input_ids = self.tokenizer(inputs,
                                   padding=False,
                                   truncation=True,
                                   max_length=self.max_seq_len)['input_ids']
        input_ids = torch.tensor(input_ids, device=self.model.device)
        context_ids = self.tokenizer(inputs.replace(conts, ''),
                                     padding=False,
                                     truncation=True,
                                     max_length=self.max_seq_len)['input_ids']
        cont_ids = input_ids[len(context_ids):]

        output = self.model(input_ids.unsqueeze(0))
        logits = output['logits'][:, :-1]
        logits = torch.nn.functional.log_softmax(logits, dim=-1)
        contlen = cont_ids.shape[0]
        logits = logits[:, -contlen:, :]
        # Reducing the dimension will lead to a wrong outcome
        logits_gather = torch.gather(
            logits, 2, cont_ids.unsqueeze(0).unsqueeze(-1))  # [1, seq]
        # Answer: sum the likelihood of each token in continuation
        answer = float(logits_gather.detach().cpu().sum())
        return answer

    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized strings.

@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
                     'role': {
                         'HUMAN': 'user',
                         'BOT': 'assistant',
-                        'SYSTEM': 'system'
-                    }[item['role']]
+                        'SYSTEM': 'system',
+                    }[item['role'].upper()]
                 }
                 history.append(msg)
             user_content = history[-1]['content']

@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
                     response, history = self.model.chat(self.tokenizer,
                                                         user_content,
                                                         history=history)
+                    # response will be dict sometime
+                    if isinstance(response, dict):
+                        response = response.get('content', '')
                     responses.append(response)
                 except Exception:
                     responses.append('')
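Usage sketch for the new stopping_criteria argument: callers pass plain stop strings, and _single_generate wraps each of them (plus the tokenizer's EOS token) in a MultiTokenEOSCriteria before calling model.generate. The model path, prompt and stop strings below are placeholders, and this assumes the default single-sample path (batch_padding=False).

from opencompass.models import HuggingFace

# Placeholder paths; any causal HF chat model would do for this sketch.
model = HuggingFace(path='internlm/internlm-chat-7b',
                    tokenizer_path='internlm/internlm-chat-7b',
                    max_seq_len=2048)

outputs = model.generate(
    inputs=['Question: 1 + 1 = ?\nAnswer:'],
    max_out_len=64,
    stopping_criteria=['<eoa>', '\nQuestion:'],  # decoding halts on either
)
print(outputs[0])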
opencompass/models/lagent.py (+3, -2)

@@ -52,7 +52,7 @@ class LagentAgent:
     def chat(self,
              user_input: str,
-             history: List[dict] = None) -> Tuple[str, List[dict]]:
+             history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
         """Chat with agent."""
         if history:
             self.agent._session_history = history

@@ -60,6 +60,7 @@ class LagentAgent:
         from lagent.schema import ActionReturn, AgentReturn
         generation: AgentReturn = self.agent.chat(user_input)

+        inner_steps = generation.inner_steps
         answer = generation.response
         steps = []

@@ -76,7 +77,7 @@ class LagentAgent:
                     valid=int(step.valid),
                 ))
-        return answer, steps
+        return answer, steps, inner_steps


 FORCE_STOP_PROMPT_EN = (
opencompass/models/llama2.py (+5, -3)

@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
             dialog = []
             for item in input:
                 msg = {'content': item['prompt']}
-                if item['role'] == 'HUMAN':
+                if item['role'].upper() == 'HUMAN':
                     msg['role'] = 'user'
-                elif item['role'] == 'BOT':
+                elif item['role'].upper() == 'BOT':
                     msg['role'] = 'assistant'
-                elif item['role'] == 'SYSTEM':
+                elif item['role'].upper() == 'SYSTEM':
                     msg['role'] = 'system'
+                else:
+                    raise ValueError(f'Unknown role: {item["role"]}')
                 dialog.append(msg)
             dialogs.append(dialog)
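The switch to .upper() makes the role matching case-insensitive, and the new else branch fails fast on anything unexpected. A toy version of the mapping, detached from the Llama2Chat class:

def map_role(role: str) -> str:
    # Case-insensitive role normalisation with a fail-fast fallback.
    role_upper = role.upper()
    if role_upper == 'HUMAN':
        return 'user'
    elif role_upper == 'BOT':
        return 'assistant'
    elif role_upper == 'SYSTEM':
        return 'system'
    raise ValueError(f'Unknown role: {role}')

print(map_role('human'))  # 'user'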
(Diff page 1 of 3; the remaining changed files of this commit are not shown on this page.)